## **1. Mount Drive**

In [1]:
# =============================
# PHASE 3 — CELL 1: Mount Drive
# =============================

import sys

# Treat the session as Colab when the `google.colab` package is already loaded
# in this interpreter.
IN_COLAB = 'google.colab' in sys.modules

if not IN_COLAB:
    print("Running locally — no Drive mount required.")
else:
    # Imported only on this branch so local runs never need the package.
    from google.colab import drive
    drive.mount('/content/drive')
    print("Drive mounted successfully.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive mounted successfully.


# **2. Install Required Libs**

In [2]:
# ========================================
# PHASE 3 — CELL 2: Install Required Libs
# ========================================

# Run this cell ONLY if you're in Google Colab.
# Skip on local machine if libs already installed.

!pip install -q tldextract python-whois beautifulsoup4 textblob textstat scikit-learn joblib spacy

# Ensure the spaCy English model is importable by this kernel, downloading it
# on first use.
import importlib
import subprocess
import sys

SPACY_MODEL = "en_core_web_sm"

try:
    importlib.import_module(SPACY_MODEL)
except Exception:
    # `except Exception` (rather than a bare `except`) still treats a missing
    # or broken model package as "needs download", but lets KeyboardInterrupt
    # and SystemExit propagate so the cell can be interrupted.
    # sys.executable is the interpreter running this kernel; a bare "python"
    # resolved from PATH is not guaranteed to be the same environment.
    download = subprocess.run(
        [sys.executable, "-m", "spacy", "download", SPACY_MODEL]
    )
    model_ready = download.returncode == 0
else:
    model_ready = True

# Only claim success when the model was found or the download exited cleanly.
if model_ready:
    print("All required libraries installed & verified.")
else:
    print(
        f"WARNING: could not download spaCy model '{SPACY_MODEL}'; "
        "spacy.load() will fail until it is installed."
    )

All required libraries installed & verified.


# **3. Import Dependencies**

In [3]:
# ====================================
# PHASE 3 — CELL 3: Import Dependencies
# ====================================

import os
import re
import time
import joblib
import random
import requests
import tldextract
import datetime
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from textblob import TextBlob
import textstat
import spacy

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample

# Load the spaCy pipeline once at module level; extract_features() calls this
# `nlp` object on each page's cleaned text and counts the resulting `.ents`.
nlp = spacy.load("en_core_web_sm")

print("Phase 3 dependencies successfully imported.")

Phase 3 dependencies successfully imported.


# **4. Full Retraining Code**

In [4]:
# =====================================
# PHASE 3 — CELL 4: FULL RETRAINING CODE
# =====================================

# ---------------------------
# Step 1: Real website list
# ---------------------------
# Legitimate storefront / service URLs; every entry is later labelled 0.
real_sites = [
    "https://amazon.in",
    "https://amazon.com",
    "https://flipkart.com",
    "https://myntra.com",
    "https://ajio.com",
    "https://meesho.com",
    "https://nykaa.com",
    "https://paytm.com",
    "https://bigbasket.com",
    "https://reliancedigital.in",
    "https://snapdeal.com",
    "https://swiggy.com",
    "https://zomato.com",
    "https://ola.in",
    "https://uber.com",
]


# ---------------------------
# Step 2: Fake website generator
# ---------------------------
fake_patterns = ["-discount", "-sale", "-store", "-offer", "-deal", "-promo"]
suspicious_tlds = ["xyz", "shop", "online", "top", "buzz"]
brands = ["amazonn", "flipkarrt", "myntraa", "bestbuys", "shopkart", "superdeal"]


def generate_fake_sites(count=300, seed=42):
    """Build ``count`` synthetic scam-style URLs, reproducibly.

    Each URL has the form ``https://<brand><pattern><1-999>.<tld>`` with the
    pieces drawn from ``brands``, ``fake_patterns`` and ``suspicious_tlds``.

    Args:
        count: Number of URLs to generate.
        seed: Seed for a private ``random.Random`` instance. Using a private,
            seeded generator makes the dataset identical across kernel
            restarts and leaves the global ``random`` state untouched.

    Returns:
        A list of ``count`` URL strings (duplicates are possible).
    """
    rng = random.Random(seed)
    sites = []
    for _ in range(count):
        brand = rng.choice(brands)
        pattern = rng.choice(fake_patterns)
        tld = rng.choice(suspicious_tlds)
        sites.append(f"https://{brand}{pattern}{rng.randint(1, 999)}.{tld}")
    return sites


fake_sites = generate_fake_sites()


# ---------------------------
# Step 3: Text scraping
# ---------------------------
def fetch_text(url):
    """Return the text content of the page at ``url``, or a canned blurb on failure.

    The page is fetched with a 6-second timeout and a custom User-Agent, parsed
    with BeautifulSoup's ``html.parser``, and flattened to text with single
    spaces between nodes.

    Args:
        url: Address to request.

    Returns:
        The extracted page text, or a fixed scam-style sentence if anything in
        the fetch/parse path raises.

    Notes:
        * The HTTP status code is not inspected, so the body of an error
          response is parsed and returned like any other page.
        * NOTE(review): the fallback is returned for *any* failing URL, not
          only the synthetic fakes, so an unreachable real site ends up with
          scam-like text.
    """
    fallback = "LIMITED OFFER! Buy now and get 90% discount. Free shipping."
    try:
        headers = {"User-Agent": "Mozilla/5.0 ScamScanBot"}
        r = requests.get(url, timeout=6, headers=headers)
        soup = BeautifulSoup(r.text, "html.parser")
        return soup.get_text(separator=" ")
    except Exception:
        # `Exception` keeps the best-effort fallback for network/parse errors
        # while letting KeyboardInterrupt/SystemExit stop a long scraping loop,
        # which a bare `except:` would swallow.
        return fallback


# ---------------------------
# Step 4: Feature extraction
# ---------------------------
def preprocess(text):
    """Normalize raw page text for keyword matching and vectorization.

    Characters other than ASCII letters, digits, whitespace, ``%`` and ``-``
    become spaces; the result is lower-cased, whitespace runs are collapsed to
    a single space, and the ends are trimmed. Falsy input (``None`` or ``""``)
    yields ``""``.
    """
    if text:
        kept = re.sub(r"[^A-Za-z0-9\s%-]", " ", text).lower()
        return re.sub(r"\s+", " ", kept).strip()
    return ""


def extract_features(url, raw):
    """Build one feature record for a URL and the raw text scraped from it.

    Args:
        url: The site address as a string.
        raw: Unprocessed page text; it is cleaned with ``preprocess``.

    Returns:
        A dict holding the inputs (``url``, ``raw_text``, ``clean_text``),
        URL-shape counts, domain/suffix lengths from ``tldextract``, a
        placeholder ``domain_age_days`` of -1, and text statistics derived
        from the cleaned text.
    """
    clean = preprocess(raw)
    parts = tldextract.extract(url)

    digit_total = sum(ch.isdigit() for ch in url)
    special_total = len(re.findall(r"[\W_]", url))

    # Substring hits, so e.g. "offers" also counts for "offer".
    scam_terms = ["discount", "offer", "free", "promo", "sale"]
    keyword_hits = sum(term in clean for term in scam_terms)

    # Skip the spaCy pass entirely when there is no text to analyse.
    if clean:
        entity_total = len(nlp(clean).ents)
    else:
        entity_total = 0

    features = {}
    features["url"] = url
    features["raw_text"] = raw
    features["clean_text"] = clean
    features["url_length"] = len(url)
    features["num_digits"] = digit_total
    features["num_special_chars"] = special_total
    features["has_https"] = int(url.startswith("https"))
    features["domain_len"] = len(parts.domain)
    features["suffix_len"] = len(parts.suffix)
    features["domain_age_days"] = -1   # WHOIS skipped to avoid rate limits
    features["text_length"] = len(clean.split())
    features["scam_keyword_score"] = keyword_hits
    features["entity_count"] = entity_total
    return features


# ---------------------------
# Step 5: Build Dataset
# ---------------------------
records = []

# Each group is (urls, label, pause in seconds after every request):
#   REAL sites -> label 0, one-second pause between requests
#   FAKE sites -> label 1, no pause
for site_group, site_label, delay_s in ((real_sites, 0, 1), (fake_sites, 1, 0)):
    for url in site_group:
        feat = extract_features(url, fetch_text(url))
        feat["label"] = site_label
        records.append(feat)
        if delay_s:
            time.sleep(delay_s)

# One row per URL, built in a single pass from the collected dicts.
df = pd.DataFrame(records)
print("Dataset created:", df.shape)


# ---------------------------
# Step 6: TF-IDF Vectorization
# ---------------------------
# Learn a vocabulary capped at 500 terms and vectorize the cleaned text.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so held-out rows contribute to the vocabulary/IDF
# statistics used for training.
tfidf = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf.fit_transform(df["clean_text"])


# ---------------------------
# Step 7: Final training matrix
# ---------------------------
from scipy.sparse import hstack

numeric_cols = [
    "url_length",
    "num_digits",
    "num_special_chars",
    "has_https",
    "domain_len",
    "suffix_len",
    "domain_age_days",
    "text_length",
    "scam_keyword_score",
    "entity_count",
]

# Numeric URL/text features first, TF-IDF columns after them.
X_num = df[numeric_cols].to_numpy()
X = hstack([X_num, tfidf_matrix])
y = df["label"].to_numpy()


# ---------------------------
# Step 8: Train-test split
# ---------------------------
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)


# ---------------------------
# Step 9: Train RandomForest
# ---------------------------
# fit() returns the estimator itself, so construction and training chain.
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)


# ---------------------------
# Step 10: Metrics
# ---------------------------
print("\n===== MODEL EVALUATION =====")
# All four scorers are called the same way, so drive the report from a table.
for metric_label, scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, scorer(y_test, y_pred))
print(classification_report(y_test, y_pred))


# ---------------------------
# Step 11: Save Model + TF-IDF + Dataset
# ---------------------------
# Write the fitted classifier and vectorizer with joblib, then the feature
# table as CSV; all three paths are relative to the working directory.
for artifact, filename in (
    (rf, "scamscan_model_retrained.pkl"),
    (tfidf, "tfidf_vectorizer_retrained.pkl"),
):
    joblib.dump(artifact, filename)
df.to_csv("df_final_retrained.csv", index=False)

print("\nModels & dataset saved successfully!")

Dataset created: (315, 14)

===== MODEL EVALUATION =====
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00        75

    accuracy                           1.00        79
   macro avg       1.00      1.00      1.00        79
weighted avg       1.00      1.00      1.00        79


Models & dataset saved successfully!
