# **1. Mount Google Drive**

In [1]:
# ============================================
# 📌 PHASE 2 — CELL 1: Mount Google Drive
# ============================================
# Colab-only step: attaches Google Drive to the notebook VM so the project
# files under /content/drive become readable and writable.

from google.colab import drive

drive.mount(mountpoint='/content/drive')

print("Drive mounted successfully!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive mounted successfully!


# **2. Install Required Libraries**

In [2]:
# ============================================
# ðŸ“Œ PHASE 2 â€” CELL 2: Install Required Libraries
# ============================================

!pip install scikit-learn joblib --quiet

print("Libraries installed successfully!")

Libraries installed successfully!


# **3. Import Dependencies**

In [3]:
# ============================================
# 📌 PHASE 2 — CELL 3: Import Dependencies
# ============================================

import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, classification_report
)
from scipy.sparse import hstack

print("All Phase-2 dependencies imported successfully!")

All Phase-2 dependencies imported successfully!


# **4. Full ML Training Pipeline**

In [4]:
# ============================================================
# 📌 PHASE 2 — CELL 4: FULL ML TRAINING PIPELINE
# ============================================================
#
# Loads the Phase-1 artefacts from Drive, joins the hand-crafted numeric
# columns with the TF-IDF matrix into one sparse feature matrix, trains a
# RandomForestClassifier, prints hold-out metrics and saves the fitted model.

# ---------------------------
# 0. Configuration
# ---------------------------

base_dir = "/content/drive/MyDrive/scamscan_project"

RANDOM_STATE = 42    # shared seed for the split and the forest
TEST_SIZE = 0.25     # fraction of rows held out for evaluation
N_ESTIMATORS = 250   # number of trees in the forest
MAX_DEPTH = 25       # maximum depth of each tree

# Numeric columns of df_final that are used as features. The order defines
# the column order of the feature matrix, so inference code must build its
# input in the same order.
NUMERIC_COLUMNS = [
    "url_length",
    "num_digits",
    "num_special_chars",
    "has_https",
    "domain_age_days",
    "text_length",
    "scam_keyword_score",
    "sentiment",
    "readability",
    "entity_count",
]
LABEL_COLUMN = "label"


# ---------------------------
# 1. Load Phase-1 Outputs
# ---------------------------

df_final = pd.read_csv(f"{base_dir}/outputs/df_final.csv")

# SECURITY: joblib.load unpickles arbitrary Python objects and can execute
# code while doing so. Only load artefacts that this project produced.
tfidf = joblib.load(f"{base_dir}/models/tfidf_vectorizer.pkl")
tfidf_matrix = joblib.load(f"{base_dir}/models/tfidf_matrix.pkl")

print("Loaded df_final:", df_final.shape)
print("Loaded TF-IDF matrix:", tfidf_matrix.shape)

# The two artefacts are stacked side by side below, so row i of the TF-IDF
# matrix must describe row i of df_final. Fail early with a readable message
# if they were not produced by the same Phase-1 run.
if tfidf_matrix.shape[0] != len(df_final):
    raise ValueError(
        f"Row mismatch: df_final has {len(df_final)} rows but the TF-IDF "
        f"matrix has {tfidf_matrix.shape[0]}; both must come from the same "
        "Phase-1 run."
    )


# ---------------------------
# 2. Select Numeric Features
# ---------------------------

# Force a float array: with mixed int/float/bool columns `.values` can fall
# back to an object array, which cannot be stacked with a sparse matrix.
numeric_features = df_final[NUMERIC_COLUMNS].to_numpy(dtype=float)

labels = df_final[LABEL_COLUMN].to_numpy()


# ---------------------------
# 3. Combine Numeric + TF-IDF
# ---------------------------

# Request CSR explicitly: hstack returns COO by default, which does not
# support the row indexing needed for splitting.
X = hstack([numeric_features, tfidf_matrix], format="csr")
y = labels

print("Final feature matrix shape:", X.shape)


# ---------------------------
# 4. Train/Test Split
# ---------------------------

# Stratify so both splits keep the class balance of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)


# ---------------------------
# 5. Train Model
# ---------------------------

model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    random_state=RANDOM_STATE
)

model.fit(X_train, y_train)
print("\n🎉 Model training complete!")


# ---------------------------
# 6. Evaluate Model
# ---------------------------

# NOTE(review): the TF-IDF matrix is loaded pre-computed, so the vectorizer
# was presumably fitted on all rows, including those that land in the test
# split here. If so, these hold-out scores are optimistic. Confirm against
# Phase 1 and treat perfect scores on a small test set with suspicion.
y_pred = model.predict(X_test)

print("\n📌 MODEL PERFORMANCE")
print("-------------------------")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1 Score :", f1_score(y_test, y_pred))

print("\nClassification Report:\n", classification_report(y_test, y_pred))


# ---------------------------
# 7. Save Model
# ---------------------------

model_path = f"{base_dir}/models/scamscan_model.pkl"
joblib.dump(model, model_path)

print("\n💾 Model saved successfully at:", model_path)
print("\n🚀 PHASE 2 COMPLETED SUCCESSFULLY!")

Loaded df_final: (200, 15)
Loaded TF-IDF matrix: (200, 37)
Final feature matrix shape: (200, 47)

ðŸŽ‰ Model training complete!

ðŸ“Œ MODEL PERFORMANCE
-------------------------
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        25
           1       1.00      1.00      1.00        25

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50


ðŸ’¾ Model saved successfully at: /content/drive/MyDrive/scamscan_project/models/scamscan_model.pkl

ðŸš€ PHASE 2 COMPLETED SUCCESSFULLY!
