In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

# Dataset read by train_model(); its first column is used as the URL text and
# its second column as the class label.
CSV_PATH = "malicious_phish.csv"
# File that train_model() writes the fitted pipeline + label encoder to, and
# that predict_url() loads them back from.
MODEL_PATH = "url_model.joblib"

def train_model():
    """Train the URL classifier and save it to ``MODEL_PATH``.

    Steps:
      1. Read ``CSV_PATH`` (first column = URL, second column = label).
      2. Discard rows whose URL or label is missing.
      3. Discard classes with fewer than two samples, since the stratified
         split below needs at least two members per class.
      4. Fit a character n-gram TF-IDF + random-forest pipeline on a
         stratified 80 % split and print the accuracy on the remaining 20 %.
      5. Dump ``{"model": pipeline, "encoder": LabelEncoder}`` with joblib.

    Returns:
        None. The result is the file written to ``MODEL_PATH``.

    Raises:
        IndexError: if the CSV has fewer than two columns.
        Whatever ``pd.read_csv``, ``train_test_split`` or ``joblib.dump``
        raise (for example a missing file, or too little data to split).
    """
    print("📦 Loading data...")
    df = pd.read_csv(CSV_PATH, encoding="latin1")  # avoid UTF-8 decode errors
    df.columns = df.columns.str.strip()  # remove accidental spaces

    # Columns are taken by position: first is the URL, second is the label.
    raw_urls = df.iloc[:, 0]
    raw_labels = df.iloc[:, 1]

    # Filter incomplete rows BEFORE casting to str: astype(str) would turn a
    # missing value into the literal text "nan", which would then be trained
    # on as a real URL or, worse, as a bogus "nan" class.
    complete = raw_urls.notna() & raw_labels.notna()
    X = raw_urls[complete].astype(str)
    y = raw_labels[complete].astype(str)

    # Drop classes with fewer than 2 samples
    class_counts = y.value_counts()
    valid_classes = class_counts[class_counts >= 2].index
    mask = y.isin(valid_classes)
    X = X[mask]
    y = y[mask]

    print(f"Remaining classes after filtering: {y.nunique()}")

    # The encoder is saved next to the model so that predictions can be
    # mapped back to the original label strings.
    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
    )

    print("🔍 Training model...")
    model = make_pipeline(
        TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 6), max_features=30000),
        RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
    )

    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    print(f"✅ Accuracy: {acc:.2%}")

    joblib.dump({"model": model, "encoder": le}, MODEL_PATH)
    print(f"💾 Model saved to {MODEL_PATH}")

def predict_url(url):
    """Return the predicted class label for a single URL string.

    The bundle saved by ``train_model`` is loaded from ``MODEL_PATH`` on every
    call; its pipeline produces an encoded class, which the stored label
    encoder maps back to the original label.

    NOTE: ``joblib.load`` unpickles the file, so only point ``MODEL_PATH`` at
    files from a trusted source.
    """
    saved = joblib.load(MODEL_PATH)
    classifier, label_encoder = saved["model"], saved["encoder"]

    # The pipeline expects an iterable of documents, so wrap the single URL.
    encoded = classifier.predict([url])
    decoded = label_encoder.inverse_transform([encoded[0]])
    return decoded[0]

# ==== Example usage ====
# Train first: predict_url() loads the file that train_model() writes.
train_model()
# Classify one sample URL and print the decoded label.
print("🔮 Prediction:", predict_url("https://paypal-login.example.com"))


📦 Loading data...
Remaining classes after filtering: 4
🔍 Training model...
