Cell 1 — Install & Import

In [None]:
!pip uninstall -y jax jaxlib > /dev/null 2>&1 || true

import os
import glob
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from collections import Counter


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Input, Embedding, Bidirectional, LSTM,
    Dense, Dropout, BatchNormalization, Concatenate, SpatialDropout1D
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras import regularizers


# Single seed shared by the notebook: it seeds NumPy and TensorFlow here and is
# passed as `random_state` by the sampling and train/test split cells further down.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Report the TensorFlow version actually imported by this kernel.
print(f"‚úî Libraries loaded. TensorFlow Version: {tf.__version__}")

Feature Extractor

In [None]:

def calculate_entropy(text):
    """Return the Shannon entropy of ``text`` in bits per character.

    Args:
        text: Any sized iterable of hashable items (normally a URL string).

    Returns:
        ``0`` for a falsy input (empty string, ``None``); otherwise the
        entropy computed from the relative frequency of each character.
    """
    if not text:
        return 0

    size = len(text)
    # Relative frequency of every distinct character, in first-seen order.
    probabilities = [freq / size for freq in Counter(text).values()]

    bits = 0
    for prob in probabilities:
        bits -= prob * math.log2(prob)
    return bits

def extract_features(url):
    """Turn a URL into the 13 hand-crafted numeric features used for training.

    Feature order: total length; counts of ``.``, ``-``, ``@``, ``?``, ``&``,
    ``=``, ``_``; number of digit characters; character entropy; then 0/1
    flags for the substrings ``https``, ``http`` and ``www``.

    Args:
        url: The URL; it is coerced with ``str()`` first.

    Returns:
        A plain list of 13 numbers in the order described above.
    """
    text = str(url)

    # One count per punctuation symbol, in the order the model expects.
    symbol_counts = [text.count(symbol) for symbol in ".-@?&=_"]
    digit_total = sum(1 for ch in text if ch.isdigit())
    # Substring presence flags (note: any "https" URL also sets the "http" flag).
    token_flags = [int(token in text) for token in ("https", "http", "www")]

    return [
        len(text),
        *symbol_counts,
        digit_total,
        calculate_entropy(text),
        *token_flags,
    ]

print("‚úî Feature extraction functions ready")

Load CSVs

In [None]:
# Discover every CSV in the working directory whose name matches urls_*.csv.
csv_files = sorted(glob.glob("urls_*.csv"))
print(f" ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå CSV: {len(csv_files)} ‡πÑ‡∏ü‡∏•‡πå")

# Read each file independently; a file that fails to parse is reported and
# skipped (best effort), and malformed rows are dropped by on_bad_lines='skip'.
dfs = []
for fp in csv_files:
    try:
        df_temp = pd.read_csv(fp, on_bad_lines='skip')
        dfs.append(df_temp)
    except Exception as e:
        print(f"‚ö† Skipping file {fp}: {e}")

# Nothing could be loaded: stop here rather than fail later on an empty concat.
if not dfs:
    raise ValueError("‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏• CSV ‡πÄ‡∏•‡∏¢ ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏≠‡∏±‡∏õ‡πÇ‡∏´‡∏•‡∏î‡πÑ‡∏ü‡∏•‡πå")

# Merge all files, keep only the two columns used downstream, and drop
# incomplete and duplicate rows.
# NOTE(review): a file without both "url" and "label" columns makes the column
# selection raise KeyError — confirm every input file has this schema.
full_df = pd.concat(dfs, ignore_index=True)
full_df = full_df[["url", "label"]].dropna().drop_duplicates()

print(f" ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏î‡∏¥‡∏ö‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î: {len(full_df):,} ‡πÅ‡∏ñ‡∏ß")

# Above 100,000 rows, keep a reproducible random SAMPLE_FRAC subset;
# smaller datasets are used in full.
SAMPLE_FRAC = 0.35
if len(full_df) > 100000:
    df = full_df.sample(frac=SAMPLE_FRAC, random_state=SEED)
    print(f"‡∏ï‡∏±‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÄ‡∏´‡∏•‡∏∑‡∏≠ {SAMPLE_FRAC*100}% ‡∏ï‡∏≤‡∏°‡πÄ‡∏á‡∏∑‡πà‡∏≠‡∏ô‡πÑ‡∏Ç")
else:
    df = full_df
    print("info ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ô‡πâ‡∏≠‡∏¢‡∏≠‡∏¢‡∏π‡πà‡πÅ‡∏•‡πâ‡∏ß ‡πÑ‡∏°‡πà‡∏ï‡∏±‡∏î‡∏≠‡∏≠‡∏Å")

print(f" Final Dataset Size: {len(df):,} Rows")
# Drop the full_df name and run a garbage-collection pass; `df` is the only
# frame later cells read.
del full_df
import gc
gc.collect()

Preprocessing

In [None]:
print(" Processing data... (This might take a moment)")

# Raw model inputs: URL strings and their still-unencoded labels.
urls = df["url"].astype(str).tolist()
labels_raw = df["label"].values

# Hand-crafted numeric features, one row per URL. The column count is reused
# as the size of the model's "features_input" and stored in hybrid_meta.pkl.
X_feat_raw = np.array([extract_features(u) for u in urls])
num_features = X_feat_raw.shape[1]

# Character-level, case-preserving tokenizer with an explicit OOV token.
# NOTE(review): it is fitted on ALL urls before the train/test split below, so
# test URLs contribute to the vocabulary — confirm this is acceptable.
tokenizer = Tokenizer(char_level=True, lower=False, oov_token="<OOV>")
tokenizer.fit_on_texts(urls)
sequences = tokenizer.texts_to_sequences(urls)
# presumably the +1 accounts for index 0 being reserved for padding — TODO confirm
vocab_size = len(tokenizer.word_index) + 1

# Every sequence is brought to MAX_LEN with padding added at the END
# (padding="post"); inference code must pad the same way.
MAX_LEN = 150
X_seq = pad_sequences(sequences, maxlen=MAX_LEN, padding="post")

# Encode labels as integers.
# NOTE(review): build_model ends in one sigmoid unit trained with
# binary_crossentropy, so this presumably expects exactly two classes — confirm.
le = LabelEncoder()
y = le.fit_transform(labels_raw)

# 80/20 split, stratified on the label, applied consistently to both inputs.
X_seq_train, X_seq_test, X_feat_train, X_feat_test, y_train, y_test = train_test_split(
    X_seq, X_feat_raw, y, test_size=0.2, random_state=SEED, stratify=y
)

# Scaling statistics come from the training rows only and are then applied to
# the test rows.
scaler = StandardScaler()
X_feat_train = scaler.fit_transform(X_feat_train)
X_feat_test = scaler.transform(X_feat_test)

print(" Preprocessing Complete!")
print(f"   Train shape: {X_seq_train.shape}")
print(f"   Vocab size: {vocab_size}")

Build Hybrid Model (Keras 3 Ready)

In [None]:
def build_model():
    """Assemble and compile the two-input hybrid URL classifier.

    The network has a character-sequence branch (embedding followed by a
    bidirectional LSTM) and a dense branch for the hand-crafted features; the
    two are concatenated and reduced to a single sigmoid output.

    Reads the notebook globals ``MAX_LEN``, ``vocab_size`` and
    ``num_features`` set by the preprocessing cell.

    Returns:
        A compiled Keras ``Model`` with inputs named ``url_input`` and
        ``features_input``, using Adam (lr 0.001), binary cross-entropy loss
        and an accuracy metric.
    """
    # --- Branch 1: padded character-id sequence ---
    url_input = Input(shape=(MAX_LEN,), dtype="int32", name="url_input")
    seq_branch = Embedding(input_dim=vocab_size, output_dim=50)(url_input)
    seq_branch = SpatialDropout1D(0.3)(seq_branch)
    seq_branch = Bidirectional(LSTM(64, return_sequences=False))(seq_branch)
    seq_branch = Dropout(0.5)(seq_branch)

    # --- Branch 2: scaled hand-crafted features ---
    feat_input = Input(shape=(num_features,), dtype="float32", name="features_input")
    feat_branch = Dense(64, activation="relu")(feat_input)
    feat_branch = BatchNormalization()(feat_branch)
    feat_branch = Dropout(0.4)(feat_branch)
    feat_branch = Dense(32, activation="relu")(feat_branch)

    # --- Fusion head: L2-regularised dense layer, then one probability ---
    fused = Concatenate()([seq_branch, feat_branch])
    head = Dense(64, activation="relu", kernel_regularizer=regularizers.l2(0.001))(fused)
    head = Dropout(0.5)(head)
    probability = Dense(1, activation="sigmoid")(head)

    hybrid = Model(inputs=[url_input, feat_input], outputs=probability)
    hybrid.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return hybrid

model = build_model()
model.summary()

Train Model

In [None]:
# Training hyper-parameters; BATCH_SIZE is reused by the evaluation cell.
BATCH_SIZE = 2048
EPOCHS = 20

# All three callbacks watch the validation loss.
callbacks = [
    # Stop after 4 epochs without improvement and roll back to the best weights.
    EarlyStopping(
        monitor="val_loss",
        patience=4,
        restore_best_weights=True,
        verbose=1
    ),
    # Halve the learning rate after 2 stagnant epochs, never going below 1e-5.
    ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=2,
        min_lr=0.00001,
        verbose=1
    ),
    # Write the model to best_hybrid_model.keras whenever val_loss improves.
    ModelCheckpoint(
        "best_hybrid_model.keras",
        monitor="val_loss",
        save_best_only=True,
        verbose=1
    )
]

print(f"üöÄ Start Training on {len(X_seq_train):,} samples...")
print(f"   Batch Size: {BATCH_SIZE}")

# Inputs are passed by the layer names declared in build_model; 10% of the
# training data is held out as the validation set the callbacks monitor.
history = model.fit(
    {"url_input": X_seq_train, "features_input": X_feat_train},
    y_train,
    validation_split=0.1,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)

Evaluate

In [None]:
print("\n Evaluating on Test Set...")

# Score the trained model on the held-out test split.
loss, accuracy = model.evaluate(
    {"url_input": X_seq_test, "features_input": X_feat_test},
    y_test,
    batch_size=BATCH_SIZE,
    verbose=1
)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Persist the model and every preprocessing object inference needs.
print(" Saving files...")
model.save("final_hybrid_model.keras")

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

# Shape metadata so an inference session can rebuild inputs without retraining.
meta = {"max_len": MAX_LEN, "num_features": num_features}
with open("hybrid_meta.pkl", "wb") as f:
    pickle.dump(meta, f)

print("All files saved successfully!")

# Browser download is a best-effort convenience. The Colab import lives inside
# the try block so that running outside Colab (where `google.colab` does not
# exist) degrades to the warning below instead of failing the cell after the
# files have already been written.
try:
    from google.colab import files
    files.download("best_hybrid_model.keras")
    files.download("tokenizer.pkl")
    files.download("scaler.pkl")
except Exception as e:
    print("‚ö† Auto-download failed (browser block?), please download manually from sidebar.")
    # Surface the underlying cause instead of silently discarding it.
    print(f"   Reason: {e!r}")

Save Model + Tools

In [None]:
# NOTE(review): this cell repeats the save steps already executed at the end of
# the Evaluate cell (same file names, same objects); re-running it simply
# overwrites those files.
model.save("final_hybrid_model.keras")

# Preprocessing objects required to reproduce the training-time inputs.
with open("tokenizer.pkl", "wb") as f: pickle.dump(tokenizer, f)
with open("scaler.pkl", "wb") as f: pickle.dump(scaler, f)
with open("label_encoder.pkl", "wb") as f: pickle.dump(le, f)

# Sequence length and feature count, read back by the loading cell.
meta = {"max_len": MAX_LEN, "num_features": num_features}
with open("hybrid_meta.pkl", "wb") as f: pickle.dump(meta, f)

print("‚úî All files saved!")


Load Saved Model + Tools (for Inference)

In [None]:
import pickle
import numpy as np
import tensorflow as tf
# Reload the saved artifacts for inference. The paths are hard-coded to
# /content — NOTE(review): presumably the Colab working directory the save
# cells wrote to; adjust when running elsewhere.
model = tf.keras.models.load_model("/content/final_hybrid_model.keras")

# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that this notebook itself produced.
with open("/content/tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

with open("/content/scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

# Bound to `label_encoder` here (the training cells call the same object `le`).
with open("/content/label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

with open("/content/hybrid_meta.pkl", "rb") as f:
    meta = pickle.load(f)

# Input shapes recorded at training time.
MAX_LEN = meta["max_len"]
FEATURE_LEN = meta["num_features"]

print("‚úÖ ‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÅ‡∏•‡∏∞‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î‡πÇ‡∏´‡∏•‡∏î‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à‡πÅ‡∏•‡πâ‡∏ß!")

In [None]:
import re
from urllib.parse import urlparse

def extract_features(url):
    """Build the feature vector for one URL exactly as it was built for training.

    The saved scaler and model were fitted on the 13 features produced by the
    training-time extractor. This inference-time version must therefore emit
    the same features in the same order; the previous 14-feature layout did
    not match what the scaler was fitted on.

    Feature order: total length; counts of ``.``, ``-``, ``@``, ``?``, ``&``,
    ``=``, ``_``; number of digit characters; Shannon entropy of the
    characters (bits); 0/1 flags for the substrings ``https``, ``http``,
    ``www``.

    Args:
        url: The URL; it is coerced with ``str()`` first.

    Returns:
        A 1-D float ``numpy.ndarray`` of length 13.
    """
    # Function-scope imports keep this inference cell usable on its own.
    import math
    from collections import Counter

    url = str(url)

    # Shannon entropy of the character distribution; 0 for an empty string.
    entropy = 0
    if url:
        length = len(url)
        for count in Counter(url).values():
            p = count / length
            entropy -= p * math.log2(p)

    features = [
        len(url),
        url.count('.'),
        url.count('-'),
        url.count('@'),
        url.count('?'),
        url.count('&'),
        url.count('='),
        url.count('_'),
        sum(c.isdigit() for c in url),
        entropy,
        1 if 'https' in url else 0,
        1 if 'http' in url else 0,
        1 if 'www' in url else 0,
    ]

    # Explicit float dtype so the array type does not depend on the values.
    return np.array(features, dtype=float)


In [None]:
def preprocess(url):
    """Convert one URL into the two arrays the hybrid model expects.

    Args:
        url: The URL to classify; it is coerced with ``str()`` so both
            branches see the same text.

    Returns:
        ``(seq, feat)`` where ``seq`` is the padded character-id sequence of
        shape ``(1, MAX_LEN)`` and ``feat`` is the scaled feature row of shape
        ``(1, n_features)``.

    Raises:
        ValueError: propagated from ``scaler.transform`` when the feature
            count differs from what the scaler was fitted on.
    """
    url = str(url)

    seq = tokenizer.texts_to_sequences([url])
    # The training data was padded with padding="post"; the library default
    # pads at the front, which would shift every character position relative
    # to what the model saw during training.
    seq = tf.keras.preprocessing.sequence.pad_sequences(
        seq, maxlen=MAX_LEN, padding="post"
    )

    # One sample as a single row: (1, n_features).
    feat = np.asarray(extract_features(url), dtype=float).reshape(1, -1)
    feat = scaler.transform(feat)

    return seq, feat


In [None]:
def predict_url(url):
    """Classify a single URL with the loaded hybrid model.

    The model built in this notebook ends in ONE sigmoid unit, so its output
    is the probability of the class encoded as 1. Taking ``argmax`` over a
    one-element vector always yields 0, which made every URL come out as
    class 0; the label is now chosen by thresholding at 0.5. Outputs with
    more than one score still use ``argmax``.

    Args:
        url: The URL to classify.

    Returns:
        ``(label_text, confidence, pred)``: the decoded class label, the
        probability of that class as a percentage, and the raw model output
        for this sample.
    """
    seq, feat = preprocess(url)

    pred = model.predict(
        {"url_input": seq, "features_input": feat},
        verbose=0
    )[0]

    scores = np.ravel(pred)
    if scores.size == 1:
        # Binary sigmoid head: the score is P(class 1).
        positive_prob = float(scores[0])
        label = int(positive_prob >= 0.5)
        # Confidence is the probability of the class actually chosen.
        confidence = (positive_prob if label == 1 else 1.0 - positive_prob) * 100
    else:
        # Multi-score output: pick the highest-scoring class.
        label = int(np.argmax(scores))
        confidence = float(np.max(scores)) * 100

    label_text = label_encoder.inverse_transform([label])[0]

    return label_text, confidence, pred

