In [5]:
import pandas as pd
import numpy as np
import re
import unicodedata
import os
import joblib
import xgboost as xgb
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm

MAX_FEATURE_URLS = None
CHECKPOINT_DIR = "."

RAW_KEYWORDS = [
    'login', 'secure', 'update', 'verify', 'account', 'bank', 'signin', 'submit',
    'paypal', 'ebay', 'confirm', 'wp', 'mail', 'admin', '88', '365', 'bet', '68', '86',
    'xoso', 'casino', 'bong88', 'banca', '1xbet', 'sex', 'jav', 'xxx', 'phim', 'phim18',
    'phimmoi', '18+', 'hdsex', 'livechat', 'gai goi', 'xo so', 'tructiepbongda',
    'lo de', 'da ga', 'keo bong', 'phim cap 3', 'phim sex', 'phim jav','phimlau','vay tien','giai ngan','789','vay','tien','co bac',
    'sicbo','baccarat','blackjack','sanh game','no hu','tai xiu','xoc dia','game bai','porn','phimmoiz', 'phimhd','phimbo', 'vietsub',
    'tra gop','lo to','da ga','rut tien','hentai','lon','dit','xx','trung thuong','trung iphone','quay so','qua tang',
    'nhan thuong','khuyen mai'
]

def remove_vietnamese_diacritics(text):
    return re.sub(r'\s+', '', ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn')).lower()

SUSPICIOUS_KEYWORDS = [remove_vietnamese_diacritics(w) for w in RAW_KEYWORDS]

def normalize_url(url):
    return url if url.startswith("http") else "http://" + url

def strip_scheme_www(url):
    parsed = urlparse(normalize_url(url))
    return parsed.netloc.replace("www.", "") + parsed.path + (f"?{parsed.query}" if parsed.query else "")

class URLFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X):
        return np.array([
            [
                len(url),
                url.count('.'),
                url.count('-'),
                int(bool(re.search(r'(\d{1,3}\.){3}\d{1,3}', url))),
                sum(1 for word in SUSPICIOUS_KEYWORDS if word in url.lower())
            ]
            for url in tqdm(X, desc="Trích URL feature")
        ])

class TitleFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, title_mapping=None):
        self.title_mapping = title_mapping or {}
    def fit(self, X, y=None): return self
    def transform(self, X):
        features = []
        for url in X:
            s_url = strip_scheme_www(url)
            title = self.title_mapping.get(s_url, '')
            title_clean = remove_vietnamese_diacritics(title)
            keyword_count = sum(1 for word in SUSPICIOUS_KEYWORDS if word in title_clean)
            features.append([len(title_clean), keyword_count])
        return np.array(features)

def load_datasets():
    print("\nĐang tải và gộp dữ liệu...")

    df1 = pd.read_csv("/kaggle/input/dataset/openphish.csv")[['url']].assign(label=1, title='', source='openphish')
    df2 = pd.read_csv("/kaggle/input/dataset/urlhaus.csv")[['url']].assign(label=1, title='', source='urlhaus')
    df3 = pd.read_csv("/kaggle/input/dataset/phishtank.csv")[['url']].assign(label=1, title='', source='phishtank')
    df4 = pd.read_csv("/kaggle/input/dataset/legit.csv", header=None, names=["index", "domain"])
    df4['url'] = "http://" + df4['domain'].astype(str)
    df4 = df4[['url']].assign(label=0, title='', source='legit')
    df5 = pd.read_csv("/kaggle/input/dataset/phishing_dataset_with_titles (2).csv").fillna({'title': ''})
    df5['source'] = 'phishing_dataset'
    df_black = pd.read_csv("/kaggle/input/dataset/blackbook.csv", header=None, names=['domain'])
    df_black['url'] = "http://" + df_black['domain'].astype(str)
    df_black = df_black[['url']].assign(label=1, title='', source='blackbook')

    df_phish = pd.concat([df1, df2, df3, df5[df5['label'] == 1], df_black], ignore_index=True).drop_duplicates(subset='url')
    df_legit = pd.concat([df4, df5[df5['label'] == 0]], ignore_index=True).drop_duplicates(subset='url')

    min_len = min(len(df_phish), len(df_legit))
    df_phish = df_phish.sample(n=min_len, random_state=42)
    df_legit = df_legit.sample(n=min_len, random_state=42)

    df = pd.concat([df_phish, df_legit], ignore_index=True)
    df['url'] = df['url'].astype(str).apply(strip_scheme_www)
    df['title'] = df['title'].fillna('')

    print(f"Dữ liệu sau cân bằng: {len(df_phish)} URL nguy hiểm, {len(df_legit)} URL an toàn")

    title_mapping = dict(zip(df['url'], df['title']))
    return df[['url', 'label']], title_mapping

def train_model():
    df, title_mapping = load_datasets()
    X = df['url']
    y = df['label']

    print("Trích xuất đặc trưng...")
    vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), max_features=5000)
    feature_union = FeatureUnion([
        ('tfidf', vectorizer),
        ('url_custom', URLFeatureExtractor()),
        ('title_custom', TitleFeatureExtractor(title_mapping=title_mapping))
    ])

    if MAX_FEATURE_URLS:
        X = X[:MAX_FEATURE_URLS]
        y = y[:MAX_FEATURE_URLS]

    X_features = feature_union.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    params = {
        "max_depth": 20,
        "objective": "binary:logistic",
        "tree_method": "hist",
        "device": "cuda",
        "eval_metric": "logloss",
        "verbosity": 3
    }

    total_rounds = 1000
    round_step = 50

    existing_checkpoints = sorted([
        int(f.replace("checkpoint_", "").replace(".json", ""))
        for f in os.listdir(CHECKPOINT_DIR)
        if f.startswith("checkpoint_") and f.endswith(".json")
    ])
    start_round = max(existing_checkpoints) if existing_checkpoints else 0
    model = None

    print(f"Bắt đầu từ vòng {start_round + 1}")
    for i in range(start_round, total_rounds, round_step):
        print(f"Huấn luyện vòng {i+1} đến {i+round_step}...")
        prev_model_path = os.path.join(CHECKPOINT_DIR, f"checkpoint_{i}.json") if i > 0 else None

        model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=round_step,
            xgb_model=prev_model_path,
            evals=[(dtest, "eval")],
            early_stopping_rounds=30
        )

        ckpt_path = os.path.join(CHECKPOINT_DIR, f"checkpoint_{i + round_step}.json")
        model.save_model(ckpt_path)
        print(f"Đã lưu checkpoint: {ckpt_path}")

    print("Đánh giá mô hình:")
    y_pred = (model.predict(dtest) > 0.5).astype(int)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    model.save_model("final_model.json")
    joblib.dump(model, "model.pkl")
    joblib.dump(feature_union, "vectorizer.pkl")
    print("Đã lưu model, model.pkl và vectorizer.pkl.")

if __name__ == "__main__":
    train_model()



Đang tải và gộp dữ liệu...
Dữ liệu sau cân bằng: 1439128 URL nguy hiểm, 1439128 URL an toàn
Trích xuất đặc trưng...


Trích URL feature: 100%|██████████| 2878256/2878256 [00:48<00:00, 59413.81it/s] 


Bắt đầu từ vòng 1
Huấn luyện vòng 1 đến 50...
[16:37:35] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[16:37:35] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[16:37:36] INFO: /workspace/src/data/simple_dmatrix.cc:137: Generating new Ellpack page.
[16:37:36] MakeCuts: 0.005274s, 1 calls @ 5274us

[16:37:36] Prune: 0.004351s, 1 calls @ 4351us

[16:37:36] ScanInput: 0.068288s, 1 calls @ 68288us

[16:37:36] Unique: 0.00392s, 1 calls @ 3920us

[0]	eval-logloss:0.48116
[1]	eval-logloss:0.36460
[2]	eval-logloss:0.29059
[3]	eval-logloss:0.24101
[4]	eval-logloss:0.20746
[5]	eval-logloss:0.18356
[6]	eval-logloss:0.16678
[7]	eval-logloss:0.15522
[8]	eval-logloss:0.14659
[9]	eval-logloss:0.13918
[10]	eval-logloss:0.13423
[11]	eval-logloss:0.13108
[12]	eval-logloss:0.12809
[13]	eval-logloss:0.12502
[14]	eval-logloss:0.12304
[15]	eval-logloss:0.12169
[16]	eval-logloss:0.12052
[17]	eval-logloss:0.11880
[18]	eval-logloss:0.11721
[19]	eval-logloss:0.11630
[