In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# dataset stored on Drive can be opened through a regular file path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from collections import Counter
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from tqdm import tqdm
import joblib
import pickle
import time
import os
import warnings

# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden as well.
warnings.filterwarnings("ignore")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Read the Clean_Data_V2 CSV from the mounted Drive folder and display the
# resulting frame as the cell output.
CLEAN_DATA_CSV = '/content/drive/MyDrive/Classroom/IS252/Đồ Án Data Mining/Click_Through_Rate Prediction Data/Clean_Data_V2.csv'
data = pd.read_csv(CLEAN_DATA_CSV)
data

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Gender,Country,Clicked on Ad,month,day,hour
0,62.26,32.0,69481.85,172.83,96,234,1,174,0,5,8,21
1,41.73,31.0,61840.26,207.17,301,460,1,166,0,0,15,17
2,44.40,30.0,57877.15,172.83,484,379,0,71,0,5,28,10
3,59.88,28.0,56180.93,207.17,24,269,0,205,0,5,20,14
4,49.21,30.0,54324.73,201.58,484,495,0,149,1,6,20,10
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,41.73,31.0,61840.26,207.17,353,460,1,166,1,0,2,3
9996,41.73,28.0,51501.38,120.49,241,177,1,105,0,4,27,12
9997,55.60,39.0,38067.08,124.44,222,316,0,48,0,0,4,11
9998,46.61,50.0,43974.49,123.13,396,321,0,108,1,3,3,7


## Define input features and target

In [None]:
# Separate the target column (cast to int) from the remaining predictor columns.
TARGET_COLUMN = 'Clicked on Ad'
y = data[TARGET_COLUMN].astype(int)
X = data.drop(columns=[TARGET_COLUMN])

## Splitting data to train/test

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Define the classifier models

In [None]:
# Baseline gradient-boosting classifiers, all seeded with 42 and allowed to use every CPU core.
# `use_label_encoder` is no longer passed: XGBoost reports it as unused
# ('Parameters: { "use_label_encoder" } are not used.'), so it only added log noise.
xgb = XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss')
# LightGBM with balanced class weights; verbosity=-1 silences its logging.
lgbm = LGBMClassifier(class_weight='balanced', random_state=42, n_jobs=-1, verbosity=-1)
# CatBoost with square-root-balanced class weights and no training log.
cat = CatBoostClassifier(thread_count=-1, random_seed=42, auto_class_weights='SqrtBalanced', verbose=0)
# Order here is the order in which the baselines are trained and reported.
CLASSIFIERS = [xgb, lgbm, cat]

In [None]:
# Fit each baseline classifier on the training split and report its accuracy
# on both the training and the test split.
for model in tqdm(CLASSIFIERS):
    name = model.__class__.__name__
    print(f"{name}")

    model.fit(X_train, y_train)

    # Predictions are kept in named variables so they stay inspectable after the loop.
    y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)

    print(f"Accuracy Train: {acc_train:.4f} | Accuracy Test: {acc_test:.4f}\n")
    print("==" * 30)

  0%|          | 0/3 [00:00<?, ?it/s]

XGBClassifier


Parameters: { "use_label_encoder" } are not used.



Accuracy Train: 0.9870 | Accuracy Test: 0.8790

LGBMClassifier
Accuracy Train: 0.9349 | Accuracy Test: 0.8725

CatBoostClassifier
Accuracy Train: 0.9289 | Accuracy Test: 0.8680



## Hyperparameter tuning

In [None]:
# Search space for XGBClassifier: every key is a constructor argument and every
# list holds the candidate values that the grid search combines exhaustively.
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 7, 10, 15, None],
    learning_rate=[0.1, 0.2, 0.5],
    subsample=[0.5, 0.7, 1.0],
    colsample_bytree=[0.4, 0.5, 0.6],
    min_child_weight=[0.5, 1, 3],
    gamma=[0, 0.1, 0.2, 0.3],
)

# Search space for LGBMClassifier: every key is a constructor argument and every
# list holds the candidate values that the grid search combines exhaustively.
param_grid_lgbm = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 7, 10, None],
    'learning_rate': [0.1, 0.2, 0.5],
    'num_leaves': [20, 31, 40],
    'subsample': [0.5, 1.0],
    # LightGBM only applies row subsampling when the bagging frequency is non-zero.
    # Without this entry both `subsample` values train identical models and half
    # of the grid is wasted. A single-value list does not enlarge the grid.
    'subsample_freq': [1],
    'colsample_bytree': [0.5, 1.0],
    'min_child_samples': [5, 10, 20],
    'reg_alpha': [0.1, 0.5],
    'reg_lambda': [0.1, 0.5]
}

# Search space for CatBoostClassifier: every key is a constructor argument and
# every list holds the candidate values that the grid search combines exhaustively.
param_grid_cat = {
    'iterations': [100, 200, 300],
    # `None` was removed from this list: it makes CatBoost fall back to its
    # default depth of 6, which is already a candidate, so a quarter of the
    # grid re-trained identical models.
    'depth': [6, 8, 10],
    'learning_rate': [0.1, 0.2, 0.5],
    'l2_leaf_reg': [3, 5, 7],
    'bagging_temperature': [0.5, 1],
    'random_strength': [1, 5, 10],
    'border_count': [32, 64, 128],
    'scale_pos_weight': [1, 2, 3]
}

In [None]:
from sklearn.model_selection import ParameterGrid

## XGB

In [None]:
# Exhaustive grid search for XGBClassifier.
#
# Candidates are compared on a validation split carved out of the training data.
# Selecting hyperparameters on X_test (as this cell did before) tunes the model
# to the test set and makes the reported test accuracy optimistically biased.
X_fit_xgb, X_val_xgb, y_fit_xgb, y_val_xgb = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

best_score_xgb = 0      # best validation accuracy seen so far
best_params_xgb = None  # parameter dict that produced best_score_xgb
best_model_xgb = None   # model belonging to best_params_xgb

param_list = list(ParameterGrid(param_grid_xgb))
with tqdm(total=len(param_list), desc="Grid Search Progress") as pbar:
    for params in param_list:
        model = XGBClassifier(random_state=42, **params)
        model.fit(X_fit_xgb, y_fit_xgb)

        score = accuracy_score(y_val_xgb, model.predict(X_val_xgb))

        # Strict '>' keeps the earliest candidate when several tie.
        if score > best_score_xgb:
            best_score_xgb = score
            best_params_xgb = params
            best_model_xgb = model

        pbar.update(1)

# Fail loudly instead of silently saving `None` when nothing was selected.
if best_params_xgb is None:
    raise RuntimeError("XGBoost grid search did not produce a model with accuracy above 0")

# Refit the winning configuration on the full training set, then touch the
# test set exactly once to obtain an unbiased estimate.
best_model_xgb = XGBClassifier(random_state=42, **best_params_xgb)
best_model_xgb.fit(X_train, y_train)
test_score_xgb = accuracy_score(y_test, best_model_xgb.predict(X_test))

# Save the best model
model_path_xgb = "best_xgb_model.joblib"
joblib.dump(best_model_xgb, model_path_xgb)

print("Best parameters found: ", best_params_xgb)
print("Best accuracy score: ", best_score_xgb)
print("Test accuracy of the refit best model: ", test_score_xgb)

Grid Search Progress: 100%|██████████| 4860/4860 [32:00<00:00,  2.53it/s]

Best parameters found:  {'colsample_bytree': 0.4, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': None, 'min_child_weight': 0.5, 'n_estimators': 200, 'subsample': 0.7}
Best accuracy score:  0.899





## LightGBM

In [None]:
# Exhaustive grid search for LGBMClassifier.
#
# Candidates are compared on a validation split carved out of the training data.
# Selecting hyperparameters on X_test (as this cell did before) tunes the model
# to the test set and makes the reported test accuracy optimistically biased.
X_fit_lgbm, X_val_lgbm, y_fit_lgbm, y_val_lgbm = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

best_score_lgbm = 0      # best validation accuracy seen so far
best_params_lgbm = None  # parameter dict that produced best_score_lgbm
best_model_lgbm = None   # model belonging to best_params_lgbm

param_list_lgbm = list(ParameterGrid(param_grid_lgbm))
with tqdm(total=len(param_list_lgbm), desc="Grid Search Progress for LGBM") as pbar:
    for params in param_list_lgbm:
        # verbosity=-1 matches the baseline LGBM model and stops LightGBM from
        # flooding the cell output with one log block per fit.
        model = LGBMClassifier(random_state=42, verbosity=-1, **params)
        model.fit(X_fit_lgbm, y_fit_lgbm)

        score = accuracy_score(y_val_lgbm, model.predict(X_val_lgbm))

        # Strict '>' keeps the earliest candidate when several tie.
        if score > best_score_lgbm:
            best_score_lgbm = score
            best_params_lgbm = params
            best_model_lgbm = model

        pbar.update(1)

# Fail loudly instead of silently saving `None` when nothing was selected.
if best_params_lgbm is None:
    raise RuntimeError("LightGBM grid search did not produce a model with accuracy above 0")

# Refit the winning configuration on the full training set, then touch the
# test set exactly once to obtain an unbiased estimate.
best_model_lgbm = LGBMClassifier(random_state=42, verbosity=-1, **best_params_lgbm)
best_model_lgbm.fit(X_train, y_train)
test_score_lgbm = accuracy_score(y_test, best_model_lgbm.predict(X_test))

# Save the best model
model_path_lgbm = "best_lgbm_model.joblib"
joblib.dump(best_model_lgbm, model_path_lgbm)

print("Best parameters found for LGBM: ", best_params_lgbm)
print("Best accuracy score for LGBM: ", best_score_lgbm)
print("Test accuracy of the refit best LGBM model: ", test_score_lgbm)
print(f"Best LGBM model saved to {model_path_lgbm}")

Output hidden; open in https://colab.research.google.com to view.

## CatBoost

In [None]:
# Exhaustive grid search for CatBoostClassifier.
#
# Candidates are compared on a validation split carved out of the training data.
# Selecting hyperparameters on X_test (as this cell did before) tunes the model
# to the test set and makes the reported test accuracy optimistically biased.
X_fit_cat, X_val_cat, y_fit_cat, y_val_cat = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

best_score_cat = 0      # best validation accuracy seen so far
best_params_cat = None  # parameter dict that produced best_score_cat
best_model_cat = None   # model belonging to best_params_cat

param_list_cat = list(ParameterGrid(param_grid_cat))
with tqdm(total=len(param_list_cat), desc="Grid Search Progress for CatBoost") as pbar:
    for params in param_list_cat:
        # verbose=0 matches the baseline CatBoost model; without it every fit
        # prints a per-iteration training log into the cell output.
        model = CatBoostClassifier(random_seed=42, verbose=0, **params)
        model.fit(X_fit_cat, y_fit_cat)

        score = accuracy_score(y_val_cat, model.predict(X_val_cat))

        # Strict '>' keeps the earliest candidate when several tie.
        if score > best_score_cat:
            best_score_cat = score
            best_params_cat = params
            best_model_cat = model

        pbar.update(1)

# Fail loudly instead of silently saving `None` when nothing was selected.
if best_params_cat is None:
    raise RuntimeError("CatBoost grid search did not produce a model with accuracy above 0")

# Refit the winning configuration on the full training set, then touch the
# test set exactly once to obtain an unbiased estimate.
best_model_cat = CatBoostClassifier(random_seed=42, verbose=0, **best_params_cat)
best_model_cat.fit(X_train, y_train)
test_score_cat = accuracy_score(y_test, best_model_cat.predict(X_test))

# Save the best model
model_path_cat = "best_cat_model.joblib"
joblib.dump(best_model_cat, model_path_cat)

print("Best parameters found for CatBoost: ", best_params_cat)
print("Best accuracy score for CatBoost: ", best_score_cat)
print("Test accuracy of the refit best CatBoost model: ", test_score_cat)
print(f"Best CatBoost model saved to {model_path_cat}")