In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import joblib
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

In [16]:
df = pd.read_csv('../data/processed_data.csv')

In [17]:
df.head()

Unnamed: 0,hyphenRatio,nb_dot,length,pathLengthRatio,urlEntropy,slashRatio,numbersToLettersRatio,numbersRatio,hostnameLength,nb_specialcaracters,...,bayesProba,label,dotRatio,nb_letters,nb_slash,lettersRatio,specialcaractersRatio,hostnameLengthRatio,nb_numbers,pathLength
0,0.726727,0.172673,0.623624,0.404226,0.843957,0.801802,0.713213,0.683684,0.724224,0.901401,...,0.766312,1,0.112613,0.503003,0.82032,0.11011,0.982983,0.506006,0.688188,0.491491
1,0.0,0.172673,0.243744,0.365365,0.506507,0.304805,0.873373,0.870871,0.588088,0.294294,...,0.356706,0,0.325826,0.16967,0.214214,0.143143,0.596096,0.719219,0.737738,0.348849
2,0.0,0.980981,0.792793,0.773052,0.736878,0.423924,0.0,0.0,0.588088,0.796296,...,0.862641,1,0.915243,0.834835,0.642643,0.701259,0.548549,0.26026,0.0,0.808809
3,0.828328,0.549049,0.21972,0.0,0.168168,0.0,0.808308,0.815816,0.843844,0.192192,...,0.701634,1,0.82032,0.197698,0.0,0.305305,0.327327,1.0,0.688188,0.0
4,0.0,0.549049,0.392392,0.574575,0.411411,0.228228,0.0,0.0,0.321822,0.192192,...,0.466466,0,0.645646,0.452953,0.214214,0.812312,0.158158,0.472472,0.0,0.507508


In [18]:
# 'label' is the target column
X = df.drop(columns=['label'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now we have 60% train, 20% validation, and 20% test

print(f"Train: {round(len(X_train)/len(df)*100)}%\nTest: {round(len(X_test)/len(df)*100)}%")

Train: 80%
Test: 20%


# Train

In [19]:
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [20]:
# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_child_weight': [1, 3, 5, 7],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'reg_alpha': [0, 0.1, 0.5, 1, 5, 10],
    'reg_lambda': [0, 0.1, 0.5, 1, 5, 10],
}


In [21]:
# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=30,  # Number of parameter combinations to try
    scoring='accuracy',  # Use accuracy for evaluation
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

# Perform the random search
random_search.fit(X_train, y_train) # estimated 20 sec for each iter

Fitting 3 folds for each of 30 candidates, totalling 90 fits


In [22]:
# Best parameters and model evaluation
print("Best parameters found: ", random_search.best_params_)
best_model = random_search.best_estimator_

Best parameters found:  {'subsample': 0.8, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 7, 'learning_rate': 0.2, 'gamma': 0.2, 'colsample_bytree': 0.9}


# Test

In [23]:
# Test the model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {accuracy:.2f}")

Test set accuracy: 0.99


In [24]:
# Test the model on the test set
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))
print("Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

Test Accuracy: 0.9894579394519565
Test Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     89157
           1       0.99      0.99      0.99     77983

    accuracy                           0.99    167140
   macro avg       0.99      0.99      0.99    167140
weighted avg       0.99      0.99      0.99    167140

Test Confusion Matrix:
 [[88342   815]
 [  947 77036]]


# Save the model

In [25]:
joblib.dump(best_model, "../models_saved/xgboost.joblib")

['../models_saved/xgboost.joblib']