# Model Training

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score, GridSearchCV

In [2]:
# Read the encoded train and test splits from the processed-data folder
train_path = '../data/1_processed/train_data_encoded.csv'
test_path = '../data/1_processed/test_data_encoded.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [3]:
# Separate the predictor columns from the 'CLASS' target for each split
X_train, y_train = train_data.drop('CLASS', axis=1), train_data['CLASS']
X_test, y_test = test_data.drop('CLASS', axis=1), test_data['CLASS']

## Recursive Feature Elimination (RFE) and Cross-Validation

We use RFE to rank the features, then use 5-fold cross-validation to find the best-performing feature subset.

In [4]:
# Rank the features with Recursive Feature Elimination (RFE), then choose the
# top-k subset that achieves the highest 5-fold cross-validated accuracy.
features = X_train.columns

# `use_label_encoder` is intentionally not passed: the installed XGBoost no
# longer uses it and emitted a "Parameters: { "use_label_encoder" } are not
# used." warning on every single fit.
model = xgb.XGBClassifier(eval_metric='logloss')

# Drop one feature per step until only one remains, so every feature ends up
# with its own rank (rank 1 is the single feature RFE keeps).
rfe = RFE(estimator=model, n_features_to_select=1, step=1)
rfe.fit(X_train, y_train)

# (rank, feature) pairs sorted from best rank to worst
ranked_features = sorted(zip(rfe.ranking_, features))

print("Feature Rankings:")
for rank, feature in ranked_features:
    print(f"{feature}: Rank {rank}")

# Evaluate the top-i ranked features for i = 1..n and remember the best subset.
# NOTE(review): the ranking above was fit on the full training set, so the
# cross-validated accuracies below may be somewhat optimistic.
best_score = 0
best_features = []

for i in range(1, len(features) + 1):
    selected_features = [f for rank, f in ranked_features if rank <= i]
    X_train_subset = X_train[selected_features]

    # 5-fold cross-validation for evaluation
    scores = cross_val_score(model, X_train_subset, y_train, scoring='accuracy', cv=5)
    mean_score = scores.mean()

    print(f"Subset: {selected_features}, Accuracy: {mean_score:.4f}")
    # Strict comparison: on a tie the earlier (smaller) subset is kept
    if mean_score > best_score:
        best_score = mean_score
        best_features = selected_features

print("\nBest Feature Subset:", best_features)
print(f"Best Accuracy: {best_score:.4f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Feature Rankings:
USER_HOUR_OF_WEEK: Rank 1
BROWSER_NAME: Rank 2
METRO: Rank 3
OS_FAMILY_NAME: Rank 4
AD_FORMAT: Rank 5
SUPPLY_VENDOR: Rank 6


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Subset: ['USER_HOUR_OF_WEEK'], Accuracy: 0.7180


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Subset: ['USER_HOUR_OF_WEEK', 'BROWSER_NAME'], Accuracy: 0.7457


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Subset: ['USER_HOUR_OF_WEEK', 'BROWSER_NAME', 'METRO'], Accuracy: 0.8014


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Subset: ['USER_HOUR_OF_WEEK', 'BROWSER_NAME', 'METRO', 'OS_FAMILY_NAME'], Accuracy: 0.8034


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Subset: ['USER_HOUR_OF_WEEK', 'BROWSER_NAME', 'METRO', 'OS_FAMILY_NAME', 'AD_FORMAT'], Accuracy: 0.8106


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Subset: ['USER_HOUR_OF_WEEK', 'BROWSER_NAME', 'METRO', 'OS_FAMILY_NAME', 'AD_FORMAT', 'SUPPLY_VENDOR'], Accuracy: 0.8166

Best Feature Subset: ['USER_HOUR_OF_WEEK', 'BROWSER_NAME', 'METRO', 'OS_FAMILY_NAME', 'AD_FORMAT', 'SUPPLY_VENDOR']
Best Accuracy: 0.8166


We achieve the highest accuracy of 0.8166 when using all features.

## Hyperparameters Tuning

**This cell takes about 7 minutes to run!**

In [5]:

# Exhaustive grid search over XGBoost hyperparameters, scored by ROC-AUC
# with 5-fold cross-validation on the training set.

# Base estimator. `use_label_encoder` is intentionally not passed: the
# installed XGBoost does not use it and warned about it on every fit.
model = xgb.XGBClassifier(eval_metric='logloss')

# 3 * 3 * 3 * 2 * 2 * 3 = 324 candidate combinations (1620 fits with cv=5)
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'n_estimators': [100, 200, 500]
}

# n_jobs=-1 runs the candidate fits in parallel on all available cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated ROC-AUC
print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC Score:", grid_search.best_score_)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 500, 'subsample': 1.0}
Best ROC-AUC Score: 0.9137768794630636


The grid search determined the following best hyperparameters:

Best Parameters: {
    
    'colsample_bytree': 0.8, 
    'learning_rate': 0.2, 
    'max_depth': 7, 
    'min_child_weight': 1, 
    'n_estimators': 500, 
    'subsample': 1.0
    }
    
Best ROC-AUC Score: 0.9137768794630636

In [6]:
# Train the final classifier. The hyperparameters are hard-coded copies of the
# best parameters reported by the grid search.
# `use_label_encoder` is intentionally not passed: the installed XGBoost does
# not use it and only emitted a warning about it.
model = xgb.XGBClassifier(
    colsample_bytree=0.8,
    learning_rate=0.2,
    max_depth=7,
    min_child_weight=1,
    n_estimators=500,
    subsample=1.0,
    eval_metric='logloss',
)

model.fit(X_train, y_train)

# Predict class labels for the held-out test set
y_pred = model.predict(X_test)

Parameters: { "use_label_encoder" } are not used.



In [7]:
# Save the test set together with the predicted class for later evaluation.

# The output file is named after the model's class
model_name = type(model).__name__

# Attach the predictions on a new frame rather than mutating `test_data`:
# adding the column in place would put `CLASS_pred` into `X_test` if the
# feature/target split cell were re-run afterwards.
test_results = test_data.assign(CLASS_pred=y_pred)
test_results.to_csv(f'../data/3_evaluation/{model_name}_tunned_results.csv', index=False)