# Imports #

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier



In [4]:
df = pd.read_csv("data/labeled_intersection.csv")

# Drop non-informative columns
df = df.drop(columns=["userid"], errors='ignore')

# Separate features and target
X = df.drop(columns=["label"])
y = df["label"]

# Check resulting shapes
print(f"X shape: {X.shape}")
print(f"y distribution:\n{y.value_counts(normalize=True)}")
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


X shape: (129302, 9)
y distribution:
label
0    0.756756
1    0.243244
Name: proportion, dtype: float64


# XGBOOST #

In [4]:


# Set up the base classifier
xgb = XGBClassifier(
    objective='binary:logistic',
    tree_method='gpu_hist',  # use 'gpu_hist' if GPU is available
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum()  # handles class imbalance
)

# Hyperparameter grid
param_dist = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.03, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8],
    'gamma': [0, 1, 3],
    'min_child_weight': [1, 5, 10],
}

# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Setup the search
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='roc_auc',
    cv=cv,
    verbose=2,
    random_state=42,
    n_jobs=-1
)
# Run search
random_search.fit(X_train, y_train)

# Print best results

# Final model prediction and evaluation
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print("\nTest ROC AUC score:", roc_auc_score(y_test, y_proba))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 30 candidates, totalling 150 fits



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Test ROC AUC score: 0.8893484921845751

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.84      0.88     19570
           1       0.62      0.82      0.70      6291

    accuracy                           0.83     25861
   macro avg       0.78      0.83      0.79     25861
weighted avg       0.86      0.83      0.84     25861




    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


# Random forest #

In [5]:
# Define Random Forest
rf = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",  # handle imbalance
    n_jobs=-1
)

# Hyperparameter grid
param_dist = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomized Search
search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=30,
    scoring='roc_auc',
    cv=cv,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the model
search.fit(X_train, y_train)

# Evaluate
print("Best ROC AUC score on validation folds:", search.best_score_)
print("Best hyperparameters:", search.best_params_)

# Test set performance
best_rf = search.best_estimator_
y_pred = best_rf.predict(X_test)
y_proba = best_rf.predict_proba(X_test)[:, 1]

print("\nTest ROC AUC score:", roc_auc_score(y_test, y_proba))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 30 candidates, totalling 150 fits


KeyboardInterrupt: 

In [12]:
import gc
del search
del param_dist
# del best_model
del y_pred
del y_proba
gc.collect()

1000