# Imports #

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier



In [3]:
# Read the labelled dataset and discard the non-informative identifier column
# in one step (the drop is a no-op when "userid" is not present).
df = pd.read_csv("../data/labeled_intersection.csv").drop(
    columns=["userid"], errors='ignore'
)

# Feature matrix = every remaining column except the target; target = "label"
X = df.drop(columns=["label"])
y = df["label"]

# Report the feature-matrix dimensions and the relative class frequencies
print(f"X shape: {X.shape}")
print(f"y distribution:\n{y.value_counts(normalize=True)}")

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


X shape: (129302, 2)
y distribution:
label
0    0.756756
1    0.243244
Name: proportion, dtype: float64


# Random forest #

In [4]:
# Base estimator: seeded random forest running on all cores, with
# class_weight="balanced" to handle the class imbalance.
rf = RandomForestClassifier(
    n_jobs=-1,
    class_weight="balanced",
    random_state=42,
)

# Candidate values the randomized search draws hyperparameters from
param_dist = dict(
    n_estimators=[100, 300, 500],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Shuffled, stratified 5-fold splitter with a fixed seed
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Randomized search: 30 sampled configurations, each scored by ROC AUC
# across the folds above.
search = RandomizedSearchCV(
    rf,
    param_dist,
    cv=cv,
    scoring='roc_auc',
    n_iter=30,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

# Run the search on the training partition only
search.fit(X_train, y_train)

# Cross-validated result of the winning configuration
print("Best ROC AUC score on validation folds:", search.best_score_)
print("Best hyperparameters:", search.best_params_)

# Score the best estimator found by the search on the held-out test set:
# positive-class probabilities feed the ROC AUC, hard labels feed the report.
best_rf = search.best_estimator_
y_proba = best_rf.predict_proba(X_test)[:, 1]
y_pred = best_rf.predict(X_test)

print("\nTest ROC AUC score:", roc_auc_score(y_test, y_proba))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best ROC AUC score on validation folds: 0.9050222545866202
Best hyperparameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10}

Test ROC AUC score: 0.905178533673471

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.83      0.88     19570
           1       0.62      0.83      0.71      6291

    accuracy                           0.83     25861
   macro avg       0.78      0.83      0.80     25861
weighted avg       0.86      0.83      0.84     25861



# Save random forest model #

In [5]:
import joblib
from pathlib import Path

# Save the trained model.
# joblib.dump does not create missing directories, so make sure the target
# folder exists first; otherwise a fresh checkout fails with FileNotFoundError.
model_path = 'trained-model/random_forest_model.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# Last expression of the cell: displays the list of files joblib wrote.
joblib.dump(best_rf, model_path)

['random_forest_model.joblib']

# Convert the random forest joblib model to ONNX for GPU usage #

In [None]:
import joblib
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# === Load the model ===
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files produced by this project.
model = joblib.load('../Numeric_Features_model/trained-model/random_forest_model.joblib')

# === Define input shape ===
# Take the feature count from the fitted estimator instead of hard-coding it,
# so the exported graph always matches the model that was actually trained.
# (Currently 2 features; the original note names them followers and
# avg_retweetcount.) The leading None leaves the batch dimension dynamic.
initial_type = [('input', FloatTensorType([None, model.n_features_in_]))]

# === Convert the model ===
onnx_model = convert_sklearn(model, initial_types=initial_type)

# === Save the ONNX model ===
with open("../Numeric_Features_model/trained-model/rf_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

In [12]:
import gc

# Free the large search artefacts once the best model has been extracted.
# Names are removed with pop(..., None) rather than bare `del`, so the cell is
# idempotent: re-running it, or running it before the training cell, no longer
# raises NameError.
for _name in ("search", "param_dist", "y_pred", "y_proba"):
    globals().pop(_name, None)
del _name

# Last expression of the cell: shows how many unreachable objects were found.
gc.collect()

1000