In [21]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_pickle("train_engineered.pkl")

In [3]:
test_data = pd.read_pickle("test_engineered.pkl")

In [50]:
from sklearn.metrics import ConfusionMatrixDisplay,classification_report,accuracy_score,roc_auc_score, f1_score, average_precision_score

In [19]:
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
X = train_data.drop('isFraud',axis=1)

In [8]:
y = train_data['isFraud']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [10]:
from scipy.stats import randint, uniform

In [11]:
param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': [None, 10,20,30],
    'min_samples_split': randint(2,5),
    'min_samples_leaf': randint(1,5),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'class_weight': ['balanced']
}

In [12]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [13]:
import optuna

In [None]:
def objective(trial):
    # Define the hyperparameter search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators',100, 200),
        'max_depth': trial.suggest_int('max_depth',5,20),
        'min_samples_split': trial.suggest_int('min_samples_split',2,5),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf',1, 5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'random_state': 101,
        'n_jobs': -1
    }

    model = RandomForestClassifier(**params)
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=101)
    
    auc_scores = cross_val_score(model, X, y, scoring='roc_auc', cv=skf, n_jobs=-1)
    return np.mean(auc_scores)

# Create and optimize study
study = optuna.create_study(direction='maximize', study_name="rf_optuna_auc")
study.optimize(objective, n_trials=20, timeout=7200)  # 30 trials or 2 hours

# Print the best hyperparameters
print("Best Trial:")
print(study.best_trial)
print("Best AUC Score:", study.best_value)


[I 2025-06-25 04:39:48,177] A new study created in memory with name: rf_optuna_auc
[I 2025-06-25 04:46:49,018] Trial 0 finished with value: 0.8975422227983396 and parameters: {'n_estimators': 147, 'max_depth': 16, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': False, 'class_weight': None}. Best is trial 0 with value: 0.8975422227983396.
[I 2025-06-25 04:57:56,890] Trial 1 finished with value: 0.9124084532454044 and parameters: {'n_estimators': 131, 'max_depth': 18, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': True, 'class_weight': None}. Best is trial 1 with value: 0.9124084532454044.
[I 2025-06-25 05:21:14,989] Trial 2 finished with value: 0.886742231417453 and parameters: {'n_estimators': 160, 'max_depth': 13, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True, 'class_weight': None}. Best is trial 1 with value: 0.9124084532454044.
[I 2025-06-25 05:32:58,050] Trial 3 finis

Best Trial:
FrozenTrial(number=1, state=TrialState.COMPLETE, values=[0.9124084532454044], datetime_start=datetime.datetime(2025, 6, 25, 4, 46, 49, 19820), datetime_complete=datetime.datetime(2025, 6, 25, 4, 57, 56, 890305), params={'n_estimators': 131, 'max_depth': 18, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': True, 'class_weight': None}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=200, log=False, low=100, step=1), 'max_depth': IntDistribution(high=20, log=False, low=5, step=1), 'min_samples_split': IntDistribution(high=5, log=False, low=2, step=1), 'min_samples_leaf': IntDistribution(high=5, log=False, low=1, step=1), 'max_features': CategoricalDistribution(choices=('sqrt', 'log2')), 'bootstrap': CategoricalDistribution(choices=(True, False)), 'class_weight': CategoricalDistribution(choices=('balanced', None))}, trial_id=1, value=None)
Best AUC Score: 0.9124084532454044


In [26]:
best_params = study.best_params

In [27]:
rf_model = RandomForestClassifier(**best_params)

In [28]:
rf_model.fit(X_train,y_train)

In [29]:
y_pred = rf_model.predict(X_test)

In [30]:
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

In [42]:
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy Score: {accuracy:.2f}")

Accuracy Score: 0.98


In [43]:
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {roc_auc:.2f}")

ROC-AUC Score: 0.91


In [39]:
f1_score = f1_score(y_test, y_pred)
print(f"F1-Score: {f1_score:.4f}")

F1-Score: 0.5049


In [51]:
pr_auc = average_precision_score(y_test, y_pred_proba)
print(f"Average Precision Score: {pr_auc:.2f}")

Average Precision Score: 0.64


In [53]:
import joblib

In [54]:
joblib.dump(rf_model, 'best_random_forest_model.pkl')

['best_random_forest_model.pkl']