# 1. Load Dataset and Split

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import mlflow
import mlflow.sklearn

# Read the processed dataset from disk.
df = pd.read_csv('../data/processed/processed_data.csv')

# Target column.
y = df['is_high_risk']

# Feature matrix: remove the identifier columns and the target, then keep
# only the int64 / float64 columns.
X = df.drop(
    columns=[
        'CustomerId',
        'TransactionId',
        'BatchId',
        'SubscriptionId',
        'is_high_risk',
    ]
).select_dtypes(include=['int64', 'float64'])

# 70/30 split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


# 3. Train + Track Models with MLflow

In [7]:
from sklearn.model_selection import GridSearchCV

# Enable MLflow autologging for scikit-learn estimators.
mlflow.sklearn.autolog()

# Baseline model: logistic regression, tracked as its own MLflow run.
with mlflow.start_run(run_name="LogReg-v1"):
    logreg = LogisticRegression(max_iter=500)
    logreg.fit(X_train, y_train)

    # Hard class labels on the hold-out set (kept for label-based metrics).
    y_pred = logreg.predict(X_test)

    # ROC AUC measures how well the model *ranks* samples, so it must be fed
    # a continuous score. Passing hard 0/1 labels collapses the ROC curve to
    # a single operating point and misreports the metric. Column 1 of
    # predict_proba is the probability of the larger class label, which
    # roc_auc_score treats as the positive class.
    y_score = logreg.predict_proba(X_test)[:, 1]

    roc = roc_auc_score(y_test, y_score)
    mlflow.log_metric("ROC_AUC", roc)
    mlflow.sklearn.log_model(logreg, "logistic_model")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# 4. Try Random Forest with GridSearch

In [8]:
# Random forest tuned with a small grid search, tracked as its own MLflow run.
with mlflow.start_run(run_name="RF-v1"):
    rf = RandomForestClassifier(random_state=42)
    # 2 x 3 = 6 candidate settings.
    param_grid = {'n_estimators': [50, 100], 'max_depth': [5, 10, None]}

    # 3-fold cross-validation, selecting on F1, using all available cores.
    grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_rf = grid_search.best_estimator_

    # Hard class labels on the hold-out set (kept for label-based metrics).
    y_pred = best_rf.predict(X_test)

    # ROC AUC measures how well the model *ranks* samples, so it must be fed
    # a continuous score. Passing hard 0/1 labels collapses the ROC curve to
    # a single operating point and misreports the metric. Column 1 of
    # predict_proba is the probability of the larger class label, which
    # roc_auc_score treats as the positive class.
    y_score = best_rf.predict_proba(X_test)[:, 1]

    roc = roc_auc_score(y_test, y_score)
    mlflow.log_metric("ROC_AUC", roc)
    mlflow.sklearn.log_model(best_rf, "random_forest_model")


2025/06/30 20:17:29 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.
