<a href="https://colab.research.google.com/github/Mahmudasief/aml-false-positive-reduction-hitl/blob/main/notebooks/02_baseline_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Baseline models for AML false positive reduction
# Dataset: Kaggle Credit Card Fraud (European cardholders)
# Goal: establish BEFORE picture (no tuning, no tricks)

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

RANDOM_STATE = 42

In [4]:
df = pd.read_csv("/content/creditcard.csv")
df.shape

(284807, 31)

In [5]:
# Check class imbalance
df['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


In [6]:
# Separate features and target
X = df.drop(columns=['Class'])
y = df['Class']

X.shape, y.shape

((284807, 30), (284807,))

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape, y_train.value_counts(), y_test.value_counts()

((199364, 30),
 (85443, 30),
 Class
 0    199020
 1       344
 Name: count, dtype: int64,
 Class
 0    85295
 1      148
 Name: count, dtype: int64)

In [8]:
# Baseline Logistic Regression (NO tuning)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

lr = LogisticRegression(
    max_iter=1000,
    random_state=42
)

lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))

[[85276    19]
 [   51    97]]
              precision    recall  f1-score   support

           0     0.9994    0.9998    0.9996     85295
           1     0.8362    0.6554    0.7348       148

    accuracy                         0.9992     85443
   macro avg     0.9178    0.8276    0.8672     85443
weighted avg     0.9991    0.9992    0.9991     85443



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Baseline Random Forest (NO tuning)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, digits=4))

[[85290     5]
 [   36   112]]
              precision    recall  f1-score   support

           0     0.9996    0.9999    0.9998     85295
           1     0.9573    0.7568    0.8453       148

    accuracy                         0.9995     85443
   macro avg     0.9784    0.8783    0.9225     85443
weighted avg     0.9995    0.9995    0.9995     85443

