# Implementation: Fraud Detection (Imbalanced Data)

We will simulate a credit card fraud dataset in which fraud is rare (roughly 1% of all transactions).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE

sns.set_theme()

# 1. Generate Data (99% Legit, 1% Fraud)
# `weights` sets the target class mix. make_classification also applies a
# small amount of label noise by default (flip_y), so the realised split is
# close to, but not exactly, 99/1 -- the prints below show the actual numbers.
X, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_classes=2,
    weights=[0.99, 0.01],
    random_state=42,
)

# Labels are 0 (legit) / 1 (fraud), so bincount gives per-class totals and
# the mean of y is the fraud rate.
print(f"Class Counts: {np.bincount(y)}")
print(f"Fraud Percentage: {y.mean() * 100:.2f}%")

## 2. Baseline Model (No Imbalance Handling)

In [None]:
# Stratify the split so the rare fraud class keeps the same proportion in
# both the training and the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Plain logistic regression with no imbalance handling: the reference point
# that the SMOTE and class-weight models are compared against.
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Baseline Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nBaseline Report:")
# zero_division=0: if the model never predicts fraud, precision for class 1 is
# 0/0. Report it as 0 (the same value the default yields) without triggering
# an UndefinedMetricWarning in the notebook output.
print(classification_report(y_test, y_pred, zero_division=0))
print("Notice Recall for Class 1 might be low (or 0)!")

## 3. SMOTE (Synthetic Over-sampling)

In [None]:
# Oversample the minority class with synthetic samples. Only the training
# split is resampled; X_test / y_test are left untouched so the evaluation
# below still reflects the original class imbalance.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

print(f"After SMOTE Class Counts: {np.bincount(y_train_sm)}")

# Retrain on the resampled training data (fit() returns the estimator itself),
# then score against the original, still-imbalanced test set.
model_sm = LogisticRegression().fit(X_train_sm, y_train_sm)
y_pred_sm = model_sm.predict(X_test)

print("\nSMOTE Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred_sm))

## 4. Class Weights (Cost-Sensitive)

In [None]:
# Cost-sensitive alternative to resampling: class_weight='balanced' weights
# each class inversely to its frequency in y_train, so the training data
# itself is left unchanged.
model_cw = LogisticRegression(class_weight='balanced').fit(X_train, y_train)
y_pred_cw = model_cw.predict(X_test)

print("\nClass Weight Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred_cw))