In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px  # Interactive plots (optional)
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

# EDA & profiling

# Interactive EDA
# import pandasgui
# from pandasgui import show
# show(your_dataframe)  # Opens an interactive GUI

# Auto-EDA
# from dataprep.eda import create_report
# create_report(your_dataframe).show()

# Lightweight Profiling 
import sweetviz as sv
# sv.analyze(your_dataframe).show_html()

# from pandas_profiling import ProfileReport  # Auto-EDA (install: `pip install ydata-profiling`)
import missingno as msno  # Missing data visualization (install: `pip install missingno`)

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
sns.set(style="darkgrid",font_scale=1.5)
pd.set_option("display.max.columns",None)
pd.set_option("display.max.rows",None)
# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)  # Suppress user warnings
warnings.filterwarnings("ignore", category=FutureWarning)  # Suppress future warnings

# Suppress specific warnings for LGBMClassifier and CatBoostClassifier
import logging
logging.getLogger("catboost").setLevel(logging.ERROR)  # Suppress CatBoost logs
logging.getLogger("lightgbm").setLevel(logging.ERROR)  # Suppress LightGBM logs


In [None]:
# Scaling/normalization
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
    LabelEncoder,
    OneHotEncoder,
    PowerTransformer,
    label_binarize
)

# Splitting data
from sklearn.model_selection import train_test_split, StratifiedKFold

# Imputation (handling missing values)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Outlier detection
from sklearn.ensemble import IsolationForest
from scipy import stats  # For Z-score, IQR


In [None]:
# Classic ML
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,  # Faster alternative to CatBoost
    AdaBoostClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
    VotingClassifier,
    StackingClassifier
)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier  # Works on Python 3.13.2
from lightgbm import LGBMClassifier  # Works on Python 3.13.2
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# Neural Networks (optional)
# import tensorflow as tf  # or `pip install tensorflow-cpu`
# from keras.models import Sequential
# from keras.layers import Dense, Dropout

In [None]:
# Metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    mean_squared_error,
    roc_curve
)


In [None]:
# Cross-validation
from sklearn.model_selection import cross_val_score, GridSearchCV

# Calibration
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

# Parallel processing
from joblib import Parallel, delayed


from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN

# Warnings (to clean output)
import warnings
warnings.filterwarnings("ignore")

# Time tracking
import time
from tqdm import tqdm  # Progress bars (install: `pip install tqdm`)

In [None]:
# pd.set_option("display.max_columns", None)
import os

# Location of the transactions CSV. Defaults to the original local path; set the
# FRAUD_CSV_PATH environment variable to load the file from elsewhere without
# editing this cell.
DATA_PATH = os.environ.get("FRAUD_CSV_PATH", "C:/Projects/Fraud Transaction Prediction/Fraud.csv")
df = pd.read_csv(DATA_PATH)
df.head(10)

In [None]:
# Cross-validation
from sklearn.model_selection import cross_val_score, GridSearchCV

# Calibration
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

# Parallel processing
from joblib import Parallel, delayed


from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN

# Warnings (to clean output)
import warnings
warnings.filterwarnings("ignore")

# Time tracking
import time
from tqdm import tqdm  # Progress bars (install: `pip install tqdm`)

## EDA

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Column labels of the loaded frame
df.columns

In [None]:
# Per-column dtype and non-null summary
df.info()

In [None]:
# Distinct values taken by the 'step' column
unique_values = df['step'].unique()
print(unique_values) 

#### Checking for Missing Values

In [None]:
# Null count per column, shown as a one-column table
df.isna().sum().to_frame(name="Total No. of Missing Values")

#### Checking for Duplicates 

In [None]:
# Number of rows that are exact duplicates of an earlier row
print("Duplicate Values =",df.duplicated().sum())

#### Checking for numeric data columns 

In [None]:
# Numeric-dtype columns only; reused by the correlation cells further down
numeric_data = df.select_dtypes(include=[np.number])
numeric_data.head()

#### Checking for categorical data columns

In [None]:
# Non-numeric columns, i.e. everything numeric_data leaves out.
# Alternative: df.select_dtypes('object').columns
categorical_data = df.select_dtypes(exclude=[np.number])
categorical_data.head()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def plot_correlation_matrix(df, size=8):
    """
    Draw an annotated heatmap of the pairwise correlations between numeric columns.

    Parameters:
    df : pandas.DataFrame - non-numeric columns are ignored
    size : int - width and height of the figure, in inches
    """
    # Correlate the numeric columns only
    corr_matrix = df.select_dtypes(include=[np.number]).corr()

    # Heatmap appearance: two-decimal annotations, diverging palette centred on zero
    heatmap_options = dict(
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=.5,
    )

    plt.figure(figsize=(size, size))
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.title("Correlation Matrix")
    plt.tight_layout()
    plt.show()

##### It looks like we have categorical data that we need to take care of

### further exploration and visualizations

In [None]:
# Correlation heatmap of the numeric columns of the loaded frame.
# (The previous call used an undefined function name and an undefined frame.)
plot_correlation_matrix(df, 8)

In [None]:
# Pairwise correlation table of the numeric columns
numeric_data.corr()

In [None]:
# Interactive (plotly) heatmap of the same correlation table, with cell values shown
fig = px.imshow(numeric_data.corr(),text_auto=True,aspect="auto")
fig.show()

In [None]:
# Row count per transaction type, most frequent first
df['type'].value_counts(ascending=False)

In [None]:
# Column names as a plain Python list
print(df.columns.tolist())

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Histogram of every numeric column. The figure size is set by df.hist itself;
# the plt.figure call that used to follow only opened an extra, empty figure.
df.hist(figsize=(15, 10))
plt.suptitle("Data Distribution", fontsize=16)
plt.show()

In [None]:
# Class balance of the target. The figure is created before plotting so the size
# applies to the chart, and it is saved before plt.show(), which releases the
# figure (saving afterwards wrote a blank image).
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(x='isFraud', data=df, palette='hls', ax=ax)
ax.set_title("Imbalanced Fraud vs. Non-Fraud Distribution", fontsize=14)
fig.savefig('count_plot')
plt.show()

In [None]:
# Rows per class (counted through the 'amount' column), drawn as a donut chart
ratio_df = df.groupby('isFraud')[['amount']].count()
ratio_df.plot.pie(
    y='amount',
    figsize=(5, 5),
    labels=['0: nonfraud', '1: fraud'],
    autopct='%.1f%%',
    startangle=120,
    wedgeprops={'width': 0.75},
    title='Imbalanced Fraud vs. Non-Fraud Distribution',
)

In [None]:
# Box plots of the raw numeric transaction columns on one shared axis.
# The figure is created first and handed to pandas; creating it after plotting
# only produced an extra, empty figure.
fig, ax = plt.subplots(figsize=(5, 5))
df.boxplot(column=["step","amount","oldbalanceOrg","newbalanceOrig","oldbalanceDest","newbalanceDest"], ax=ax)
ax.set_title("Box Plot Analysis", fontsize=16)
plt.xticks(rotation=90, fontsize=12)
plt.show()

In [None]:
# Association between the quantitative variable 'oldbalanceOrg' and the binary target 'isFraud'
import scipy.stats as stats
from sklearn.metrics import roc_auc_score

# 1. Point-Biserial Correlation
corr, p_value = stats.pointbiserialr(df['isFraud'], df['oldbalanceOrg'])
print(f"Point-Biserial Correlation: {corr:.3f} (p-value: {p_value:.3e})")

# 2. Welch's t-test (unequal variances) on the balances of fraud vs non-fraud rows
fraud_bal = df.loc[df['isFraud'] == 1, 'oldbalanceOrg']
nonfraud_bal = df.loc[df['isFraud'] == 0, 'oldbalanceOrg']
t_stat, p_val = stats.ttest_ind(fraud_bal, nonfraud_bal, equal_var=False)
print(f"t-test: Mean difference = {fraud_bal.mean() - nonfraud_bal.mean():.1f} (p-value: {p_val:.3e})")

# 3. AUC-ROC Evaluation: the raw balance is used directly as the ranking score
auc = roc_auc_score(df['isFraud'], df['oldbalanceOrg'])
print(f"AUC: {auc:.3f} (0.5 = random, 1 = perfect)")

In [None]:
# One-way ANOVA of 'oldbalanceOrg' between the fraud and non-fraud groups
A_test=stats.f_oneway(df['oldbalanceOrg'][df['isFraud']==1],df['oldbalanceOrg'][df['isFraud']==0])
print(A_test)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Original balance per class on a log axis (outliers hidden)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='isFraud', y='oldbalanceOrg', data=df, showfliers=False, ax=ax)
ax.set_yscale('log')  # Use if data is highly skewed
ax.set_title("Distribution of oldbalanceOrg by Fraud Status")
ax.set_xlabel("Is Fraud? (0=No, 1=Yes)")
ax.set_ylabel("Original Balance (log scale)")
plt.show()

In [None]:
# Quartiles and 90th percentile of the original balance, per class
print("Fraudulent transactions balance percentiles:")
print(df.loc[df['isFraud'] == 1, 'oldbalanceOrg'].quantile([0.25, 0.5, 0.75, 0.9]))

print("\nNon-fraudulent transactions balance percentiles:")
print(df.loc[df['isFraud'] == 0, 'oldbalanceOrg'].quantile([0.25, 0.5, 0.75, 0.9]))

In [None]:
# Amount summary per class; nonfraud_stats is reused later for thresholding
fraud_stats = df.loc[df['isFraud'] == 1, 'amount'].describe(
    percentiles=[.25, .5, .75, .9, .95, .99]
)
nonfraud_stats = df.loc[df['isFraud'] == 0, 'amount'].describe(
    percentiles=[.25, .5, .75, .9, .95, .99]
)

print("Fraudulent Transactions Amount Stats:")
print(fraud_stats.loc[['min', '25%', '50%', '75%', '90%', '95%', '99%', 'max']])

print("\nNon-Fraudulent Transactions Amount Stats:")
print(nonfraud_stats.loc[['min', '25%', '50%', '75%', '90%', '95%', '99%', 'max']])

In [None]:
def get_dynamic_thresholds(df, feature, is_fraud, upper_percentile=0.95, lower_percentile=0.05):
    """Summarise one feature within a single class of transactions.

    Rows where ``isFraud == is_fraud`` are selected, and the function returns a
    dict with the 'upper' and 'lower' quantiles of ``feature`` (at the given
    percentiles) and its 'mean' over those rows.
    """
    class_values = df.loc[df['isFraud'] == is_fraud, feature]

    thresholds = {}
    thresholds['upper'] = class_values.quantile(upper_percentile)
    thresholds['lower'] = class_values.quantile(lower_percentile)
    thresholds['mean'] = class_values.mean()
    return thresholds

# Amount thresholds for each class, keyed by a readable label
amount_thresholds = {
    label: get_dynamic_thresholds(df, 'amount', is_fraud=flag)
    for label, flag in (('fraud', 1), ('nonfraud', 0))
}
print("\nAmount Thresholds:")
print(amount_thresholds)

In [None]:
# Thresholds taken from the non-fraud population (adjust as needed):
# 95th percentile of amount and 90th percentile of the original balance.
# NOTE(review): these are computed from labelled rows of the full dataset, before
# any train/test split -- confirm that this leakage is acceptable.
high_amount_thresh = nonfraud_stats['95%']
balance_change_thresh = df[df['isFraud']==0]['oldbalanceOrg'].quantile(0.9)

# Engineered features with dynamic thresholds
df['high_amount_flag'] = (df['amount'] > high_amount_thresh).astype(int)
df['suspicious_balance_change'] = (
    (df['oldbalanceOrg'] - df['newbalanceOrig']) > balance_change_thresh
).astype(int)

# Ratio-based feature with smoothing
df['amount_to_balance_ratio'] = df['amount'] / (df['oldbalanceOrg'] + 1)  # +1 prevents divide-by-zero

# (This block was pasted twice; the second, identical copy has been removed.)
# Time-sensitive features (if 'step' is in hours): draft kept commented out below.

# # Time features 
# # Feature 1: Hour of day (1-24, where 1 = 00:00-00:59 of any day)
# df['hour_of_day'] = ((df['step'] - 1) % 24) + 1  # Converts to 1-24 range

# # Feature 2: Day of simulation (1-31, since 743 hours ≈ 30.96 days)
# df['day'] = ((df['step'] - 1) // 24) + 1  # 1-based day count

# # Feature 3: Day of week (0=Monday to 6=Sunday)
# df['day_of_week'] = ((df['step'] - 1) // 24) % 7  # 0-based weekday

# # Feature 4: Weekend flag (1 if Saturday/Sunday)
# df['is_weekend'] = ((df['day_of_week'] == 5) | (df['day_of_week'] == 6)).astype(int)

# # Feature 5: Transaction burst (1 if same user transacts again within 1 hour)
# df['txn_burst'] = (df.groupby('nameOrig')['step'].diff() == 1).astype(int)

# # Feature 6: Hours since last transaction (NaN for first txn per user)
# df['hours_since_last_txn'] = df.groupby('nameOrig')['step'].diff()

# # Feature 7: Transactions per user in last 24 hours (rolling window)
# df['txn_count_24h'] = df.groupby('nameOrig')['step'].transform(
#     lambda x: x.rolling(24, min_periods=1).count()
# )

In [None]:
# Check fraud rates in new features
print("\nFraud Rates by Engineered Features:")
print(df.groupby('high_amount_flag')['isFraud'].mean())
print(df.groupby('suspicious_balance_change')['isFraud'].mean())

# Visual confirmation
import matplotlib.pyplot as plt

# Hour of day (1-24) is derived from 'step' right here: df has no 'hour_of_day'
# column at this point in a top-to-bottom run, so grouping on it raised KeyError.
hour_of_day = (((df['step'] - 1) % 24) + 1).rename('hour_of_day')

fig, (ax_ratio, ax_hour) = plt.subplots(1, 2, figsize=(12, 4))
# Pass the axis explicitly: with by=..., DataFrame.boxplot otherwise draws on a
# figure of its own and leaves the left panel empty.
df.boxplot(column='amount_to_balance_ratio', by='isFraud', showfliers=False, ax=ax_ratio)
ax_ratio.set_ylim(0, 5)  # Focus on 0-500% range
df.groupby(hour_of_day)['isFraud'].mean().plot(ax=ax_hour)
ax_hour.set_title("Fraud Rate by Hour of Day")
fig.tight_layout()
plt.show()

In [None]:
def engineer_features(df, amount_percentile=0.95, balance_percentile=0.9):
    """Add ratio and flag features to ``df`` in place and return the same frame.

    Cut-offs are quantiles of the non-fraud rows (``isFraud == 0``):
    ``amount_percentile`` of 'amount' and ``balance_percentile`` of
    'oldbalanceOrg'. Columns written: 'amount_to_balance', 'high_amount_flag',
    'balance_change_abs', 'balance_change_ratio', 'suspicious_withdrawal',
    'large_cashout'.
    """
    # Cut-offs from the non-fraud population
    legit = df.loc[df['isFraud'] == 0]
    amount_cutoff = legit['amount'].quantile(amount_percentile)
    balance_cutoff = legit['oldbalanceOrg'].quantile(balance_percentile)

    # Transaction size relative to the opening balance (+1 avoids division by zero)
    df['amount_to_balance'] = df['amount'] / (df['oldbalanceOrg'] + 1)
    df['high_amount_flag'] = df['amount'].gt(amount_cutoff).astype(int)

    # How much the originating balance dropped, absolute and relative
    df['balance_change_abs'] = df['oldbalanceOrg'] - df['newbalanceOrig']
    df['balance_change_ratio'] = df['balance_change_abs'] / (df['oldbalanceOrg'] + 1)

    # Large drop that also consumes more than half of the opening balance
    big_drop = df['balance_change_abs'].gt(balance_cutoff)
    over_half = df['amount_to_balance'].gt(0.5)
    df['suspicious_withdrawal'] = (big_drop & over_half).astype(int)

    # Cash-outs taking more than 70% of the opening balance
    is_cash_out = df['type'].eq('CASH_OUT')
    df['large_cashout'] = (is_cash_out & df['amount_to_balance'].gt(0.7)).astype(int)

    return df

# Apply to your dataframe
# engineer_features adds its columns to df in place and returns the same frame
df = engineer_features(df)

In [None]:

# Time features
# NOTE(review): the gap-based features below assume rows are in chronological
# order within each account -- confirm the CSV is sorted by 'step'.

def _txn_count_in_window(steps, window_hours=24):
    """For each row, count the group's transactions whose step is in (step - window_hours, step]."""
    values = steps.to_numpy()
    ordered = np.sort(values)
    first = np.searchsorted(ordered, values - window_hours, side='right')
    last = np.searchsorted(ordered, values, side='right')
    return pd.Series(last - first, index=steps.index)

# Feature 1: Hour of day (1-24, where 1 = 00:00-00:59 of any day)
df['hour_of_day'] = ((df['step'] - 1) % 24) + 1  # Converts to 1-24 range

# Feature 2: Day of simulation (1-31, since 743 hours ≈ 30.96 days)
df['day'] = ((df['step'] - 1) // 24) + 1  # 1-based day count

# Feature 3: Day of week (0-6); treating 0 as Monday assumes the simulation
# starts on a Monday -- TODO confirm
df['day_of_week'] = ((df['step'] - 1) // 24) % 7  # 0-based weekday

# Feature 4: Weekend flag (1 if Saturday/Sunday)
df['is_weekend'] = ((df['day_of_week'] == 5) | (df['day_of_week'] == 6)).astype(int)

# Hours between consecutive transactions of the same account (NaN for the first one);
# computed once and shared by features 5 and 6.
step_gap = df.groupby('nameOrig')['step'].diff()

# Feature 5: Transaction burst (1 if same user transacts again within 1 hour).
# A gap of 0 (same hour) counts too; the earlier `== 1` test missed those.
df['txn_burst'] = step_gap.between(0, 1).astype(int)

# Feature 6: Hours since last transaction (NaN for first txn per user)
df['hours_since_last_txn'] = step_gap

# Feature 7: Transactions per user in last 24 hours. A time window on 'step' is
# used because rolling(24) counted the last 24 rows regardless of elapsed time.
df['txn_count_24h'] = df.groupby('nameOrig')['step'].transform(_txn_count_in_window)

In [None]:
# Preview the frame with the engineered columns added
df.head(10)

#### The features below are for exploration only; the purpose of this exercise is to learn, get a working run, and keep things simple without going into extreme detail

In [None]:
# # Feature engineering
# # Set threshold at 90th percentile of non-fraud transactions
# balance_threshold = df[df['isFraud']==0]['oldbalanceOrg'].quantile(0.9)
# txn_ratio_threshold = 0.5  # 50% of balance

# df['high_risk_balance'] = (df['oldbalanceOrg'] > balance_threshold).astype(int)
# df['suspicious_withdrawal'] = (
#     (df['oldbalanceOrg'] > balance_threshold) & 
#     (df['amount'] > txn_ratio_threshold * df['oldbalanceOrg'])
# )

In [None]:
# # Flag transactions in top 5% of amounts
# amount_threshold = df['amount'].quantile(0.95)
# df['large_txn_flag'] = (df['amount'] > amount_threshold).astype(int)

# # Combined flag
# df['high_risk_combo'] = (
#     df['high_risk_balance'] | 
#     df['large_txn_flag']
# ).astype(int)

In [None]:
# # Feature Engineering 
# # High-risk balance threshold (adjust based on quartiles)  
# df['high_risk_balance'] = (df['oldbalanceOrg'] > 1_000_000).astype(int)  

# # Interaction with transaction amount  
# df['large_balance_large_txn'] = (df['oldbalanceOrg'] > 500_000) & (df['amount'] > 0.9 * df['oldbalanceOrg'])  

In [None]:
# # List of all features we created 
# engineered_features = [
#     'high_risk_balance',
#     'large_balance_large_txn', 
#     'large_txn_flag',
#     'high_risk_combo',
#     'suspicious_withdrawal'
# ]

# # Safely remove columns
# df = df.drop(columns=[col for col in engineered_features if col in df.columns], errors='ignore')

# # Verify removal
# print("Current columns:", df.columns.tolist())

#### First: Encode Categorical Variables
(Convert text → numbers before handling imbalance)

In [None]:
# One-hot encode 'type' into 'type_*' columns; df is reassigned, so this cell is
# meant to be run once per kernel session.
df = pd.get_dummies(df, columns=['type'], prefix='type') # handling the column 'type' first 
# Verify encoding
print(df.columns)

#### Handling High-Cardinality ID Columns (nameOrig, nameDest)
These columns appear to be account identifiers (the originating and destination party of each transaction). Since they have extremely high cardinality (millions of unique values), do NOT one-hot encode them. Instead, drop them from the feature matrix:

In [None]:
# Feature matrix: everything except the target and the two ID columns
X = df.drop(columns=['isFraud', 'nameOrig', 'nameDest'])
# Target vector
y = df['isFraud']

# Check all remaining features are numeric
print(X.dtypes)

In [None]:
# Randomly sample up to 100K rows for feature analysis. A seeded generator keeps
# the sample reproducible (the model below is seeded too), and the size is capped
# at len(X) so a smaller dataset does not raise.
rng = np.random.default_rng(42)
sample_idx = rng.choice(len(X), size=min(100000, len(X)), replace=False)
X_sample = X.iloc[sample_idx]
y_sample = y.iloc[sample_idx]

rf = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
rf.fit(X_sample, y_sample)

In [None]:
# Constrained forest fitted on the full X, y; replaces the `rf` fitted on the sample
rf = RandomForestClassifier(
    n_estimators=30,       # Reduced from 50
    max_depth=5,           # Shallower trees
    min_samples_leaf=100,  # Larger leaf nodes
    n_jobs=-1,             # Use all CPU cores
    random_state=42
)
rf.fit(X, y)  # Now runs much faster

In [None]:
# Faster permutation importance (works with partial data)
from sklearn.inspection import permutation_importance

# Permutation importance of the current `rf`, scored on the 100K-row sample
result = permutation_importance(
    rf,
    X_sample,
    y_sample,
    n_repeats=3,
    random_state=42,
    n_jobs=-1,
)

# Feature positions ordered from most to least important
sorted_idx = result.importances_mean.argsort()[::-1]
top_names = X.columns[sorted_idx][:15]
top_scores = result.importances_mean[sorted_idx][:15]

plt.barh(top_names, top_scores)
plt.title("Permutation Importance")
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Unconstrained forest on the full data for impurity-based importances.
# n_jobs=-1 trains the trees in parallel, as the other forests in this notebook do;
# random_state still fixes the result.
rf = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
rf.fit(X, y)  # Now works with numeric-only data

# Plot importance
pd.Series(rf.feature_importances_, index=X.columns).nlargest(15).plot(kind='barh')
plt.title("Top Predictive Features")
plt.show()

### Check if the data is imbalanced

In [None]:
# Absolute number of rows per class
fraud_count = df['isFraud'].value_counts()
fraud_count

In [None]:
# Share of each class, in percent
fraud_percentage = df['isFraud'].value_counts(normalize=True) * 100
fraud_percentage

#### it looks like the data is highly imbalanced

In [None]:
# Final look at the frame before splitting
df.head(10)

#### First, Split Your Data (Critical!)

In [None]:
# Model inputs. Besides the target, the string ID columns must go: PowerTransformer
# and SMOTE only accept numeric data. 'hours_since_last_txn' is NaN for an account's
# first transaction and SMOTE rejects NaN, so those rows get the sentinel -1
# ("no previous transaction").
x = (
    df.drop(['isFraud', 'nameOrig', 'nameDest'], axis=1)
      .fillna({'hours_since_last_txn': -1})
)
y = df['isFraud']

In [None]:
# 70/30 split, stratified on the target so both parts keep the same fraud rate
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42,stratify=y)

In [None]:
# Shapes of the four split parts
for label, part in (
    ("x_train - >  ", x_train),
    ("y_train - >  ", y_train),
    ("x_test  - >  ", x_test),
    ("y_test  - >  ", y_test),
):
    print(label, part.shape)

#### Scale/transform

In [None]:
# Yeo-Johnson power transform (accepts zero and negative values)
pt = PowerTransformer(method='yeo-johnson')

In [None]:
# Fit the transform on the training part only, then apply it unchanged to the test part
x_train_scaled = pt.fit_transform(x_train)
x_test_scaled = pt.transform(x_test)

#### handle Imbalance 

In [None]:
# encoder = {}
# for i in df.select_dtypes('object').columns:
#     encoder[i] = LabelEncoder()
#     df[i] = encoder[i].fit_transform(df[i])

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy=0.3, random_state=42)  # 3:10 fraud/non-fraud ratio
# Resample the *transformed* training features: every model below is evaluated on
# x_test_scaled, so it must be trained in the same feature space.
X_res, y_res = smote.fit_resample(x_train_scaled, y_train)

print("Resampled class counts:", y_res.value_counts())

### Train and Predict

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

def evaluate_model(model, X_train, y_train, X_test, y_test, name):
    """Fit ``model``, score it on the test split and plot its confusion matrix.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``. ``predict_proba`` (preferred)
        or ``decision_function`` is used for ROC AUC when available.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels; labels are expected to be
        0 (non-fraud) / 1 (fraud).
    name : str
        Label used in the plot title and in the 'Model' field of the result.

    Returns
    -------
    dict or None
        Fraud-class recall / precision / F1, ROC AUC (None if the model exposes
        no continuous score) and the false-positive / false-negative counts.
        None is returned, after printing the error, if any step raises.
    """
    try:
        # Train
        model.fit(X_train, y_train)

        # Predict
        y_pred = model.predict(X_test)

        # Continuous fraud score for ROC AUC: probability of class 1 if available,
        # otherwise the decision margin.
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test)
        else:
            y_score = None

        # Metrics. labels=[0, 1] pins the report keys and the 2x2 matrix shape even
        # when a class is absent from y_test/y_pred, so the lookups below cannot fail.
        report = classification_report(
            y_test, y_pred, labels=[0, 1], output_dict=True, zero_division=0
        )
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

        # Plot
        plt.figure(figsize=(6, 4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=['Predicted Non-Fraud', 'Predicted Fraud'],
                   yticklabels=['Actual Non-Fraud', 'Actual Fraud'])
        plt.title(f'{name}\nFP: {cm[0,1]} | FN: {cm[1,0]}')
        # Outline the two error cells: false positives in red, false negatives in orange
        plt.gca().add_patch(plt.Rectangle((1, 0), 1, 1, fill=False, edgecolor='red', lw=2))
        plt.gca().add_patch(plt.Rectangle((0, 1), 1, 1, fill=False, edgecolor='orange', lw=2))
        plt.show()

        # Return metrics
        return {
            'Model': name,
            'Recall (Fraud)': report['1']['recall'],
            'Precision (Fraud)': report['1']['precision'],
            'F1 (Fraud)': report['1']['f1-score'],
            'ROC AUC': roc_auc_score(y_test, y_score) if y_score is not None else None,
            'Type I (FP)': cm[0, 1],
            'Type II (FN)': cm[1, 0]
        }
    except Exception as e:
        # Best effort: one failing model must not stop the comparison run
        print(f"❌ Failed for {name}: {str(e)}")
        return None

In [None]:
results = []  # one metrics dict (or None on failure) per evaluated model
from xgboost import XGBClassifier

# Weight positives by the class ratio of the data the model is actually fitted on
# (the SMOTE-resampled labels). Using the raw y_train ratio here compensated for
# the imbalance a second time.
neg_count = int((y_res == 0).sum())
pos_count = int((y_res == 1).sum())
xgb = XGBClassifier(
    scale_pos_weight=neg_count / pos_count,
    n_estimators=100,
    max_depth=6,
    tree_method='hist',  # Faster than exact
    eval_metric='aucpr',
    n_jobs=-1,
    random_state=42
)
results.append(evaluate_model(xgb, X_res, y_res, x_test_scaled, y_test, "XGBoost (Fast)"))
del xgb  # drop the reference to the fitted model

In [None]:
from lightgbm import LGBMClassifier
# LightGBM trained on the resampled data and scored on the transformed test set
lgbm = LGBMClassifier(
    class_weight='balanced',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.05,
    feature_fraction=0.8,
    n_jobs=-1,
    random_state=42
)
# evaluate_model returns None on failure; None entries are filtered out when results are tabulated
results.append(evaluate_model(lgbm, X_res, y_res, x_test_scaled, y_test, "LightGBM"))
del lgbm  # drop the reference to the fitted model

In [None]:
# 3. Random Forest (Balanced)
# ==============================================
from sklearn.ensemble import RandomForestClassifier
# Depth-limited forest; each tree is fitted on 80% of the rows (max_samples)
rf = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=50,
    max_depth=7,
    max_samples=0.8,
    n_jobs=-1,
    random_state=42
)
results.append(evaluate_model(rf, X_res, y_res, x_test_scaled, y_test, "Random Forest"))
del rf  # drop the reference to the fitted model

In [None]:
# 4. Gradient Boosting (Lightweight)
# ==============================================
from sklearn.ensemble import GradientBoostingClassifier
# Small gradient-boosting model: 50 depth-3 trees, each on an 80% row subsample
gb = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    random_state=42
)
results.append(evaluate_model(gb, X_res, y_res, x_test_scaled, y_test, "Gradient Boosting"))
del gb  # drop the reference to the fitted model

In [None]:
# 5. Logistic Regression (Fast)
# ==============================================
from sklearn.linear_model import LogisticRegression
# L1-penalised logistic regression. n_jobs is not passed: it has no effect with
# the 'liblinear' solver and only triggers a warning.
logreg = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
    penalty='l1',
    max_iter=200,
    random_state=42
)
results.append(evaluate_model(logreg, X_res, y_res, x_test_scaled, y_test, "Logistic Reg"))
del logreg  # drop the reference to the fitted model

In [None]:
# # 3. Support Vector Classifier - SVC (Caution)
# # ==============================================
# from sklearn.svm import SVC
# svc = SVC(
#     class_weight='balanced',
#     probability=True,
#     kernel='rbf',
#     gamma='scale',
#     random_state=42,
#     cache_size=1000  # Helps with memory
# )
# svc_results = evaluate_model(svc, X_res, y_res, x_test_scaled, y_test, "SVC")
# if svc_results: results.append(svc_results)
# del svc

In [None]:
# 6. AdaBoost (Quick)
# ==============================================
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 50 estimators; no class weighting is passed here
ada = AdaBoostClassifier(
    n_estimators=50,
    learning_rate=0.5,
    random_state=42
)
results.append(evaluate_model(ada, X_res, y_res, x_test_scaled, y_test, "AdaBoost"))
del ada  # drop the reference to the fitted model

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees ensemble with bootstrapped rows and balanced class weights
et = ExtraTreesClassifier(
    class_weight='balanced',
    n_estimators=50,
    max_depth=7,
    bootstrap=True,
    n_jobs=-1,
    random_state=42
)
results.append(evaluate_model(et, X_res, y_res, x_test_scaled, y_test, "Extra Trees"))
del et  # drop the reference to the fitted model

#### Combine and Rank Results

In [None]:
# Build the comparison table once every model cell has run; failed runs (None) are skipped
results_df = pd.DataFrame([entry for entry in results if entry is not None])

# Best fraud detectors first: highest recall, then highest F1, then fewest
# missed frauds; Rank numbers the rows in that order.
ranked_df = (
    results_df
    .sort_values(
        by=['Recall (Fraud)', 'F1 (Fraud)', 'Type II (FN)'],
        ascending=[False, False, True],
    )
    .assign(Rank=lambda ranked: range(1, len(ranked) + 1))
)

# Highlight top 3 models
def highlight_top3(s):
    """Return one CSS string per element of ``s``, marking its three largest values.

    Intended for ``Styler.apply`` column-wise: the result is aligned with ``s``
    row by row. Membership is tested on index *labels*, so the right rows are
    marked even when the index is not 0..n-1 (e.g. after sorting).
    """
    top3 = s.nlargest(3).index
    return ['background-color: #FFFF00' if label in top3 else '' for label in s.index]

# Apply the top-3 highlight to the recall and F1 columns, one column at a time
ranked_df.style.apply(highlight_top3, subset=['Recall (Fraud)', 'F1 (Fraud)'])