In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import seaborn as sns
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as imb_Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [7]:
train = pd.read_csv('../data/train.csv')
val = pd.read_csv('../data/val.csv')
test = pd.read_csv('../data/test.csv')

In [None]:
def compute_train_stats(train_df):
    stats = {}
    
    # amount stats
    stats['log_amount_mean'] = np.mean(np.log1p(train_df['Amount']))
    stats['log_amount_std']  = np.std(np.log1p(train_df['Amount']))
    
    # amount bins (quantiles)
    stats['amount_bins'] = pd.qcut(
        np.log1p(train_df['Amount']),
        q=5,
        retbins=True
    )[1]  
    
    return stats

def apply_feature_engineering(data, train_stats=None):
    df = data.copy()
    
    # 1. Hour + rush hour
    df['Hour'] = (df['Time'] // 3600) % 24
    df['is_rush_hour'] = df['Hour'].apply(lambda h: 1 if h in [0,1,2] else 0)

    # 2. log amount
    df['log_amount'] = np.log1p(df['Amount'])
    
    # --- if train: calculate stats ---
    if train_stats is None:
        train_stats = {}
        train_stats['log_amount_mean'] = df['log_amount'].mean()
        train_stats['log_amount_std'] = df['log_amount'].std()
        df['amount_z_scores'] = (df['log_amount'] - train_stats['log_amount_mean']) / train_stats['log_amount_std']
        df['is_outlier_amount'] = (df['amount_z_scores'].abs() > 2).astype(int)
        df.drop(columns='amount_z_scores', inplace=True)
        
        df['amount_bin'], bins = pd.qcut(
            df['log_amount'], 
            q=5, 
            labels=["Very Low", "Low", "Medium", "High", "Very High"], 
            retbins=True
        )
        train_stats['amount_bins'] = bins

        # V-features stats
        v_stats = {}
        for i in range(1, 29):
            col = f"V{i}"
            v_stats[col] = {
                "mean": df[col].mean(),
                "std": df[col].std()
            }
            z_col = f"{col}_zscore"
            outlier_col = f"{col}_is_outlier"
            df[z_col] = (df[col] - v_stats[col]["mean"]) / v_stats[col]["std"]
            df[outlier_col] = (df[z_col].abs() > 2).astype(int)
        
        # Drop unwanted
        cols = [f"V{i}_is_outlier" for i in range(1, 29) if i not in [13, 15, 22, 23, 24, 26]]
        df.drop(columns=cols, inplace=True)
        cols = [f"V{i}_zscore" for i in range(1, 29)]
        df.drop(columns=cols, inplace=True)

        train_stats['v_stats'] = v_stats

    # --- if val/test: apply stats from train ---
    else:
        # Use the provided train_stats directly (don't call compute_train_stats again)
        mean = train_stats['log_amount_mean']
        std = train_stats['log_amount_std']
        df['amount_z_scores'] = (df['log_amount'] - mean) / std
        df['is_outlier_amount'] = (df['amount_z_scores'].abs() > 2).astype(int)
        df.drop(columns='amount_z_scores', inplace=True)

        df['amount_bin'] = pd.cut(
            df['log_amount'],
            bins=train_stats['amount_bins'],
            labels=["Very Low", "Low", "Medium", "High", "Very High"],
            include_lowest=True
        )

        for i in range(1, 29):
            col = f"V{i}"
            mean = train_stats['v_stats'][col]["mean"]
            std = train_stats['v_stats'][col]["std"]
            z_col = f"{col}_zscore"
            outlier_col = f"{col}_is_outlier"
            df[z_col] = (df[col] - mean) / std
            df[outlier_col] = (df[z_col].abs() > 2).astype(int)

        cols = [f"V{i}_is_outlier" for i in range(1, 29) if i not in [13, 15, 22, 23, 24, 26]]
        df.drop(columns=cols, inplace=True)
        cols = [f"V{i}_zscore" for i in range(1, 29)]
        df.drop(columns=cols, inplace=True)

    # 3. Interactions
    df['amount_hour_interaction'] = df['log_amount'] * df['Hour']
    df['V7_amount'] = df['V7'] * df['log_amount']
    df['V12_amount'] = df['V12'] * df['log_amount']
    df['V20_amount'] = df['V20'] * df['log_amount']
    df['V11_hour'] = df['V11'] * df['Hour']
    df['V12_hour'] = df['V12'] * df['Hour']

    return df, train_stats

In [13]:
train_engineered, train_stats = apply_feature_engineering(train)
val_engineered,_ = apply_feature_engineering(val,train_stats)
test_engineered,_ = apply_feature_engineering(test,train_stats)

In [57]:
# --- Custom transformer for cyclic encoding ---
class CyclicalFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, cols, periods):
        self.cols = cols if isinstance(cols, list) else [cols]
        self.periods = periods if isinstance(periods, list) else [periods]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        for col, period in zip(self.cols, self.periods):
            sin_col = np.sin(2 * np.pi * X_[col] / period)
            cos_col = np.cos(2 * np.pi * X_[col] / period)
            X_[f"{col}_sin"] = sin_col
            X_[f"{col}_cos"] = cos_col
            X_.drop(columns=[col], inplace=True)  # drop original cyclic col
        return X_

# --- Features ---
X = train_engineered.drop(columns=['Class','Amount'], axis=1)
y = train_engineered['Class']

cat_feat = ['amount_bin']   
bin_feat = [
    'V13_is_outlier','V15_is_outlier','V22_is_outlier',
    'V23_is_outlier','V24_is_outlier','V26_is_outlier',
    'is_outlier_amount','is_rush_hour'
] 
cyc_feat = ['Hour']
v_feat = [f"V{i}" for i in range(1,29)]
num_feat = X.drop(columns=cat_feat+bin_feat+cyc_feat+v_feat, axis=1).columns

# --- Preprocessor ---                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  
preprocessor = ColumnTransformer([
    ('oe', OrdinalEncoder(), cat_feat),
    ('scaler', StandardScaler(), num_feat),
    ('cyclical', CyclicalFeatures(cols=['Hour'], periods=[24]), cyc_feat)
], remainder='passthrough')


# --- Sampling ---
oversample = SMOTE(sampling_strategy='minority', k_neighbors=5, random_state=1)
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=1)

# --- Pipeline ---
pipeline = imb_Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('over', oversample),
    ('under', undersample),
    ('logistic', LogisticRegression())
    #('xgboost',XGBClassifier())
    #('randomforest',RandomForestClassifier())
])

# --- Fit ---
model = pipeline.fit(X, y)

# --- Check class balance after resampling ---
X_res, y_res = pipeline.named_steps['under'].fit_resample(
    *pipeline.named_steps['over'].fit_resample(
        preprocessor.fit_transform(X), y
    )
)
print("Class balance after sampling:", Counter(y_res))

Class balance after sampling: Counter({0: 170579, 1: 170579})


In [45]:
# --- Custom transformer for cyclic encoding ---
class CyclicalFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, cols, periods):
        self.cols = cols if isinstance(cols, list) else [cols]
        self.periods = periods if isinstance(periods, list) else [periods]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        for col, period in zip(self.cols, self.periods):
            sin_col = np.sin(2 * np.pi * X_[col] / period)
            cos_col = np.cos(2 * np.pi * X_[col] / period)
            X_[f"{col}_sin"] = sin_col
            X_[f"{col}_cos"] = cos_col
            X_.drop(columns=[col], inplace=True)  # drop original cyclic col
        return X_

# --- Features ---
X = train_engineered.drop(columns=['Class','Amount'], axis=1)
y = train_engineered['Class']

cat_feat = ['amount_bin']   
bin_feat = [
    'V13_is_outlier','V15_is_outlier','V22_is_outlier',
    'V23_is_outlier','V24_is_outlier','V26_is_outlier',
    'is_outlier_amount','is_rush_hour'
] 
cyc_feat = ['Hour']
v_feat = [f"V{i}" for i in range(1,29)]
num_feat = X.drop(columns=cat_feat+bin_feat+cyc_feat+v_feat, axis=1).columns
cnter = Counter(y)
ir = cnter[1]/cnter[0]
# --- Preprocessor ---                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  
preprocessor = ColumnTransformer([
    ('oe', OrdinalEncoder(), cat_feat),
    ('scaler', StandardScaler(), num_feat),
    ('cyclical', CyclicalFeatures(cols=['Hour'], periods=[24]), cyc_feat)
], remainder='passthrough')



# --- Pipeline ---
pipeline = imb_Pipeline(steps=[
    ('preprocessor', preprocessor),
    #('over', oversample),
    #('under', undersample),
    ('logistic', LogisticRegression(class_weight={0:ir,1:1}))
    #('xgboost',XGBClassifier())
    #('randomforest',RandomForestClassifier())
])

# --- Fit ---
model = pipeline.fit(X, y)

In [66]:
y_prob = model.predict_proba(X)[:, 1]                                                                                                                                                                                                 
y_pred = (y_prob > 0.5).astype(int)

# y_pred = model.predict(X)

matrix = confusion_matrix(y,y_pred)
report = classification_report(y,y_pred)

print(matrix)
print(report)

[[165881   4698]
 [    19    286]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.99    170579
           1       0.06      0.94      0.11       305

    accuracy                           0.97    170884
   macro avg       0.53      0.96      0.55    170884
weighted avg       1.00      0.97      0.98    170884



In [67]:
X_val = val_engineered.drop(columns=['Class','Amount'], axis=1)
y_val = val_engineered['Class']

y_val_pred = model.predict(X_val)

matrix = confusion_matrix(y_val,y_val_pred)
report = classification_report(y_val,y_val_pred)

print(matrix)
print(report)

[[55377  1493]
 [   11    79]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56870
           1       0.05      0.88      0.10        90

    accuracy                           0.97     56960
   macro avg       0.53      0.93      0.54     56960
weighted avg       1.00      0.97      0.99     56960



In [68]:
X_test = test_engineered.drop(columns=['Class','Amount'], axis=1)
y_test = test_engineered['Class']

y_test_pred = model.predict(X_test)

matrix = confusion_matrix(y_test,y_test_pred)
report = classification_report(y_test,y_test_pred)

print(matrix)
print(report)

[[55246  1617]
 [   10    87]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56863
           1       0.05      0.90      0.10        97

    accuracy                           0.97     56960
   macro avg       0.53      0.93      0.54     56960
weighted avg       1.00      0.97      0.98     56960

