In [None]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load raw dataset
df = pd.read_csv("../data/raw/creditcard.csv")

# Remove exact duplicate rows so the same record cannot end up in both the
# train and the test split later on
df = df.drop_duplicates()

# Sort chronologically — the diff/rolling features below assume ascending Time.
# A stable sort keeps rows that share the same Time value in their original
# file order; the default (unstable) quicksort may reorder such ties, which
# would make time_delta and velocity_300s depend on an arbitrary ordering.
df = df.sort_values(by='Time', kind='mergesort')

# Hour bucket derived from Time (seconds since the first transaction),
# wrapped to a 24-hour cycle
df['hour_of_day'] = (df['Time'] // 3600) % 24

# Time delta: seconds between consecutive rows in sorted order.
# The first row has no predecessor, so its NaN is replaced with 0.
df['time_delta'] = df['Time'].diff().fillna(0)

# Log-transform Amount (log(1 + x)) to reduce skewness and outlier impact
df['Amount'] = np.log1p(df['Amount'])

# Transaction velocity: number of rows inside a trailing 300-second window.
# High velocity is a strong fraud signal — thieves act fast.
# The count is taken over all rows (no grouping is applied), and Amount is
# only used as a carrier column for the rolling count of non-null values.
time_index = pd.to_datetime(df['Time'], unit='s')
df['velocity_300s'] = (
    df['Amount']
    .set_axis(time_index)
    .rolling('300s')
    .count()
    .values
)

# Drop Time — no longer needed, all time-based features are extracted
df = df.drop(columns=['Time'])

# Sanity check — after sorting, time_delta must never be negative:
# print its minimum and the shape of the subset with negative deltas
print(df['time_delta'].min())
print(df[df['time_delta'] < 0].shape)
# Train/test split — stratified to preserve class imbalance ratio.
# The split happens before any dataset-level statistics (mean/std) are
# computed so that test rows never influence them.
X = df.drop(columns=['Class'])
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Take explicit copies before adding columns: the frames returned by
# train_test_split are selections of X, and assigning to them directly can
# raise SettingWithCopyWarning depending on the pandas/scikit-learn versions.
X_train = X_train.copy()
X_test = X_test.copy()

# Z-score of Amount — computed only on train statistics.
# Applying the train mean/std to the test set prevents data leakage.
# NOTE(review): amount_zscore is an affine transform of Amount, so once both
# columns go through the StandardScaler step that follows they become
# numerically identical — confirm whether this extra feature is intended.
mean_amount = X_train['Amount'].mean()
std_amount = X_train['Amount'].std()
X_train['amount_zscore'] = (X_train['Amount'] - mean_amount) / std_amount
X_test['amount_zscore'] = (X_test['Amount'] - mean_amount) / std_amount

# StandardScaler — fit only on train, then transform both sets.
# Prevents test set statistics from influencing the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame — scaler returns a numpy array.
# Carry over the original row labels as well as the column names so the
# scaled features stay index-aligned with y_train / y_test; without this the
# frames get a fresh RangeIndex and any label-based operation against the
# targets (boolean masks, joins) would fail or misalign.
X_train_scaled = pd.DataFrame(
    X_train_scaled, columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    X_test_scaled, columns=X_test.columns, index=X_test.index
)

# SMOTE applied only on train set to handle class imbalance.
# Never apply SMOTE on the test set — that would be data leakage.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Sanity check — summary statistics of the engineered features after
# scaling and resampling
print(X_train_smote[['hour_of_day', 'time_delta', 'velocity_300s', 'amount_zscore']].describe())

# Make sure the output directories exist before writing; to_csv and
# joblib.dump do not create missing parent directories and would otherwise
# fail on a fresh checkout.
Path('../data/processed').mkdir(parents=True, exist_ok=True)
Path('../models').mkdir(parents=True, exist_ok=True)

# Save processed data and the fitted scaler for future use.
# index=False: row labels are not written, files are matched by row position.
X_train_smote.to_csv('../data/processed/X_train_smote.csv', index=False)
y_train_smote.to_csv('../data/processed/y_train_smote.csv', index=False)
X_test_scaled.to_csv('../data/processed/X_test.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)
joblib.dump(scaler, '../models/scaler.pkl')

# Final shapes of the resampled train set and the untouched test set
print(f"Train: {X_train_smote.shape}, Test: {X_test_scaled.shape}")

         hour_of_day     time_delta  velocity_300s  amount_zscore
count  453204.000000  453204.000000  453204.000000  453204.000000
mean       -0.160424       0.173606      -0.236946      -0.118799
std         1.028043       1.190593       1.174422       1.106499
min        -2.411161      -0.578526      -3.465152      -1.906478
25%        -0.867125      -0.578526      -0.650620      -1.245164
50%        -0.022355      -0.138705       0.216683      -0.143782
75%         0.676910       0.371798       0.492112       0.829928
max         1.534708      29.831852       4.381245       4.062703
Train: (453204, 33), Test: (56746, 33)
