In [3]:

from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Read the raw transactions, discard exact duplicate rows (the same record
# must not be able to appear in both train and test), and order the rows by
# Time so the diff/rolling features computed below see them chronologically.
df = (
    pd.read_csv("../data/raw/creditcard.csv")
    .drop_duplicates()
    .sort_values(by='Time')
)

# Extract hour of day from Time (seconds since first transaction)
df['hour_of_day'] = (df['Time'] // 3600) % 24

# Time delta: seconds between consecutive transactions
# First transaction has no previous — fill with 0
df['time_delta'] = df['Time'].diff().fillna(0)

# Log-transform Amount to reduce skewness and outlier impact
df['Amount'] = np.log1p(df['Amount'])

# Transaction velocity: number of rows whose Time falls in the trailing
# 300-second window ending at (and including) the current row. The count is
# taken over the whole frame — no per-card grouping is applied here.
# Time is turned into a datetime index only because time-offset rolling
# windows require one; .values re-attaches the result by row position.
df_temp = df.set_index(pd.to_datetime(df['Time'], unit='s'))
df['velocity_300s'] = df_temp.rolling('300s')['Amount'].count().values

# Drop Time — no longer needed, all time-based features are extracted
df.drop(columns=['Time'], inplace=True)

# Train/test split — stratified to preserve class imbalance ratio
X = df.drop(columns=['Class'])
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Z-score of Amount (computed on log-transformed values)
# Measures how many standard deviations a transaction is from the mean.
# The mean/std are fitted on the TRAINING rows only and then applied to both
# splits, so the held-out set's Amount distribution cannot leak into the
# training features. Work on copies so the inserts below never touch X.
X_train = X_train.copy()
X_test = X_test.copy()
mean_amount = X_train['Amount'].mean()
std_amount = X_train['Amount'].std()

# Insert just before velocity_300s so the saved column order stays
# ..., Amount, hour_of_day, time_delta, amount_zscore, velocity_300s
zscore_position = X_train.columns.get_loc('velocity_300s')
X_train.insert(
    zscore_position, 'amount_zscore',
    (X_train['Amount'] - mean_amount) / std_amount,
)
X_test.insert(
    zscore_position, 'amount_zscore',
    (X_test['Amount'] - mean_amount) / std_amount,
)

# SMOTE applied only on train set to handle class imbalance
# Never apply SMOTE on test set — that would be data leakage
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Save processed data
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
Path('../data/processed').mkdir(parents=True, exist_ok=True)

# index=False: the row index is not a feature and must not become a column
X_train_smote.to_csv('../data/processed/X_train_smote.csv', index=False)
y_train_smote.to_csv('../data/processed/y_train_smote.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)

# Report the resampled train shape and the untouched test shape
print(f"Train set: {X_train_smote.shape}, Test set: {X_test.shape}")

Train set: (453204, 33), Test set: (56746, 33)
