In [1]:
# =========================================
# Notebook 1: Data Loading & Preprocessing
# =========================================

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the raw transactions table from disk and report its size along with
# the total of the 'Class' column (printed as the number of fraud cases).
df = pd.read_csv("creditcard.csv")  # update path as needed
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
fraud_case_count = df['Class'].sum()
print("Fraud cases:", fraud_case_count)

# Derive an integer hour-of-day (0-23) from 'Time', which is treated here as
# an elapsed-seconds counter: wrap it to one day, then floor to whole hours.
SECONDS_PER_HOUR = 3600
SECONDS_PER_DAY = 24 * SECONDS_PER_HOUR
seconds_into_day = df['Time'] % SECONDS_PER_DAY
df['Hour'] = (seconds_into_day // SECONDS_PER_HOUR).astype(int)

# Select the model inputs: the anonymised columns V1..V28 plus the raw
# 'Amount', 'Time' and the derived 'Hour'.
v_columns = [f'V{idx}' for idx in range(1, 29)]
features = [*v_columns, 'Amount', 'Time', 'Hour']

# X is an independent copy so later in-place edits never touch df;
# Y is the 'Class' label column.
X = df.loc[:, features].copy()
Y = df['Class']

# Standardise 'Amount' and 'Time' in place (the remaining columns are left
# on their original scale).
# NOTE(review): the scaler is fit on the whole table, before any train/test
# split is visible here -- confirm downstream notebooks are fine with that.
scaler = StandardScaler()
columns_to_scale = ['Amount', 'Time']
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

# Encode the hour as a point on the unit circle so that hour 23 sits next to
# hour 0, then drop the raw 'Hour' column the encoding replaces.
hour_angle = 2 * np.pi * X['Hour'] / 24
X['hour_sin'] = np.sin(hour_angle)
X['hour_cos'] = np.cos(hour_angle)
del X['Hour']

# Flag transactions whose amount lies strictly above the 99th percentile.
# The threshold is taken from df['Amount'], not from the feature frame X.
raw_amount = df['Amount']
high_amount_threshold = raw_amount.quantile(0.99)
exceeds_threshold = raw_amount > high_amount_threshold

# Keep the 0/1 flag on df as well as adding it as a feature column on X.
df['High_Amount_Flag'] = exceeds_threshold.astype(int)
X['High_Amount_Flag'] = df['High_Amount_Flag']

# Optional: Amount Bands for analysis
# Bucket amounts into right-closed bands:
#   Very Low (-1, 10], Low (10, 100], Medium (100, 1000], High (1000, inf].
# The top edge is open-ended (np.inf) rather than df['Amount'].max(): with
# the maximum as the last edge, pd.cut raises ValueError whenever the largest
# amount is <= 1000, because the edges stop being strictly increasing.
# For data whose maximum exceeds 1000 the resulting bands are unchanged.
amount_band_edges = [-1, 10, 100, 1000, np.inf]
amount_band_labels = ['Very Low', 'Low', 'Medium', 'High']
df['Amount_Band'] = pd.cut(df['Amount'],
                           bins=amount_band_edges,
                           labels=amount_band_labels)

# Mean of the 0/1 'Class' column per band, i.e. the fraud rate per band.
# observed=True leaves out bands that contain no rows.
amount_risk = df.groupby('Amount_Band', observed=True)['Class'].mean().reset_index()
amount_risk.columns = ['Amount Band', 'Fraud Rate']
print("Fraud rate by Amount Band:\n", amount_risk)

# Write the feature matrix and the label column to CSV, without the index
# column, so they can be read back from disk later.
for frame, out_path in ((X, "X_preprocessed.csv"), (Y, "Y_preprocessed.csv")):
    frame.to_csv(out_path, index=False)
print("Preprocessed data saved to 'X_preprocessed.csv' and 'Y_preprocessed.csv'")


Dataset shape: (284807, 31)
Fraud cases: 492
Fraud rate by Amount Band:
   Amount Band  Fraud Rate
0    Very Low    0.002483
1         Low    0.000883
2      Medium    0.002259
3        High    0.003061
Preprocessed data saved to 'X_preprocessed.csv' and 'Y_preprocessed.csv'
