In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [3]:
# Read the raw credit-card transactions table into a DataFrame.
creditcard_data = pd.read_csv('data/raw/creditcard.csv')

# =============================
# Data Cleaning
# =============================
# Report the number of missing values per column. This is printed before any
# rows are removed, so it describes the file exactly as loaded.
null_counts = creditcard_data.isnull().sum()
print(null_counts)

# Discard rows that exactly repeat an earlier row (pandas keeps the first
# occurrence by default). The surviving rows keep their original index labels.
creditcard_data = creditcard_data.drop_duplicates()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [4]:
# -----------------------------
# Feature Engineering
# -----------------------------
# For creditcard.csv, V1-V28 are already PCA features, we can scale Amount
scaler = StandardScaler()
creditcard_data['Amount'] = scaler.fit_transform(creditcard_data[['Amount']])

In [5]:
# -----------------------------
# Train-Test Split
# -----------------------------
X = creditcard_data.drop(['Class'], axis=1)
y = creditcard_data['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [6]:
# =============================
# Handle Class Imbalance (SMOTE)
# =============================
# Oversample using the training partition only; X_test / y_test are not passed
# in, so the held-out rows are left at their original class ratio.
# NOTE(review): SMOTE synthesizes samples from nearest neighbours, so a feature
# on a much larger scale than the rest can dominate the distance. Confirm that
# every column in X_train (e.g. Time) is on a comparable scale.
smote = SMOTE(random_state=42)  # seeded so the resampling is repeatable
X_train_res, y_train_res = smote.fit_resample(X=X_train, y=y_train)

# Show how many rows each class has after resampling.
resampled_class_counts = pd.Series(y_train_res).value_counts()
print("Resampled class distribution:", resampled_class_counts, sep="\n")

Resampled class distribution:
Class
0    226602
1    226602
Name: count, dtype: int64
