# Data Transformation and Class Imbalance Handling
This notebook one-hot encodes the categorical features, standardizes the numerical features, and addresses class imbalance in the training set using SMOTE.

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import joblib
import os

# Read the feature-engineered dataset from the processed-data folder.
df = pd.read_csv('../data/processed/Fraud_Data_features.csv')

# Label vector: the `class` column.
y = df['class']

# Feature matrix: every column except the label and the five columns below,
# which are excluded from modeling.
X = df.drop(
    columns=[
        'class',
        'user_id',
        'signup_time',
        'purchase_time',
        'device_id',
        'ip_address',
    ]
)

## 1. Encoding and Splitting
One-hot encoding the categorical variables, then splitting into stratified train/test sets (80/20).

In [2]:
# One-hot encode the four categorical columns. drop_first=True keeps k-1
# dummies per column by removing the first level.
# NOTE: this rebinds X, so the cell is not re-runnable on its own; re-run the
# loading cell first to restore the un-encoded frame.
X = pd.get_dummies(
    X,
    columns=['source', 'browser', 'sex', 'country'],
    drop_first=True,
)

# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")

Train size: (120889, 195), Test size: (30223, 195)


## 2. Scaling
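Standardizing the numerical features (zero mean, unit variance). The scaler is fitted on the training set only and then applied to both the training and test sets.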

In [3]:
# Columns to standardize; every other column in X_train / X_test is left untouched.
num_cols = [
    'purchase_value',
    'age',
    'time_since_signup',
    'hour_of_day',
    'day_of_week',
    'user_id_count',
    'device_id_count',
]

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
# NOTE: X_train and X_test are overwritten in place, so re-running this cell
# refits the scaler on data that has already been scaled.
scaler = StandardScaler()
scaler.fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

## 3. Handling Class Imbalance (SMOTE)
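Balancing the training set by oversampling the minority class with SMOTE. Only the training split is resampled; the test set keeps its original class distribution.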

In [4]:
# Seeded SMOTE sampler so the resampled set is reproducible.
sm = SMOTE(random_state=42)

# Class counts of the training labels before resampling.
print(f"Before SMOTE: {y_train.value_counts().to_dict()}")

# Resample the training split only; X_test / y_test are not passed to SMOTE.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Class counts of the training labels after resampling.
print(f"After SMOTE: {y_train_res.value_counts().to_dict()}")

Before SMOTE: {0: 109568, 1: 11321}
After SMOTE: {0: 109568, 1: 109568}
