In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [4]:
# Read the processed fraud dataset (written by the EDA notebook) into a DataFrame.
fraud_data_path = '../data/processed/fraud_data_with_country.csv'
fraud_df = pd.read_csv(fraud_data_path)

In [5]:
# 1. Time-based features
# Parse both timestamp columns into pandas datetimes so they can be subtracted later.
for time_col in ('signup_time', 'purchase_time'):
    fraud_df[time_col] = pd.to_datetime(fraud_df[time_col])

In [6]:
# Elapsed time between signup and purchase, expressed in seconds.
signup_to_purchase = fraud_df['purchase_time'] - fraud_df['signup_time']
fraud_df['time_since_signup'] = signup_to_purchase.dt.total_seconds()

In [7]:
# Sanity-check the time_since_signup feature computed in the previous cell.
# (This cell used to repeat that computation verbatim; recomputing the same
# column adds nothing, so it now reports values that would indicate bad
# timestamps instead.)
gap_seconds = fraud_df['time_since_signup']
missing_gaps = int(gap_seconds.isna().sum())    # NaN when either timestamp is missing
negative_gaps = int((gap_seconds < 0).sum())    # purchase recorded before signup
print(f"time_since_signup: {missing_gaps} missing, {negative_gaps} negative")

In [8]:
# 2. Transaction frequency and velocity
# For every row, record how many rows in the dataset share its device_id.
rows_by_device = fraud_df.groupby('device_id')
fraud_df['device_usage_count'] = rows_by_device['device_id'].transform('count')

In [9]:
# For every row, record how many rows in the dataset share its ip_address.
rows_by_ip = fraud_df.groupby('ip_address')
fraud_df['ip_usage_count'] = rows_by_ip['ip_address'].transform('count')

Data Transformation


In [10]:
# 3. Categorical encoding (one-hot)
# Replace source, browser and sex with dummy columns; drop_first=True omits the
# first level of each feature so the dummies are not perfectly collinear.
categorical_cols = ['source', 'browser', 'sex']
fraud_df = pd.get_dummies(fraud_df, columns=categorical_cols, drop_first=True)

# 4. Normalization / scaling
# Standardize the numeric features in place. The scaler is fitted on the full
# frame here, i.e. before any train/test split.
num_features = ['purchase_value', 'age', 'time_since_signup', 'device_usage_count', 'ip_usage_count']
scaler = StandardScaler()
scaler.fit(fraud_df[num_features])
fraud_df[num_features] = scaler.transform(fraud_df[num_features])

# For the bank dataset, the 'Amount' feature would be scaled the same way:
# credit_df['Amount'] = scaler.fit_transform(credit_df[['Amount']])

Handling Class Imbalance

In [11]:
# Split the frame into a feature matrix and the target column.
# Identifier, raw timestamp and country columns are excluded from the features.
# NOTE(review): ip_address is not in this list, so it stays in X as a raw,
# unscaled column — confirm that is intended.
non_feature_cols = ['class', 'user_id', 'signup_time', 'purchase_time', 'device_id', 'country']
X = fraud_df.drop(columns=non_feature_cols)
y = fraud_df['class']

# Class distribution before oversampling.
class_counts_before = y.value_counts()
print("Before SMOTE:\n", class_counts_before)

# Oversample with SMOTE, seeded for reproducibility.
# Note: in a real workflow, split the data FIRST and apply SMOTE to the training set only.
# NOTE(review): X includes the one-hot dummy columns created earlier; consider
# whether SMOTENC is the better fit for those — verify.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Class distribution after oversampling.
class_counts_after = pd.Series(y_resampled).value_counts()
print("\nAfter SMOTE:\n", class_counts_after)

Before SMOTE:
 class
0    136961
1     14151
Name: count, dtype: int64

After SMOTE:
 class
0    136961
1    136961
Name: count, dtype: int64


In [12]:
# Persist the resampled features and target as separate CSV files in the
# processed data folder (row index omitted).
features_path = '../data/processed/fraud_features.csv'
target_path = '../data/processed/fraud_target.csv'
X_resampled.to_csv(features_path, index=False)
pd.Series(y_resampled).to_csv(target_path, index=False)