In [7]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# Ensure path for src imports
sys.path.append(os.path.abspath(".."))
from src.preprocessing import preprocess_data

# Feature Engineering and Transformation

This notebook handles advanced feature creation, encoding, scaling, and imbalance correction for the fraud detection model.

## 1. Setup and Data Loading

Load and preprocess the fraud data using existing cleaning functions.

In [10]:
# Run the project's preprocessing function on the raw fraud and IP-to-country CSVs
fraud_csv_path = "../data/raw/Fraud_Data.csv"
ip_country_csv_path = "../data/raw/IpAddress_to_Country.csv"
merged_df = preprocess_data(fraud_csv_path, ip_country_csv_path)

# Sanity check: frame dimensions and the first few rows
print(f"Loaded data shape: {merged_df.shape}")
print(merged_df.head())

KeyError: "None of [Index(['Amount'], dtype='object')] are in the [columns]"

## 2. Feature Engineering (Creation)

Create time-based, velocity, and frequency features.

In [None]:
# Calendar features derived from the purchase timestamp
purchase_ts = merged_df['purchase_time']
merged_df['hour_of_day'] = purchase_ts.dt.hour
merged_df['day_of_week'] = purchase_ts.dt.dayofweek

# Velocity: seconds elapsed between signup and purchase
signup_to_purchase = purchase_ts - merged_df['signup_time']
merged_df['time_since_signup'] = signup_to_purchase.dt.total_seconds()

# Frequency: number of rows recorded for each user_id, broadcast back to every row
merged_df['transaction_frequency'] = merged_df.groupby('user_id')['user_id'].transform('count')

# Preview only the newly created columns
new_feature_cols = ['hour_of_day', 'day_of_week', 'time_since_signup', 'transaction_frequency']
print("Features added:")
print(merged_df[new_feature_cols].head())

## 3. Transformation (Encoding and Scaling)

Apply one-hot encoding to categorical features and scaling to numerical ones.

In [None]:
# One-hot encode the categorical columns in place of the originals.
# drop_first=True drops one level per feature so the dummies are not
# perfectly collinear.
categorical_cols = ['source', 'browser', 'sex']
merged_df = pd.get_dummies(merged_df, columns=categorical_cols, drop_first=True)

# Scaling (only these numerical features; one-hot columns and the target are excluded).
# NOTE(review): the engineered numeric features (e.g. time_since_signup) are
# left unscaled, as before -- confirm that this is intended.
numerical_cols = ['purchase_value', 'age']

# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into the scaled features. The training positions are reproduced with the
# same test_size / random_state / stratify arguments as the train/test split in
# the imbalance-handling cell; keep the two calls in sync.
train_pos, _ = train_test_split(
    np.arange(len(merged_df)),
    test_size=0.2,
    random_state=42,
    stratify=merged_df['class'],
)
scaler = StandardScaler()
scaler.fit(merged_df.iloc[train_pos][numerical_cols])

# Apply the train-fitted transform to every row (train and test alike).
merged_df[numerical_cols] = scaler.transform(merged_df[numerical_cols])

print(f"Data shape after transformation: {merged_df.shape}")
print(merged_df.head())

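## 4. Imbalance Handling (Random Oversampling)

Split the data and rebalance the training set only, by randomly oversampling the minority class with replacement. SMOTE is not used here, so no synthetic samples are generated; the test set is left untouched.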

In [None]:
# Define X and y.
# The target, identifier, timestamp and IP columns are not used as features.
non_feature_cols = [
    'class', 'user_id', 'signup_time', 'purchase_time', 'device_id',
    'ip_address', 'lower_bound_ip_address', 'upper_bound_ip_address',
]
# Keep every numeric and boolean column. 'number' is used instead of the
# builtin float/int selectors because those match only the default-width
# dtypes and would silently drop small integer columns (for example uint8
# dummy columns produced by older pandas versions). Timedeltas are excluded
# explicitly so that only plain numeric features remain.
X = (
    merged_df
    .drop(non_feature_cols, axis=1)
    .select_dtypes(include=['number', 'bool'], exclude=['timedelta'])
)
y = merged_df['class']

# Train/Test Split (stratified so both parts keep the original class ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"y_train shape before oversampling: {y_train.shape}")
print(y_train.value_counts())

# Rebalance the training data only.
# SMOTE is not available here, so the minority class is randomly oversampled
# with replacement (existing rows are duplicated; nothing is synthesised).
train_data = X_train.copy()
train_data['class'] = y_train

# Assumes binary labels: 0 = majority class, 1 = minority class.
majority = train_data[train_data['class'] == 0]
minority = train_data[train_data['class'] == 1]

# Draw minority rows until both classes have the same number of rows.
minority_oversampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)

# Shuffle so the training set is not ordered by class, and reset the index
# because sampling with replacement leaves duplicate index labels.
train_oversampled = (
    pd.concat([majority, minority_oversampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

X_train_sm = train_oversampled.drop('class', axis=1)
y_train_sm = train_oversampled['class']

print(f"y_train shape after oversampling: {y_train_sm.shape}")
print(y_train_sm.value_counts())

## 5. Save Processed Data

Save the training and test sets to the processed folder.

In [None]:
# Make sure the output folder exists: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Save the rebalanced training data (features and target side by side)
train_df = pd.concat([X_train_sm, y_train_sm], axis=1)
train_df.to_csv(f'{processed_dir}/train_enc_smote.csv', index=False)

# Save test data (untouched by the rebalancing step)
test_df = pd.concat([X_test, y_test], axis=1)
test_df.to_csv(f'{processed_dir}/test_enc.csv', index=False)

print("Data saved successfully.")

print("Data saved successfully.")