# Data Preprocessing and Feature Engineering for Fraud Detection

This notebook performs data preprocessing and feature engineering on the fraud detection datasets:
1. Fraud_Data.csv - E-commerce transaction data
2. IpAddress_to_Country.csv - IP to country mapping
3. creditcard.csv - Bank transaction data

In [5]:
import os
import sys

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Add the src directory to the path
if os.path.abspath('..') not in sys.path:
    sys.path.append(os.path.abspath('..'))

# Import custom modules
from src.load_data import load_data
from src.preprocessing import (
    handle_missing_values, clean_data, encode_categorical_features,
    scale_features, handle_class_imbalance, convert_ip_to_int
)
from src.feature_engineering import (
    add_time_features, add_transaction_features,
    merge_ip_country_data, add_amount_features
)

# Show every column when a DataFrame is rendered (no column truncation)
pd.options.display.max_columns = None

# Plot appearance: apply the whitegrid style sheet first, then the palette,
# so the palette is set after the style sheet has been applied.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

## 1. Load the Datasets

In [7]:
# Read the three raw CSVs in order: e-commerce transactions,
# the IP-to-country mapping, and the bank transactions.
fraud_data, ip_country, creditcard = (
    load_data('../data/raw/Fraud_Data.csv'),
    load_data('../data/raw/IpAddress_to_Country.csv'),
    load_data('../data/raw/creditcard.csv'),
)

## 2. Preprocess Fraud_Data.csv

In [8]:
# Run the cleaning pass first, then handle whatever is still missing.
# Both helpers print their own diagnostics (duplicate count, missing-value
# report), so the order of the calls determines the order of the output.
fraud_data = handle_missing_values(clean_data(fraud_data))

Number of duplicates: 0
Missing values before imputation:
Series([], dtype: int64)
Missing values after imputation:
Series([], dtype: int64)


In [None]:
# Merge with IP-to-country data
fraud_data = merge_ip_country_data(fraud_data, ip_country)

# Make a failed lookup visible instead of letting it pass silently:
# report how many rows ended up with the 'Unknown' placeholder country.
is_unknown = fraud_data['country'] == 'Unknown'
print(f"Rows with country == 'Unknown': {is_unknown.sum()} of {len(fraud_data)} ({is_unknown.mean():.1%})")
if len(fraud_data) > 0 and is_unknown.all():
    # NOTE(review): ip_address is float-typed in the preview; it may need to be
    # converted to an integer (convert_ip_to_int is imported but never called)
    # before the range lookup can match — TODO confirm against
    # merge_ip_country_data.
    print(
        "WARNING: no IP address was mapped to a country. "
        "The 'country' column is constant and carries no information; "
        "check the ip_address dtype and the IP range bounds before continuing."
    )

# Check the merged data
fraud_data[['ip_address', 'country']].head()

Unnamed: 0,ip_address,country
0,732758400.0,Unknown
1,350311400.0,Unknown
2,2621474000.0,Unknown
3,3840542000.0,Unknown
4,415583100.0,Unknown


In [10]:
# Preview the first 100 rows of the merged frame
fraud_data.iloc[:100]

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,22058.0,2015-02-24 22:55:49,2015-04-18 02:47:11,34.0,QVPSPJUOCKZAR,SEO,Chrome,M,39.0,7.327584e+08,0.0,Unknown
1,333320.0,2015-06-07 20:39:50,2015-06-08 01:38:54,16.0,EOGFQPIZPYXFZ,Ads,Chrome,F,53.0,3.503114e+08,0.0,Unknown
2,1359.0,2015-01-01 18:52:44,2015-01-01 18:52:45,15.0,YSSKYOSJHPPLJ,SEO,Opera,M,53.0,2.621474e+09,1.0,Unknown
3,150084.0,2015-04-28 21:13:25,2015-05-04 13:54:50,44.0,ATGTXKYKUDUQN,SEO,Safari,M,41.0,3.840542e+09,0.0,Unknown
4,221365.0,2015-07-21 07:09:52,2015-09-09 18:40:53,39.0,NAUITBZFJKHWW,Ads,Safari,M,45.0,4.155831e+08,0.0,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...
95,10055.0,2015-04-06 23:14:54,2015-05-07 15:21:49,47.0,KDSEIVWEOCIEM,SEO,Chrome,F,36.0,2.755626e+09,0.0,Unknown
96,244618.0,2015-06-11 12:40:52,2015-08-30 08:09:07,81.0,MSPMJUTDCXZSC,SEO,FireFox,M,20.0,3.014511e+09,0.0,Unknown
97,50280.0,2015-05-24 21:16:35,2015-06-01 19:00:06,21.0,IUZMJJBRDCGBZ,Ads,Chrome,M,24.0,4.283641e+09,0.0,Unknown
98,56338.0,2015-03-18 03:36:41,2015-07-11 08:59:24,53.0,KZZDZXHXXLQFD,SEO,Chrome,M,24.0,4.227828e+09,0.0,Unknown


In [None]:
# Derive the time-based features first, then the transaction features
# on top of the result.
fraud_data = add_transaction_features(add_time_features(fraud_data))

# Inspect the first rows with the newly added columns
fraud_data.head()

In [None]:
# Encode categorical features
# NOTE(review): every row in the merge preview above has country == 'Unknown';
# if that holds for the whole frame, encoding 'country' yields a constant
# feature — confirm the IP merge works before relying on it.
categorical_cols = ['source', 'browser', 'sex', 'country']
fraud_data = encode_categorical_features(fraud_data, categorical_cols)

# Check the encoded features
fraud_data.head()

In [None]:
# Identifier and raw timestamp columns that are removed before scaling
cols_to_drop = ['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time']
fraud_data = fraud_data.drop(columns=cols_to_drop)

# Inspect the remaining columns
fraud_data.head()

In [None]:
# Scale numerical features; the second return value is kept so it can be
# saved together with the data splits at the end of the notebook.
# NOTE(review): this runs on the full frame, before the train/test split
# further down. If scale_features fits its statistics on the frame it
# receives, test rows influence the scaling (data leakage) — TODO confirm and
# consider fitting on the training split only. Also verify that the 'class'
# target column is left unscaled.
fraud_data, scaler_fraud = scale_features(fraud_data)

# Check the scaled features
fraud_data.describe()

In [None]:
# Save the preprocessed (encoded and scaled) frame as CSV, without the index.
# NOTE(review): this is written directly under ../data/, while the split
# artifacts saved later go to ../data/processed/ — confirm this is intended.
fraud_data.to_csv('../data/fraud_data_preprocessed.csv', index=False)

## 3. Preprocess creditcard.csv

In [None]:
# Same two-step preparation as for Fraud_Data: run the cleaning pass,
# then handle the remaining missing values.
creditcard = handle_missing_values(clean_data(creditcard))

In [None]:
# Derive the time-based features first, then the amount-based features
# on top of the result.
creditcard = add_amount_features(add_time_features(creditcard))

# Inspect the first rows with the newly added columns
creditcard.head()

In [None]:
# Scale numerical features; the second return value is kept so it can be
# saved together with the data splits at the end of the notebook.
# NOTE(review): this runs on the full frame, before the train/test split
# further down. If scale_features fits its statistics on the frame it
# receives, test rows influence the scaling (data leakage) — TODO confirm and
# consider fitting on the training split only. Also verify that the 'Class'
# target column is left unscaled.
creditcard, scaler_cc = scale_features(creditcard)

# Check the scaled features
creditcard.describe()

In [None]:
# Save the preprocessed (feature-augmented and scaled) frame as CSV, without the index.
# NOTE(review): this is written directly under ../data/, while the split
# artifacts saved later go to ../data/processed/ — confirm this is intended.
creditcard.to_csv('../data/creditcard_preprocessed.csv', index=False)

## 4. Prepare Data for Modeling

In [None]:
# Separate predictors from the target for each dataset.
# The target column is 'class' in Fraud_Data and 'Class' in creditcard.
X_fraud, y_fraud = fraud_data.drop(columns='class'), fraud_data['class']
X_cc, y_cc = creditcard.drop(columns='Class'), creditcard['Class']

# Report the resulting dimensions
print(f"Fraud_Data: X shape = {X_fraud.shape}, y shape = {y_fraud.shape}")
print(f"Creditcard: X shape = {X_cc.shape}, y shape = {y_cc.shape}")

In [None]:
# Split the data into training and testing sets.
# train_test_split is imported in the notebook's import cell.

# Shared split settings, named once so both datasets are split identically
TEST_SIZE = 0.2      # fraction of rows held out for testing
RANDOM_STATE = 42    # fixed seed so the split is reproducible

# Fraud_Data — stratified on the target so both splits keep the class ratio
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_fraud
)

# Creditcard — same settings, stratified on its own target
X_train_cc, X_test_cc, y_train_cc, y_test_cc = train_test_split(
    X_cc, y_cc, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_cc
)

# Print shapes
print(f"Fraud_Data: X_train shape = {X_train_fraud.shape}, X_test shape = {X_test_fraud.shape}")
print(f"Creditcard: X_train shape = {X_train_cc.shape}, X_test shape = {X_test_cc.shape}")

In [None]:
# Requested sampling_strategy for SMOTE (passed through to handle_class_imbalance)
SMOTE_TARGET_RATIO = 0.1


def resample_training_set(X, y, target_ratio=SMOTE_TARGET_RATIO):
    """Oversample the training split with SMOTE, unless the target is already met.

    NOTE(review): assumes handle_class_imbalance forwards `sampling_strategy`
    to imbalanced-learn's SMOTE, where a float is the desired
    minority/majority ratio after resampling — TODO confirm. Under that
    assumption SMOTE raises ValueError when the data already meets or exceeds
    the requested ratio, so the call is skipped in that case.

    Parameters
    ----------
    X : training features.
    y : training labels (must support ``value_counts()``).
    target_ratio : float passed as ``sampling_strategy``.

    Returns
    -------
    (X_resampled, y_resampled); the inputs are returned unchanged when no
    synthetic sample would be generated.
    """
    class_counts = y.value_counts()
    n_majority, n_minority = class_counts.max(), class_counts.min()
    # Number of synthetic minority rows needed to reach the target ratio
    n_needed = int(n_majority * target_ratio) - n_minority
    if n_needed <= 0:
        print(
            f"Skipping SMOTE: minority/majority ratio is already "
            f"{n_minority / n_majority:.4f} (target {target_ratio})."
        )
        return X, y
    return handle_class_imbalance(
        X, y, method='smote', sampling_strategy=target_ratio
    )


# Handle class imbalance for Fraud_Data (training split only)
X_train_fraud_resampled, y_train_fraud_resampled = resample_training_set(
    X_train_fraud, y_train_fraud
)

# Handle class imbalance for Creditcard (training split only)
X_train_cc_resampled, y_train_cc_resampled = resample_training_set(
    X_train_cc, y_train_cc
)

In [None]:
# Save the train-test split data, the resampled training sets and the scalers.
# joblib is imported in the notebook's import cell.

# Directory that receives every serialized artifact; created if missing
PROCESSED_DIR = '../data/processed'
os.makedirs(PROCESSED_DIR, exist_ok=True)

# File stem -> object. Each entry is written to <PROCESSED_DIR>/<stem>.pkl,
# in the order listed here.
artifacts = {
    # Fraud_Data splits
    'X_train_fraud': X_train_fraud,
    'X_test_fraud': X_test_fraud,
    'y_train_fraud': y_train_fraud,
    'y_test_fraud': y_test_fraud,
    'X_train_fraud_resampled': X_train_fraud_resampled,
    'y_train_fraud_resampled': y_train_fraud_resampled,
    # Creditcard splits
    'X_train_cc': X_train_cc,
    'X_test_cc': X_test_cc,
    'y_train_cc': y_train_cc,
    'y_test_cc': y_test_cc,
    'X_train_cc_resampled': X_train_cc_resampled,
    'y_train_cc_resampled': y_train_cc_resampled,
    # Scalers
    'scaler_fraud': scaler_fraud,
    'scaler_cc': scaler_cc,
}

for stem, obj in artifacts.items():
    joblib.dump(obj, f'{PROCESSED_DIR}/{stem}.pkl')

print("Data splits saved successfully.")

## 5. Summary of Preprocessing Steps

### Fraud_Data.csv
1. Cleaned data by removing duplicates and correcting data types
2. Handled missing values using imputation
3. Merged with IP-to-country data for geolocation analysis
4. Added time-based features (hour_of_day, day_of_week, time_since_signup)
5. Added transaction features (user_transaction_count, time_since_last_transaction, etc.)
6. Encoded categorical features (source, browser, sex, country)
7. Dropped unnecessary columns (user_id, device_id, ip_address, signup_time, purchase_time)
8. Scaled numerical features
9. Split data into training and testing sets
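10. Handled class imbalance with SMOTE (`sampling_strategy=0.1`), resampling the training split only so the test set keeps its original class distribution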

### creditcard.csv
1. Cleaned data by removing duplicates and correcting data types
2. Handled missing values using imputation
3. Added time-based features
4. Added amount-based features
5. Scaled numerical features
6. Split data into training and testing sets
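7. Handled class imbalance with SMOTE (`sampling_strategy=0.1`), resampling the training split only so the test set keeps its original class distribution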