In [None]:
import pandas as pd
import numpy as np
import ipaddress
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE


In [None]:
# Load the Data: the fraud transactions and the IP-range lookup table
fraud_path = "../data/raw/Fraud_Data.csv"
ip_lookup_path = "../data/raw/IpAddress_to_Country.csv"

fraud_df = pd.read_csv(fraud_path)
ip_df = pd.read_csv(ip_lookup_path)

# Preview the first rows of the transactions frame
fraud_df.head()


In [None]:
# Drop exact duplicate rows, then coerce the timestamp and age columns
fraud_df = fraud_df.drop_duplicates().assign(
    signup_time=lambda frame: pd.to_datetime(frame['signup_time']),
    purchase_time=lambda frame: pd.to_datetime(frame['purchase_time']),
    # errors='coerce' turns unparseable ages into NaN instead of raising
    age=lambda frame: pd.to_numeric(frame['age'], errors='coerce'),
)

# Summarise dtypes and non-null counts after the conversion
fraud_df.info()


In [None]:
# Missing values: label absent 'sex' as 'Unknown', then discard rows that
# lack either 'age' or 'purchase_value'
fraud_df = (
    fraud_df
    .assign(sex=fraud_df['sex'].fillna('Unknown'))
    .dropna(subset=['age', 'purchase_value'])
)

# Remaining null count per column
fraud_df.isnull().sum()


In [None]:
# Convert IPs to Integers
def ip_to_int(ip_str):
    """Convert an IPv4 address to its integer value.

    Parameters
    ----------
    ip_str : str, int, float, bytes or ipaddress.IPv4Address
        * dotted-quad string (``"192.168.0.1"``), surrounding whitespace ignored;
        * integer, packed bytes or ``IPv4Address`` (handled by ``IPv4Address``);
        * float, or a string holding a plain number (``"732758368.79972"``):
          the fractional part is truncated and the result is treated as the
          integer form of the address.

    Returns
    -------
    int
        The address as an integer in ``[0, 2**32 - 1]``.

    Raises
    ------
    ipaddress.AddressValueError
        If the value is neither a valid dotted quad nor a number, or the
        number is outside the IPv4 range.
    ValueError / OverflowError
        If a float input is NaN / infinite (raised by ``int()``).
    """
    if isinstance(ip_str, str):
        text = ip_str.strip()
        try:
            return int(ipaddress.IPv4Address(text))
        except ipaddress.AddressValueError as dotted_quad_error:
            # Not a dotted quad: accept a plain numeric string, otherwise
            # surface the original parsing error unchanged.
            try:
                ip_str = float(text)
            except ValueError:
                raise dotted_quad_error from None

    if isinstance(ip_str, float):
        # IPv4Address would stringify a float and reject it ("Expected 4
        # octets"), so truncate to the integer address first.
        ip_str = int(ip_str)

    # Still routed through IPv4Address so out-of-range integers are rejected.
    return int(ipaddress.IPv4Address(ip_str))

# Integer form of each transaction's IP, kept in a new column
fraud_df['ip_int'] = fraud_df['ip_address'].apply(ip_to_int)

# Both range bounds of the lookup table are converted in place
for bound_col in ('lower_bound_ip_address', 'upper_bound_ip_address'):
    ip_df[bound_col] = ip_df[bound_col].apply(ip_to_int)


In [None]:
# Map IP to Country
def map_ip_to_country(ip):
    """Return the country whose range in ``ip_df`` contains ``ip``.

    Scalar lookup: scans every row of ``ip_df`` and returns the country of the
    first row with ``lower_bound_ip_address <= ip <= upper_bound_ip_address``,
    or ``'Unknown'`` when no row matches.
    """
    match = ip_df[(ip_df['lower_bound_ip_address'] <= ip) & (ip_df['upper_bound_ip_address'] >= ip)]
    return match['country'].values[0] if not match.empty else 'Unknown'


def map_ips_to_countries(ips):
    """Vectorized equivalent of ``ips.apply(map_ip_to_country)``.

    Parameters
    ----------
    ips : pandas.Series
        Integer IP values to look up.

    Returns
    -------
    pandas.Series
        Country per IP (``'Unknown'`` when no range matches), aligned to
        ``ips.index``.

    Notes
    -----
    Scanning ``ip_df`` once per transaction costs O(rows x ranges). When the
    ranges do not overlap, a binary search over the sorted lower bounds finds
    the only candidate range for each IP. If any ranges overlap, "first match
    in table order" is ambiguous under sorting, so the row-wise scan is used
    to keep results identical.
    """
    ranges = ip_df.sort_values('lower_bound_ip_address', kind='stable')
    lower = ranges['lower_bound_ip_address'].to_numpy()
    upper = ranges['upper_bound_ip_address'].to_numpy()
    countries = ranges['country'].to_numpy()

    if len(lower) == 0:
        # No ranges at all: nothing can match.
        return pd.Series('Unknown', index=ips.index, dtype=object)

    if not (lower[1:] > upper[:-1]).all():
        # Overlapping ranges: fall back to the exact row-wise scan.
        return ips.apply(map_ip_to_country)

    ip_values = ips.to_numpy()
    # Index of the last range whose lower bound is <= ip (-1 if none).
    candidate = np.searchsorted(lower, ip_values, side='right') - 1
    safe_candidate = np.clip(candidate, 0, None)
    inside = (candidate >= 0) & (ip_values <= upper[safe_candidate])
    return pd.Series(np.where(inside, countries[safe_candidate], 'Unknown'), index=ips.index)


fraud_df['country'] = map_ips_to_countries(fraud_df['ip_int'])


In [None]:
# Time-based features derived from the signup and purchase timestamps
purchase_ts = fraud_df['purchase_time']
signup_to_purchase = purchase_ts - fraud_df['signup_time']

# Seconds elapsed between signing up and purchasing
fraud_df['time_since_signup'] = signup_to_purchase.dt.total_seconds()
# Hour (0-23) and weekday (pandas convention: Monday=0) of the purchase
fraud_df['hour_of_day'] = purchase_ts.dt.hour
fraud_df['day_of_week'] = purchase_ts.dt.dayofweek


In [None]:
# Frequency-based features: number of rows sharing the same user / device
count_features = (
    ('user_id', 'user_transaction_count'),
    ('device_id', 'device_transaction_count'),
)
for key_col, count_col in count_features:
    # transform('count') broadcasts each group's size back onto its rows
    fraud_df[count_col] = fraud_df.groupby(key_col)[key_col].transform('count')


In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the dummies are not perfectly collinear
categorical_cols = ['source', 'browser', 'sex', 'country']
fraud_df = pd.get_dummies(fraud_df, columns=categorical_cols, drop_first=True)


In [None]:
# Scaling Numeric Features
scaler = StandardScaler()

scale_cols = ['purchase_value', 'age', 'time_since_signup', 'hour_of_day',
              'day_of_week', 'user_transaction_count', 'device_transaction_count']

# Fit the scaler on the training rows only, so that test-row statistics do not
# leak into preprocessing. The split is computed on row positions with the
# same stratify / test_size / random_state as the "Train-Test Split" cell
# below; because the split depends only on the row count, the labels and those
# parameters, it selects exactly the rows that later become X_train.
# Keep the two calls in sync if the split parameters change.
row_positions = np.arange(len(fraud_df))
train_positions, _ = train_test_split(
    row_positions, stratify=fraud_df['class'], test_size=0.3, random_state=42
)
scaler.fit(fraud_df.iloc[train_positions][scale_cols])

# Apply the train-fitted mean/std to every row (train and test alike)
fraud_df[scale_cols] = scaler.transform(fraud_df[scale_cols])


In [None]:
# Train-Test Split
# Columns excluded from the feature matrix: the target, raw identifiers, raw
# timestamps (already summarised by the time-based features) and the raw IP
# columns (already summarised by the country dummies).
# 'device_id' is dropped alongside 'user_id': it is an identifier whose signal
# is already captured by 'device_transaction_count'.
# NOTE(review): if device_id is a string column, leaving it in X would also
# make SMOTE fail on non-numeric input -- confirm against the raw data.
non_feature_cols = ['class', 'user_id', 'device_id', 'signup_time',
                    'purchase_time', 'ip_address', 'ip_int']
X = fraud_df.drop(columns=non_feature_cols)
y = fraud_df['class']

# Stratified 70/30 split keeps the class ratio identical in both partitions
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)


In [None]:
# Handle Class Imbalance with SMOTE
# Only the training partition is resampled; X_test / y_test are left as-is.
print("Before SMOTE:", y_train.value_counts())

smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair

print("After SMOTE:", y_train_resampled.value_counts())


: 