In [8]:
import pandas as pd
import numpy as np

# Read the raw fraud transaction records from disk into a DataFrame
fraud_data = pd.read_csv(filepath_or_buffer='data/raw/Fraud_Data.csv')

In [9]:
# Parse both timestamp columns so they support datetime accessors and arithmetic
for time_col in ('signup_time', 'purchase_time'):
    fraud_data[time_col] = pd.to_datetime(fraud_data[time_col])

In [11]:
# Basic cleaning: remove exact duplicate rows, then impute missing values.
fraud_data.drop_duplicates(inplace=True)

# Missing ages are replaced by the median age of the de-duplicated rows.
age_median = fraud_data['age'].median()
fraud_data['age'] = fraud_data['age'].fillna(age_median)

# Missing categorical values get an explicit 'Unknown' level.
for cat_col in ('browser', 'source'):
    fraud_data[cat_col] = fraud_data[cat_col].fillna('Unknown')


In [17]:
# Calendar features taken from the purchase timestamp.
purchase_ts = fraud_data['purchase_time']
fraud_data['hour_of_day'] = purchase_ts.dt.hour
fraud_data['day_of_week'] = purchase_ts.dt.dayofweek

# Elapsed time between signup and purchase, expressed in hours
# (total seconds divided by 3600).
signup_to_purchase = purchase_ts - fraud_data['signup_time']
fraud_data['time_since_signup'] = signup_to_purchase.dt.total_seconds() / 3600


In [None]:
# Order rows by user, then chronologically by purchase time within each user.
fraud_data = fraud_data.sort_values(by=['user_id', 'purchase_time'])

def transactions_last_24h(group):
    """Count, for every row, the purchases in the 24 hours up to that row.

    For each timestamp ``x`` in ``group['purchase_time']`` the result is the
    number of timestamps ``t`` in the same group with
    ``x - 24h <= t <= x`` (both ends inclusive, so the row itself and any
    rows with an identical timestamp are counted).

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to evaluate together; only the ``'purchase_time'`` datetime
        column is read. The rows do not need to be sorted.

    Returns
    -------
    pandas.Series
        Integer counts with the same index (and name) as
        ``group['purchase_time']``. Rows whose timestamp is missing (NaT)
        get 0 and are never counted in another row's window.
    """
    times = group['purchase_time']
    window = pd.Timedelta(hours=24)

    # Missing timestamps never satisfy either comparison, so they are left
    # out of the search and keep a count of 0.
    valid = times.notna().to_numpy()
    observed = times[valid]

    # Sort once, then answer every window with two binary searches instead of
    # scanning the whole group per row (O(n log n) rather than O(n^2)).
    ordered = np.sort(observed.to_numpy())
    # Number of timestamps <= x.
    upper = np.searchsorted(ordered, observed.to_numpy(), side='right')
    # Number of timestamps strictly before the window start x - 24h.
    lower = np.searchsorted(ordered, (observed - window).to_numpy(), side='left')

    counts = np.zeros(len(times), dtype='int64')
    counts[valid] = upper - lower
    return pd.Series(counts, index=times.index, name=times.name)

# Per-user rolling activity: number of the user's purchases in the 24 hours
# up to and including each purchase.
# Only the 'purchase_time' column is handed to the helper: this avoids copying
# every column for every group and avoids applying over the grouping column,
# which newer pandas versions deprecate. group_keys=False keeps the original
# row index so the result aligns on assignment.
fraud_data['transactions_last_24h'] = (
    fraud_data.groupby('user_id', group_keys=False)[['purchase_time']]
    .apply(transactions_last_24h)
)


In [None]:
# Per-user distinct counts, broadcast back onto every row of that user:
#   devices_per_user -> number of distinct device_id values
#   ips_per_user     -> number of distinct ip_address values
for feature_name, source_col in (
    ('devices_per_user', 'device_id'),
    ('ips_per_user', 'ip_address'),
):
    fraud_data[feature_name] = (
        fraud_data.groupby('user_id')[source_col].transform('nunique')
    )


In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric columns to standardise.
num_cols = [
    'purchase_value',
    'age',
    'time_since_signup',
    'transactions_last_24h',
    'devices_per_user',
    'ips_per_user',
]

# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in this notebook, so test rows contribute
# to the scaling statistics. Consider fitting on the training rows only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(fraud_data[num_cols])
fraud_data[num_cols] = scaled_values

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column.
categorical_cols = ['browser', 'source', 'sex']
fraud_data = pd.get_dummies(
    data=fraud_data,
    columns=categorical_cols,
    drop_first=True,
)


In [None]:
from sklearn.model_selection import train_test_split

import os

# Feature matrix: everything except the label, the identifier columns and the
# raw timestamp columns.
X = fraud_data.drop(
    ['class', 'user_id', 'device_id', 'signup_time', 'purchase_time', 'ip_address'],
    axis=1
)
y = fraud_data['class']

# 80/20 split, stratified on the label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Re-attach the label (aligned on the row index) so each partition is written
# as a single self-contained file.
train_df = X_train.copy()
train_df['class'] = y_train

test_df = X_test.copy()
test_df['class'] = y_test

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('data/processed', exist_ok=True)
train_df.to_csv('data/processed/fraud_train.csv', index=False)
test_df.to_csv('data/processed/fraud_test.csv', index=False)

# The original message was a mis-decoded UTF-8 check mark; restored here.
print("✅ Processed fraud data saved")
