# Feature Engineering
This notebook documents the feature engineering process for the Fraud Detection project, focusing on the E-commerce dataset.

In [None]:
import pandas as pd
import numpy as np
import os

# Inputs: the cleaned transaction table and the raw IP-range -> country lookup.
fraud_df = pd.read_csv('../data/processed/Fraud_Data_cleaned.csv')
ip_df = pd.read_csv('../data/raw/IpAddress_to_Country.csv')

# Quick sanity check: report (rows, columns) of each input, one per line.
print(
    f"Fraud Data Shape: {fraud_df.shape}",
    f"IP Mapping Shape: {ip_df.shape}",
    sep="\n",
)


## 1. Geolocation Integration
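Map each transaction's IP address to a country with a range-based lookup: an IP matches a row of the mapping table when it lies between that row's lower and upper bound, and IPs that fall outside every range are labeled `Unknown`.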

In [None]:
# Cast the transaction IPs and both range bounds to int64 for range matching,
# then order each frame by its merge key (merge_asof needs sorted inputs).
fraud_df = (
    fraud_df
    .astype({'ip_address': np.int64})
    .sort_values('ip_address')
)
ip_df = (
    ip_df
    .astype({
        'lower_bound_ip_address': np.int64,
        'upper_bound_ip_address': np.int64,
    })
    .sort_values('lower_bound_ip_address')
)

# Backward as-of join: every transaction picks up the last range whose
# lower bound is <= its IP address.
df_merged = pd.merge_asof(
    fraud_df,
    ip_df,
    left_on='ip_address',
    right_on='lower_bound_ip_address',
    direction='backward',
)

# The as-of join never looks at the upper bound, so keep the matched country
# only when the IP is also at or below it; every other row becomes 'Unknown'
# (this includes rows with no match, whose upper bound is missing).
df_merged['country'] = np.where(
    df_merged['ip_address'].le(df_merged['upper_bound_ip_address']),
    df_merged['country'],
    'Unknown',
)

# The range bounds were only needed for the lookup itself.
df_merged = df_merged.drop(
    columns=['lower_bound_ip_address', 'upper_bound_ip_address']
)
df_merged.head()

## 2. Feature Extraction
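Adding time-based features (seconds between signup and purchase, purchase hour of day, purchase day of week) and transaction frequency counts per user and per device.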

In [None]:
# Build every derived column in one ordered chain. `assign` evaluates the
# keyword arguments in order, so later entries can read columns produced by
# earlier ones (the parsed timestamps in particular).
df_merged = df_merged.assign(
    # Parse both timestamp columns to datetimes.
    signup_time=lambda d: pd.to_datetime(d['signup_time']),
    purchase_time=lambda d: pd.to_datetime(d['purchase_time']),
    # 2.1 Seconds elapsed between signup and purchase.
    time_since_signup=lambda d: d['purchase_time'].sub(d['signup_time']).dt.total_seconds(),
    # 2.2 Hour of day and day of week of the purchase.
    hour_of_day=lambda d: d['purchase_time'].dt.hour,
    day_of_week=lambda d: d['purchase_time'].dt.dayofweek,
    # 2.3 Transaction frequency: number of rows sharing the same user / device.
    user_id_count=lambda d: d.groupby('user_id')['user_id'].transform('count'),
    device_id_count=lambda d: d.groupby('device_id')['device_id'].transform('count'),
)

df_merged.head()

## 3. Save Processed Data
Saving the output for the transformation step.

In [None]:
# Make sure the output folder is present (no error if it already exists),
# then write the engineered features without the DataFrame index column.
os.makedirs('../data/processed', exist_ok=True)
df_merged.to_csv(
    path_or_buf='../data/processed/Fraud_Data_features.ipynb_output.csv',
    index=False,
)
print("Feature engineering complete. Data saved.")