## Feature Engineering

### 1. Fraud Data Feature Engineering

In [1]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
from src.data_loader import load_fraud_data, remove_duplicates
from src.features import (
    add_time_features_fraud,
    scale_numeric,
    encode_categorical
)
from pathlib import Path

# Enriched fraud data (time features were already added during EDA).
# Both timestamp columns are parsed into datetimes while reading.
data_path = Path("../data/processed")
fraud_df = pd.read_csv(
    data_path / "fraud_data_enriched.csv",
    parse_dates=["signup_time", "purchase_time"],
)
print("Shape:", fraud_df.shape)
fraud_df.head()

Shape: (151112, 17)


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_int,country,purchase_hour,purchase_dayofweek,time_since_signup_hours,time_since_signup_bin
0,62421,2015-02-16 00:17:05,2015-03-08 10:00:39,46,ZCLZTAJPCRAQX,Direct,Safari,M,36,52093.496895,0,52093,,10,6,489.726111,"(287.999, 575.999]"
1,173212,2015-03-08 04:03:22,2015-03-20 17:23:45,33,YFGYOALADBHLT,Ads,IE,F,30,93447.138961,0,93447,,17,4,301.339722,"(287.999, 575.999]"
2,242286,2015-05-17 16:45:54,2015-05-26 08:54:34,33,QZNVQTUITFTHH,Direct,FireFox,F,32,105818.501505,0,105818,,8,1,208.144444,"(-2.88, 287.999]"
3,370003,2015-03-03 19:58:39,2015-05-28 21:09:13,33,PIBUQMBIELMMG,Ads,IE,M,40,117566.664867,0,117566,,21,3,2065.176111,"(2015.995, 2303.994]"
4,119824,2015-03-20 00:31:27,2015-04-05 07:31:46,55,WFIIFCPIOGMHT,Ads,Safari,M,38,131423.789042,0,131423,,7,6,391.005278,"(287.999, 575.999]"


In [2]:
# Show every column name available before features are selected
print("Columns:", list(fraud_df.columns))

Columns: ['user_id', 'signup_time', 'purchase_time', 'purchase_value', 'device_id', 'source', 'browser', 'sex', 'age', 'ip_address', 'class', 'ip_int', 'country', 'purchase_hour', 'purchase_dayofweek', 'time_since_signup_hours', 'time_since_signup_bin']


In [3]:
# Continuous columns that are handed to scale_numeric
numeric_cols = [
    "purchase_value",
    "age",
    "time_since_signup_hours",
]

# Categorical columns that are handed to encode_categorical
cat_cols = [
    "source",
    "browser",
    "sex",
    "country",
]

# Rows with a missing country are kept: the gap becomes an explicit
# "Unknown" category instead of being dropped.
fraud_df["country"] = fraud_df["country"].fillna("Unknown")

In [4]:
# Scale the columns listed in numeric_cols. scale_numeric (src.features)
# returns a pair: the transformed frame and a second object kept as `scaler`
# (presumably the fitted scaler, for reuse at inference time — confirm).
# NOTE(review): the helper is given the whole frame, so any statistics it
# learns would come from all rows, before a train/test split — confirm this
# does not leak test information into training.
fraud_scaled, scaler = scale_numeric(fraud_df, numeric_cols)

# Encode the columns listed in cat_cols on top of the scaled frame; the
# helper likewise returns the new frame plus an object kept as `encoder`.
# NOTE(review): the recorded output shows source_Direct / source_SEO but no
# source_Ads, so one level per feature appears to be dropped — verify in
# src.features.
fraud_encoded, encoder = encode_categorical(fraud_scaled, cat_cols)

# Sanity check: the column count grows because each categorical column is
# replaced by several indicator columns.
print("Shape after encoding:", fraud_encoded.shape)
fraud_encoded.head()

Shape after encoding: (151112, 201)


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,age,ip_address,class,ip_int,purchase_hour,...,country_Unknown,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Viet Nam,country_Virgin Islands (U.S.),country_Yemen,country_Zambia,country_Zimbabwe
0,62421,2015-02-16 00:17:05,2015-03-08 10:00:39,0.494721,ZCLZTAJPCRAQX,0.331793,52093.496895,0,52093,10,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,173212,2015-03-08 04:03:22,2015-03-20 17:23:45,-0.214781,YFGYOALADBHLT,-0.364448,93447.138961,0,93447,17,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,242286,2015-05-17 16:45:54,2015-05-26 08:54:34,-0.214781,QZNVQTUITFTHH,-0.132367,105818.501505,0,105818,8,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,370003,2015-03-03 19:58:39,2015-05-28 21:09:13,-0.214781,PIBUQMBIELMMG,0.795954,117566.664867,0,117566,21,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,119824,2015-03-20 00:31:27,2015-04-05 07:31:46,0.985915,WFIIFCPIOGMHT,0.563874,131423.789042,0,131423,7,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Columns excluded from the final table: high-cardinality values
# (device_id, ip_address, ip_int), the binned duplicate of
# time_since_signup_hours, and the raw timestamps.
# NOTE(review): user_id is an identifier too but is still kept in the output;
# confirm downstream code removes it before fitting a model.
drop_cols = [
    "device_id",
    "ip_address",
    "ip_int",
    "time_since_signup_bin",
    "signup_time",
    "purchase_time",
]

# errors="ignore" keeps the drop best-effort, but on its own it would hide a
# misspelled name or an upstream schema change, so report anything missing.
missing_cols = [col for col in drop_cols if col not in fraud_encoded.columns]
if missing_cols:
    print("Warning: columns listed in drop_cols were not found:", missing_cols)

fraud_final = fraud_encoded.drop(columns=drop_cols, errors="ignore")
print("Final shape:", fraud_final.shape)
fraud_final.head()

Final shape: (151112, 195)


Unnamed: 0,user_id,purchase_value,age,class,purchase_hour,purchase_dayofweek,time_since_signup_hours,source_Direct,source_SEO,browser_FireFox,...,country_Unknown,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Viet Nam,country_Virgin Islands (U.S.),country_Yemen,country_Zambia,country_Zimbabwe
0,62421,0.494721,0.331793,0,10,6,-1.013679,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,173212,-0.214781,-0.364448,0,17,4,-1.230613,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,242286,-0.214781,-0.132367,0,8,1,-1.337931,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,370003,-0.214781,0.795954,0,21,3,0.800513,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,119824,0.985915,0.563874,0,7,6,-1.127359,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Persist the final feature table under data_path; the row index is not written
processed_path = data_path / "fraud_data_processed.csv"
fraud_final.to_csv(processed_path, index=False)