## Feature Engineering

### 1. Fraud Data Feature Engineering

In [1]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
from src.data_loader import load_fraud_data, remove_duplicates
from src.features import (
    add_time_features_fraud,
    scale_numeric,
    encode_categorical
)
from pathlib import Path

# Read the enriched fraud table produced by the EDA step (it already carries
# the time-derived columns), parsing both timestamp columns as datetimes.
data_path = Path("../data/processed")
enriched_csv = data_path / "fraud_data_enriched.csv"
timestamp_cols = ["signup_time", "purchase_time"]
fraud_df = pd.read_csv(enriched_csv, parse_dates=timestamp_cols)

# Report the table size, then preview the first rows as the cell output.
print("Shape:", fraud_df.shape)
fraud_df.head()

Shape: (151112, 17)


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_int,country,purchase_hour,purchase_dayofweek,time_since_signup_hours,time_since_signup_bin
0,62421,2015-02-16 00:17:05,2015-03-08 10:00:39,46,ZCLZTAJPCRAQX,Direct,Safari,M,36,52093.496895,0,52093,,10,6,489.726111,"(287.999, 575.999]"
1,173212,2015-03-08 04:03:22,2015-03-20 17:23:45,33,YFGYOALADBHLT,Ads,IE,F,30,93447.138961,0,93447,,17,4,301.339722,"(287.999, 575.999]"
2,242286,2015-05-17 16:45:54,2015-05-26 08:54:34,33,QZNVQTUITFTHH,Direct,FireFox,F,32,105818.501505,0,105818,,8,1,208.144444,"(-2.88, 287.999]"
3,370003,2015-03-03 19:58:39,2015-05-28 21:09:13,33,PIBUQMBIELMMG,Ads,IE,M,40,117566.664867,0,117566,,21,3,2065.176111,"(2015.995, 2303.994]"
4,119824,2015-03-20 00:31:27,2015-04-05 07:31:46,55,WFIIFCPIOGMHT,Ads,Safari,M,38,131423.789042,0,131423,,7,6,391.005278,"(287.999, 575.999]"


In [2]:
# List every column name so the feature groups chosen below can be checked
# against what is actually present in the frame.
fraud_column_names = fraud_df.columns.tolist()
print("Columns:", fraud_column_names)

Columns: ['user_id', 'signup_time', 'purchase_time', 'purchase_value', 'device_id', 'source', 'browser', 'sex', 'age', 'ip_address', 'class', 'ip_int', 'country', 'purchase_hour', 'purchase_dayofweek', 'time_since_signup_hours', 'time_since_signup_bin']


In [3]:
# Continuous columns that are handed to scale_numeric.
numeric_cols = ['purchase_value', 'age', 'time_since_signup_hours']

# Nominal columns that are handed to encode_categorical (one-hot encoding).
cat_cols = ['source', 'browser', 'sex', 'country']

# Rows with a missing country are kept, not dropped: the gap is replaced by an
# explicit 'Unknown' label so that it becomes its own category when encoded.
fraud_df = fraud_df.assign(country=fraud_df['country'].fillna('Unknown'))

In [4]:
# Step 1: rescale the continuous columns. scale_numeric hands back the
# transformed frame together with a second object (presumably the fitted
# scaler -- confirm in src.features).
# NOTE(review): both transformers are fit on the full table here; no
# train/test split is visible in this notebook -- confirm this does not leak
# test-set statistics into model evaluation downstream.
fraud_scaled, scaler = scale_numeric(fraud_df, numeric_cols)

# Step 2: expand the categorical columns into indicator columns, starting from
# the already-scaled frame.
fraud_encoded, encoder = encode_categorical(fraud_scaled, cat_cols)

# Report the widened shape, then preview the first rows as the cell output.
encoded_shape = fraud_encoded.shape
print("Shape after encoding:", encoded_shape)
fraud_encoded.head()

Shape after encoding: (151112, 201)


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,age,ip_address,class,ip_int,purchase_hour,...,country_Unknown,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Viet Nam,country_Virgin Islands (U.S.),country_Yemen,country_Zambia,country_Zimbabwe
0,62421,2015-02-16 00:17:05,2015-03-08 10:00:39,0.494721,ZCLZTAJPCRAQX,0.331793,52093.496895,0,52093,10,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,173212,2015-03-08 04:03:22,2015-03-20 17:23:45,-0.214781,YFGYOALADBHLT,-0.364448,93447.138961,0,93447,17,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,242286,2015-05-17 16:45:54,2015-05-26 08:54:34,-0.214781,QZNVQTUITFTHH,-0.132367,105818.501505,0,105818,8,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,370003,2015-03-03 19:58:39,2015-05-28 21:09:13,-0.214781,PIBUQMBIELMMG,0.795954,117566.664867,0,117566,21,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,119824,2015-03-20 00:31:27,2015-04-05 07:31:46,0.985915,WFIIFCPIOGMHT,0.563874,131423.789042,0,131423,7,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Columns removed before saving, grouped by the reason they are not used as
# model inputs.
device_and_ip_cols = ['device_id', 'ip_address', 'ip_int']   # high-cardinality device / IP fields
binned_cols = ['time_since_signup_bin']                      # binned form of time_since_signup_hours
raw_time_cols = ['signup_time', 'purchase_time']             # raw timestamps
drop_cols = device_and_ip_cols + binned_cols + raw_time_cols

# errors='ignore' makes the drop skip any listed name that is absent from the
# frame instead of raising.
# NOTE(review): user_id is an identifier too but is not in drop_cols, so it is
# kept in the saved table -- confirm that downstream modelling excludes it
# from the feature matrix.
fraud_final = fraud_encoded.drop(columns=drop_cols, errors='ignore')

final_shape = fraud_final.shape
print("Final shape:", final_shape)
fraud_final.head()

Final shape: (151112, 195)


Unnamed: 0,user_id,purchase_value,age,class,purchase_hour,purchase_dayofweek,time_since_signup_hours,source_Direct,source_SEO,browser_FireFox,...,country_Unknown,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Viet Nam,country_Virgin Islands (U.S.),country_Yemen,country_Zambia,country_Zimbabwe
0,62421,0.494721,0.331793,0,10,6,-1.013679,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,173212,-0.214781,-0.364448,0,17,4,-1.230613,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,242286,-0.214781,-0.132367,0,8,1,-1.337931,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,370003,-0.214781,0.795954,0,21,3,0.800513,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,119824,0.985915,0.563874,0,7,6,-1.127359,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Write the model-ready fraud table to the processed-data folder; the row
# index is left out of the file.
fraud_processed_csv = data_path / "fraud_data_processed.csv"
fraud_final.to_csv(fraud_processed_csv, index=False)

### 2. Credit Card Data Feature Engineering

In [8]:
from src.data_loader import load_creditcard
from src.features import add_time_features_creditcard, scale_numeric

# Stage 1: read the raw credit card table from the raw-data folder.
raw_dir = Path("../data/raw")
cc_raw = load_creditcard(raw_dir)
print("Raw shape:", cc_raw.shape)

# Stage 2: remove the duplicate rows that were found during EDA.
cc_dedup = remove_duplicates(cc_raw)
print("After removing duplicates:", cc_dedup.shape)

# Stage 3: add the time-derived columns (hour_of_day and day_of_week appear in
# the column listing printed below).
cc_df = add_time_features_creditcard(cc_dedup)
cc_column_names = cc_df.columns.tolist()
print("Columns after time features:", cc_column_names)

Raw shape: (284807, 31)
After removing duplicates: (283726, 31)
Columns after time features: ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class', 'hour_of_day', 'day_of_week']


In [9]:
# Only Amount is passed to the scaler; every other column, including V1..V28,
# is left exactly as loaded.
amount_cols = ['Amount']
cc_scaled, amount_scaler = scale_numeric(cc_df, amount_cols)
print("Amount scaled.")

# The raw Time column is dropped; the derived hour_of_day and day_of_week
# columns stay in the frame.
cc_final = cc_scaled.drop(columns='Time')

cc_final_shape = cc_final.shape
print("Final shape:", cc_final_shape)
cc_final.head()

Amount scaled.
Final shape: (283726, 32)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V23,V24,V25,V26,V27,V28,Amount,Class,hour_of_day,day_of_week
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.2442,0,0.0,0.0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342584,0,0.0,0.0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.1589,0,0.0,0.0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.139886,0,0.0,0.0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073813,0,0.0,0.0


In [10]:
# Write the processed credit card table to the processed-data folder (row
# index left out of the file) and confirm once the write has returned.
creditcard_processed_csv = data_path / "creditcard_processed.csv"
cc_final.to_csv(creditcard_processed_csv, index=False)
print("Saved processed creditcard data.")

Saved processed creditcard data.
