## Feature Engineering Strategy

- Time-based features capture suspicious transaction timing.
- Velocity features detect automated or scripted behavior.
- Country features capture geo-risk patterns.
- No features that leak the target label (`class`) are introduced.



🌍 IP → Country


In [1]:

# Fraud Detection Feature Engineering Pipeline
# Allow imports from src/
import sys
from pathlib import Path
import pandas as pd
import numpy as np

# Resolve the parent of the current working directory and make sure it is on
# sys.path (inserted at the front, at most once) so `src.*` imports resolve.
PROJECT_ROOT = Path("..").resolve()
project_root_str = str(PROJECT_ROOT)
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)

from src.data_loader import load_fraud_data, load_ip_country_data
from src.preprocessing import clean_fraud_data


# Read both raw CSV inputs, then clean the transaction table.
# `df` (cleaned transactions) and `ip_df` (IP-to-country table) are consumed
# by the cells that follow, so those two names must stay as they are.
fraud_raw = load_fraud_data("../data/raw/Fraud_Data.csv")
ip_df = load_ip_country_data("../data/raw/IpAddress_to_Country.csv")
df = clean_fraud_data(fraud_raw)

# Preview the first rows of the cleaned transactions
df.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [2]:
from src.geo_utils import convert_ip_to_int, merge_ip_country

# Run the IP conversion on the cleaned transactions and merge the result with
# the IP-to-country table; the output gains `ip_int` and `country` columns.
fraud_df = merge_ip_country(convert_ip_to_int(df), ip_df)
print(fraud_df.head())

# Compare the original IP value, its integer form, and the matched country
fraud_df[["ip_address", "ip_int", "country"]].head()

     user_id         signup_time       purchase_time  purchase_value  \
634   247547 2015-06-28 03:00:34 2015-08-09 03:57:29              47   
635   220737 2015-01-28 14:21:11 2015-02-11 20:28:28              15   
636   390400 2015-03-19 20:49:09 2015-04-11 23:41:23              44   
637    69592 2015-02-24 06:11:57 2015-05-23 16:40:14              55   
638   174987 2015-07-07 12:58:11 2015-11-03 04:04:30              51   

         device_id  source browser sex  age    ip_address  class    ip_int  \
634  KIXYSVCHIPQBR     SEO  Safari   F   30  1.677886e+07      0  16778864   
635  PKYOWQKWGJNJI     SEO  Chrome   F   34  1.684205e+07      0  16842045   
636  LVCSXLISZHVUO     Ads      IE   M   29  1.684366e+07      0  16843656   
637  UHAUHNXXUADJE  Direct  Chrome   F   30  1.693873e+07      0  16938732   
638  XPGPMOHIDRMGE     SEO  Chrome   F   37  1.697198e+07      0  16971984   

     lower_bound_ip_address  upper_bound_ip_address    country  
634              16778240.0      

Unnamed: 0,ip_address,ip_int,country
634,16778860.0,16778864,Australia
635,16842050.0,16842045,Thailand
636,16843660.0,16843656,China
637,16938730.0,16938732,China
638,16971980.0,16971984,Thailand


⚙️ Time & Velocity Features

In [3]:
# 🕒 Time-based and velocity features
from src.feature_engineering import add_time_features, add_transaction_velocity

# NOTE: this cell rebinds `fraud_df` to the output of functions applied to
# `fraud_df` itself, so re-running it without first re-running the IP-to-country
# cell applies both feature functions a second time.
fraud_df = add_time_features(fraud_df)
fraud_df = add_transaction_velocity(fraud_df)

# End the cell on the expression so Jupyter renders the rich table. A bare
# `.head()` in the middle of a cell is discarded, and `print` only produces a
# truncated plain-text dump of the wide frame.
fraud_df.head()

  .rolling(window)
  .rolling(window)


        purchase_time  user_id         signup_time  purchase_value  \
0 2015-02-21 10:03:37        2 2015-01-11 03:47:13              54   
1 2015-09-26 21:32:16        4 2015-06-02 16:40:57              41   
2 2015-08-13 11:53:07        8 2015-05-28 07:53:06              47   
3 2015-03-04 20:56:37       12 2015-01-10 06:25:12              35   
4 2015-03-12 12:46:23       16 2015-02-03 13:48:23               9   

       device_id  source browser sex  age    ip_address  ...      ip_int  \
0  FGBQNDNBETFJJ     SEO  Chrome   F   25  8.802175e+08  ...   880217484   
1  MKFUIVOHLJBYN  Direct  Safari   F   38  2.785906e+09  ...  2785906106   
2  SCQGQALXBUQZJ     SEO  Chrome   M   25  3.560567e+08  ...   356056736   
3  MSNWCFEHKTIOY     Ads  Safari   M   19  2.985180e+09  ...  2985180352   
4  FROZWSSWOHZBE  Direct      IE   M   32  5.783125e+08  ...   578312545   

   lower_bound_ip_address  upper_bound_ip_address        country hour_of_day  \
0            8.724152e+08            8.891

💾 Save Processed Data

In [4]:

# 💾 Save Processed Data
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists first (a fresh checkout may not have data/processed/).
# `Path` comes from the notebook's first cell.
output_path = Path("../data/processed/fraud_data_features.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
fraud_df.to_csv(output_path, index=False)