In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import dask.dataframe as dd
from geopy.distance import geodesic
import string
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind, skew

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [100]:
dir = '../csv/'
fname = 'sample_300k_2020'

In [101]:
df_BASE = pd.read_pickle(fname + '_cleansed.pkl')

In [66]:
df_sampled = df_BASE.sample(15000, random_state=42)
df_sampled = df_sampled.reset_index(drop=True)
df_sampled.to_csv('feat_eng_15k_sampled.csv', index=False)

In [102]:
print('df_BASE shape: ' + str(df_BASE.shape))
print(df_BASE.info())

df_BASE shape: (299996, 37)
<class 'pandas.core.frame.DataFrame'>
Index: 299996 entries, 70705 to 127819
Data columns (total 37 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   cc_num                     299996 non-null  int64         
 1   gender                     299996 non-null  object        
 2   city                       299996 non-null  object        
 3   state                      299996 non-null  object        
 4   zip                        299996 non-null  int32         
 5   lat                        299996 non-null  float64       
 6   long                       299996 non-null  float64       
 7   city_pop                   299996 non-null  int64         
 8   job                        299996 non-null  object        
 9   dob                        299996 non-null  datetime64[ns]
 10  acct_num                   299996 non-null  int64         
 11  profile                  

**🛠️ Feature Engineering Overview**

Feature engineering is a critical step in preparing our dataset for fraud detection modeling. It involves transforming raw data into meaningful features that better expose the underlying patterns of fraudulent behavior.

****Many of the suggested features bellow were already created in the preparation stage to be analyzed in the EDA stage, nevertheles they are already present.***

In this stage, we will:

- Derive new features from existing columns (e.g., time-based, spatial, demographic)
- Normalize or encode variables for better model compatibility
- Create fraud-relevant signals such as:
  - Transaction distance (`user → merchant`) 
  - Log-transformed amount and population metrics
  - Temporal features like `hour`, `day of week`, and `is_weekend`
  - Categorical groupings and flags for rare or high-risk categories
  - Feature interraction like `trans_hours` with `is_weekend` or `category`

The goal is to enrich the dataset with high-signal, low-noise variables that will empower our model to better distinguish between legitimate and fraudulent transactions.


In [103]:
import folium
from folium import plugins
from IPython.display import display

---
✅ Show to an interactive map showing fraud transactions:

🔵 Blue circles: User location (lat, long)

🟢 Green circles: Merchant location (merch_lat, merch_long)

🔴 Red lines: Connection between user and merchant, fraudulent in Red, legit in Black. 

--- 

In [107]:

def display_trans_map(fraud_df, color_line='black'):
    # Initialize the map centered around the average fraud location
    center_lat = fraud_df['lat'].mean()
    center_long = fraud_df['long'].mean()
    fraud_map = folium.Map(location=[center_lat, center_long], zoom_start=6, tiles='CartoDB positron')

    # Add markers and connecting lines between user and merchant locations
    for _, row in fraud_df.iterrows():
        user_loc = (row['lat'], row['long'])
        merch_loc = (row['merch_lat'], row['merch_long'])

        # Add line between customer and merchant
        #folium.PolyLine(locations=[user_loc, merch_loc], color='red', weight=1).add_to(fraud_map)
        folium.PolyLine(locations=[user_loc, merch_loc], color=color_line, weight=1).add_to(fraud_map)

        # Add markers
        folium.CircleMarker(location=user_loc, radius=2, color='blue', fill=True, fill_opacity=0.7).add_to(fraud_map)
        folium.CircleMarker(location=merch_loc, radius=2, color='green', fill=True, fill_opacity=0.7).add_to(fraud_map)

    # Display the map
    fraud_map
    # Create a folium map centered at the average location of all fraud points
    map_center = [fraud_df['lat'].mean(), fraud_df['long'].mean()]
    fraud_map = folium.Map(location=map_center, zoom_start=6)

    # Add lines and markers for each fraud transaction (limited for performance)
    max_points = 300  # Limit to avoid lag
    for _, row in fraud_df.head(max_points).iterrows():
        user_loc = (row['lat'], row['long'])
        merchant_loc = (row['merch_lat'], row['merch_long'])

        # Add a line connecting the user and merchant
        folium.PolyLine(locations=[user_loc, merchant_loc], color=color_line, weight=2, opacity=0.6).add_to(fraud_map)

        # Optional: Add markers (commented for cleaner map)
        folium.CircleMarker(user_loc, radius=3, color='blue', fill=True, fill_opacity=0.7).add_to(fraud_map)
        folium.CircleMarker(merchant_loc, radius=3, color='green', fill=True, fill_opacity=0.7).add_to(fraud_map)

    return fraud_map


# Display the map
#fraud_map

In [108]:
# Filter fraud transactions only
trans_fraud_df = df_BASE[df_BASE['is_fraud'] == 1].copy()
fraud_map = display_trans_map(trans_fraud_df, 'red')
fraud_map



In [109]:
# Filter not fraud transactions only
trans_not_fraud_df = df_BASE[df_BASE['is_fraud'] == 0].copy()
not_fraud_map = display_trans_map(trans_not_fraud_df)
not_fraud_map

**🗺️ Insights from Fraud Transaction Map**

An interactive geographic map was used to visualize fraud transactions by connecting the **user location (`lat`, `long`)** and **merchant location (`merch_lat`, `merch_long`)**.

---

**📌 Key Observations**

- **📏 Long-distance transactions:**  
  Many fraud transactions span large geographic distances — sometimes across states or even coasts. This suggests potential **card-not-present fraud**, **identity theft**, or the use of **stolen credentials**.

- **🏙️ Merchant clustering vs. user dispersion:**  
  Merchant locations (🟢 green markers) tend to cluster around urban centers, while user locations (🔵 blue markers) are more spread out. Fraudsters may be targeting **centralized, high-volume merchants**.

- **📍 Lack of proximity:**  
  Legitimate transactions typically happen close to a user's location. The presence of **long-range purchases** may serve as a useful fraud indicator.

---

**🧠 Modeling Implications**

- Consider engineering features such as:
  - `distance_km` between user and merchant
  - `is_local_transaction` (binary indicator)
- These spatial features could provide **strong predictive power** in fraud detection models.

> 🚨 **Conclusion:** Spatial behavior — particularly **distance** between cardholder and merchant — is a **valuable dimension** in identifying potentially fraudulent activity.


In [71]:
#implementing the conclusions from the map above
df_feat_eng = df_BASE.copy()


df_feat_eng['distance'].describe()


count    299996.000000
mean         76.509002
std          29.203572
min           0.061514
25%          55.620571
50%          78.718629
75%          98.924832
max         148.455852
Name: distance, dtype: float64

In [72]:
df_feat_eng.info()

<class 'pandas.core.frame.DataFrame'>
Index: 299996 entries, 70705 to 127819
Data columns (total 37 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   cc_num                     299996 non-null  int64         
 1   gender                     299996 non-null  object        
 2   city                       299996 non-null  object        
 3   state                      299996 non-null  object        
 4   zip                        299996 non-null  int32         
 5   lat                        299996 non-null  float64       
 6   long                       299996 non-null  float64       
 7   city_pop                   299996 non-null  int64         
 8   job                        299996 non-null  object        
 9   dob                        299996 non-null  datetime64[ns]
 10  acct_num                   299996 non-null  int64         
 11  profile                    299996 non-null  object   

In [73]:
df_feat_eng.nunique()

cc_num                        17591
gender                            2
city                           4920
state                            51
zip                            9561
lat                            9251
long                           9427
city_pop                       5880
job                             639
dob                           12588
acct_num                      17591
profile                          12
trans_num                    299996
trans_date                      366
trans_time                    58186
unix_time                    296925
category                         14
amt                           30441
is_fraud                          2
merchant                        635
merch_lat                    297327
merch_long                   298822
region                            4
trans_day_of_week                 7
is_weekend                        2
trans_hour                       24
trans_time_segment                4
age                         

In [74]:
print(df_feat_eng['cc_type'].value_counts())
print(df_feat_eng['age_group'].value_counts())
print(df_feat_eng['distance'].value_counts())
print(df_feat_eng['city_pop'].value_counts())


cc_type
Visa                99424
Maestro             49429
Unknown             43612
JCB                 29996
Diners_Club         28207
MasterCard          24486
American_Express    24377
Switch                465
Name: count, dtype: int64
age_group
young_adult    129142
middle_aged     95291
senior          39566
teenager        21476
elderly         11980
child            2541
Name: count, dtype: int64
distance
92.807059     1
54.802806     1
56.228604     1
96.637555     1
105.608287    1
             ..
119.693429    1
63.477803     1
83.689448     1
62.396997     1
19.101638     1
Name: count, Length: 299996, dtype: int64
city_pop
2906700    3010
2680484    2961
2504700    2393
2383912    2313
1417793    1792
           ... 
3877          1
17550         1
9647          1
3662          1
5928          1
Name: count, Length: 5880, dtype: int64


**🔁 Create Feature Interactions for Fraud Prediction**

Combining features can help uncover complex behavioral patterns associated with fraudulent activity. Below are recommended interactions for this dataset:

---

**🕒 Temporal Interactions**
- `trans_hour` × `is_weekend`: Detects unusual weekend-night transaction behavior.
- `trans_day_of_week` × `category`: Reveals category-specific patterns by day (e.g., shopping on weekdays vs weekends).
- `trans_hour` × `distance`: Flags transactions made at odd hours that involve faraway merchants.

---

**💳 Transactional Behavior Interactions**
- `amt` × `distance`: Large amount + far distance = higher fraud risk.
- `log_amt` × `log_city_pop`: Detects if high-amount transactions are unusually common in low-population areas.
- `amt` × `category`: Some categories may have higher normal transaction amounts — this interaction helps differentiate.

---

**🌍 Geospatial Interactions**
- `distance` × `is_weekend`: Long-distance weekend transactions may suggest leisure (normal) or travel fraud (risky).
- `region` × `category`: Some transaction types may be riskier in certain regions (e.g., gas stations in remote areas).
- `area_cat` × `distance`: Rural users making distant transactions might be more or less typical depending on context.

---

**👤 Demographic & Behavior**
- `age_group` × `category`: Certain age groups may exhibit distinct fraud patterns for specific transaction types.
- `job_cat` × `trans_hour`: Work schedules may influence typical transaction times — deviations could be suspicious.
- `cc_type` × `amt`: Fraud detection may vary across card types for specific amount thresholds.

---

**✅ Implementation**
- Use `pd.Series.astype(str)` + concatenation for categorical interactions (then encode).
- Drop interaction features with cardinality > 100.


In [75]:
from sklearn.preprocessing import LabelEncoder

In [76]:

# Define interaction feature pairs
cat_interactions = [
    ('trans_hour', 'is_weekend'),
    ('category', 'trans_day_of_week'),
    ('area_cat', 'distance'),
    ('age_group', 'category'),
]

num_interactions = [
    ('amt', 'distance'),
    ('log_amt', 'log_city_pop'),
]

label_encoders = {}
for col1, col2 in cat_interactions:
    for col in [col1, col2]:
        if df_feat_eng[col].dtype == 'object':
            le = LabelEncoder()
            df_feat_eng[col] = le.fit_transform(df_feat_eng[col])
            label_encoders[col] = le

# Create interaction features
for col1, col2 in cat_interactions:
    new_col = f"{col1}_x_{col2}"
    df_feat_eng[new_col] = df_feat_eng[col1].astype(str) + "_" + df_feat_eng[col2].astype(str)

for col1, col2 in num_interactions:
    new_col = f"{col1}_x_{col2}"
    df_feat_eng[new_col] = df_feat_eng[col1] * df_feat_eng[col2]

# Show sample of new interaction columns
interaction_cols = [f"{c1}_x_{c2}" for c1, c2 in cat_interactions + num_interactions]


In [77]:
print(df_feat_eng[interaction_cols].nunique())
df_feat_eng[interaction_cols].head()

trans_hour_x_is_weekend             48
category_x_trans_day_of_week        97
area_cat_x_distance             299996
age_group_x_category                80
amt_x_distance                  299996
log_amt_x_log_city_pop          293959
dtype: int64


Unnamed: 0,trans_hour_x_is_weekend,category_x_trans_day_of_week,area_cat_x_distance,age_group_x_category,amt_x_distance,log_amt_x_log_city_pop
70705,22_0,4_0,1_92.80705938681892,3_4,779.579299,20.071181
69722,3_0,0_4,1_111.94423736183002,3_0,570.915611,16.197766
69411,2_1,1_5,1_98.15977754844715,3_1,9196.589559,40.762123
70857,16_0,5_3,1_41.5992381637844,3_5,3246.820539,39.145043
69729,19_0,5_3,1_83.87938234115025,3_5,1045.137104,23.28704


In [79]:
# Drop interaction features with cardinality > 100  
high_cardinality_cols = [col for col in interaction_cols if df_feat_eng[col].nunique() > 100]
print(high_cardinality_cols)
df_feat_eng = df_feat_eng.drop(columns=high_cardinality_cols)

['area_cat_x_distance', 'amt_x_distance', 'log_amt_x_log_city_pop']


In [80]:
df_feat_eng.head()

Unnamed: 0,cc_num,gender,city,state,zip,lat,long,city_pop,job,dob,...,area_cat,distance,time_since_last_trans,job_cat,log_amt,log_time_since_last_trans,log_city_pop,trans_hour_x_is_weekend,category_x_trans_day_of_week,age_group_x_category
70705,3510694985435665,f,sidney,NE,69162,41.138,-102.9856,7765,energy_manager,1941-05-26,...,1,92.807059,0.0,Management,2.24071,0.0,8.957511,22_0,4_0,3_4
69722,3510694985435665,f,sidney,NE,69162,41.138,-102.9856,7765,energy_manager,1941-05-26,...,1,111.944237,76.854444,Management,1.808289,4.354841,8.957511,3_0,0_4,3_0
69411,3510694985435665,f,sidney,NE,69162,41.138,-102.9856,7765,energy_manager,1941-05-26,...,1,98.159778,1198.525278,Management,4.550608,7.089681,8.957511,2_1,1_5,3_1
70857,3510694985435665,f,sidney,NE,69162,41.138,-102.9856,7765,energy_manager,1941-05-26,...,1,41.599238,638.237778,Management,4.370081,6.460276,8.957511,16_0,5_3,3_5
69729,3510694985435665,f,sidney,NE,69162,41.138,-102.9856,7765,energy_manager,1941-05-26,...,1,83.879382,170.932222,Management,2.599722,5.1471,8.957511,19_0,5_3,3_5


In [82]:
df_feat_eng.info()  

<class 'pandas.core.frame.DataFrame'>
Index: 299996 entries, 70705 to 127819
Data columns (total 40 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   cc_num                        299996 non-null  int64         
 1   gender                        299996 non-null  object        
 2   city                          299996 non-null  object        
 3   state                         299996 non-null  object        
 4   zip                           299996 non-null  int32         
 5   lat                           299996 non-null  float64       
 6   long                          299996 non-null  float64       
 7   city_pop                      299996 non-null  int64         
 8   job                           299996 non-null  object        
 9   dob                           299996 non-null  datetime64[ns]
 10  acct_num                      299996 non-null  int64         
 11  profile       

**🧹 Feature Pruning, Columns to Drop**

These columns may be dropped to reduce dimensionality, avoid data leakage, and improve model performance.

---

**🔑 Identifiers**  
Uniquely identify users or transactions — not useful for prediction:
- `cc_num`, `acct_num`, `trans_num`, `profile`

---

**📅 Date/Time (likely already transformed)**  
Can be replaced by derived features like `trans_hour`, `age`, etc.:
- `trans_date`, `trans_time`, `dob`

---

**🔁 Raw Features (Log-transformed versions used instead)**  
Replaced by less-skewed versions like `log_amt`, `log_city_pop`, etc.:
- `amt`, `city_pop`, `time_since_last_trans`

---

**⚠️ High-Cardinality Object Columns**  
Could create sparse/dense encodings and lead to overfitting:
- `merchant`, `job`, `city`, `trans_num`, `trans_date`, `trans_time`, `dob`

> These can be transformed or encoded if critical, but are best avoided in early modeling phases.

---

Let me know if you’d like help with the actual `.drop()` operation or saving the pruned dataset!


In [83]:
# high cardinality columns
high_card_obj = [col for col in df_feat_eng.columns if df_feat_eng[col].dtype == 'object' and df_feat_eng[col].nunique() > 100]
high_card_obj

['city', 'job', 'trans_num', 'merchant']

In [84]:
# Date/time columns were already transformed
datetime_cols = ['trans_date', 'trans_time', 'dob']

In [85]:
# Redundant raw features already log-transformed
redundant_cols = ['amt', 'city_pop', 'time_since_last_trans']

In [86]:
# Identify id-like columns
identifier_cols = ['cc_num', 'acct_num', 'trans_num']

In [87]:
# Extracted from columns
'''
'first', 'last', 
'street', 'city', 'state', 'zip' => geographical data, 
'lat', 'long', 'merch_lat', 'merch_long' => distance data
'profile' => area urban|rural
'age' => age group
'''
extracted_cols = ['city', 'state', 'zip', 'lat', 'long', 'merch_lat', 'merch_long', 'profile', 'age']

In [88]:
suggested_drop_cols = list(set(high_card_obj + identifier_cols + datetime_cols + redundant_cols + extracted_cols))
suggested_drop_cols

['dob',
 'acct_num',
 'state',
 'merchant',
 'trans_num',
 'profile',
 'lat',
 'time_since_last_trans',
 'age',
 'amt',
 'city_pop',
 'job',
 'city',
 'trans_time',
 'long',
 'merch_lat',
 'trans_date',
 'zip',
 'merch_long',
 'cc_num']

In [89]:
df_feat_eng = df_feat_eng.drop(columns=suggested_drop_cols)

In [90]:
df_feat_eng.info()

<class 'pandas.core.frame.DataFrame'>
Index: 299996 entries, 70705 to 127819
Data columns (total 20 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   gender                        299996 non-null  object 
 1   unix_time                     299996 non-null  int32  
 2   category                      299996 non-null  int32  
 3   is_fraud                      299996 non-null  int32  
 4   region                        299996 non-null  object 
 5   trans_day_of_week             299996 non-null  int32  
 6   is_weekend                    299996 non-null  int32  
 7   trans_hour                    299996 non-null  int32  
 8   trans_time_segment            299996 non-null  object 
 9   age_group                     299996 non-null  int32  
 10  cc_type                       299996 non-null  object 
 11  area_cat                      299996 non-null  int32  
 12  distance                      299996 non-null

In [93]:
print(df_feat_eng.shape)
df_feat_eng.nunique()

(299996, 20)


gender                               2
unix_time                       296925
category                            14
is_fraud                             2
region                               4
trans_day_of_week                    7
is_weekend                           2
trans_hour                          24
trans_time_segment                   4
age_group                            6
cc_type                              8
area_cat                             2
distance                        299996
job_cat                             17
log_amt                          30441
log_time_since_last_trans       265016
log_city_pop                      5880
trans_hour_x_is_weekend             48
category_x_trans_day_of_week        97
age_group_x_category                80
dtype: int64

In [97]:
df_feat_eng.to_pickle(fname + '_feat_eng_pruned.pkl')