In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import dask.dataframe as dd
from geopy.distance import geodesic
import string
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind, skew

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

**🧠 Feature Selection Overview**

With a wide range of raw, engineered, and encoded features now available, the next critical step is **Feature Selection** — identifying which variables are truly valuable for predicting fraudulent transactions.

This stage helps to:

- Remove noisy or redundant features
- Improve model performance and interpretability
- Reduce training time and risk of overfitting

We’ll apply statistical tests and model-based techniques to rank feature importance and retain only the most informative predictors for the final fraud detection model.


In [30]:
# Path/config values for this notebook. `fname` is the filename prefix of the pickle files read and written in later cells.
# NOTE(review): `dir` shadows the built-in dir() and is not referenced by any cell visible here — confirm before renaming or removing.
dir = '../csv/'
fname = 'sample_300k_2020'

In [4]:
# Load the pruned, feature-engineered frame from its pickle file (path is relative to the working directory).
# NOTE: unpickling can execute arbitrary code — only load pickle files from a trusted source.
df_BASE = pd.read_pickle(f'{fname}_feat_eng_pruned.pkl')

In [8]:
# Draw a reproducible 15,000-row sample, renumber its rows from 0, and export it as CSV without the index column.
df_sampled = (
    df_BASE
    .sample(n=15000, random_state=42)
    .reset_index(drop=True)
)
df_sampled.to_csv('feat_sel_15k_sampled.csv', index=False)

In [6]:
# Report the dimensions and the per-column dtype / non-null summary of the loaded frame.
print('df_BASE shape: ' + str(df_BASE.shape))
# DataFrame.info() writes its report straight to stdout and returns None, so it is called bare;
# wrapping it in print() would append a stray "None" line after the report.
df_BASE.info()

df_BASE shape: (299996, 20)
<class 'pandas.core.frame.DataFrame'>
Index: 299996 entries, 70705 to 127819
Data columns (total 20 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   gender                        299996 non-null  object 
 1   unix_time                     299996 non-null  int32  
 2   category                      299996 non-null  int32  
 3   is_fraud                      299996 non-null  int32  
 4   region                        299996 non-null  object 
 5   trans_day_of_week             299996 non-null  int32  
 6   is_weekend                    299996 non-null  int32  
 7   trans_hour                    299996 non-null  int32  
 8   trans_time_segment            299996 non-null  object 
 9   age_group                     299996 non-null  int32  
 10  cc_type                       299996 non-null  object 
 11  area_cat                      299996 non-null  int32  
 12  distance         

In [7]:
# Independent (deep) copy used as the working frame for feature selection; df_BASE itself is left as loaded.
df_feat_sel = df_BASE.copy(deep=True)

In [15]:
# Number of distinct values per column (displayed as the cell output)
df_feat_sel.nunique()

gender                               2
unix_time                       296925
category                            14
is_fraud                             2
region                               4
trans_day_of_week                    7
is_weekend                           2
trans_hour                          24
trans_time_segment                   4
age_group                            6
cc_type                              8
area_cat                             2
distance                        299996
job_cat                             17
log_amt                          30441
log_time_since_last_trans       265016
log_city_pop                      5880
trans_hour_x_is_weekend             48
category_x_trans_day_of_week        97
age_group_x_category                80
dtype: int64

*Encode string features* - required for the feature selection process

In [None]:
# Encode on a separate copy so df_feat_sel keeps its original string values.
df_encoded = df_feat_sel.copy()
# Names of all object-dtype (string) columns, followed by the number of distinct values each one holds
string_cols = list(df_encoded.select_dtypes(include='object').columns)
print(string_cols)
df_encoded[string_cols].nunique()

['gender', 'region', 'trans_time_segment', 'cc_type', 'job_cat', 'trans_hour_x_is_weekend', 'category_x_trans_day_of_week', 'age_group_x_category']


gender                           2
region                           4
trans_time_segment               4
cc_type                          8
job_cat                         17
trans_hour_x_is_weekend         48
category_x_trans_day_of_week    97
age_group_x_category            80
dtype: int64

In [21]:
# Apply Label Encoding to the string columns so the selection models receive numeric input.
# Each encoder is fitted on the untouched strings in df_feat_sel (df_encoded is a copy of it with the same index)
# instead of on df_encoded itself. This makes the cell idempotent: re-running it yields the same codes and
# label_encoders keeps the original class labels rather than already-encoded integers.
# NOTE(review): label codes impose an arbitrary integer order on nominal categories, which the linear models
# fitted on this frame will treat as ordinal — confirm this is acceptable.
label_encoders = {}

for col in string_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_feat_sel[col])
    label_encoders[col] = le  # retained so codes can be mapped back to labels via le.classes_

df_encoded.info()
# Show encoded string columns
df_encoded[string_cols].head()

<class 'pandas.core.frame.DataFrame'>
Index: 299996 entries, 70705 to 127819
Data columns (total 20 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   gender                        299996 non-null  int64  
 1   unix_time                     299996 non-null  int32  
 2   category                      299996 non-null  int32  
 3   is_fraud                      299996 non-null  int32  
 4   region                        299996 non-null  int64  
 5   trans_day_of_week             299996 non-null  int32  
 6   is_weekend                    299996 non-null  int32  
 7   trans_hour                    299996 non-null  int32  
 8   trans_time_segment            299996 non-null  int64  
 9   age_group                     299996 non-null  int32  
 10  cc_type                       299996 non-null  int64  
 11  area_cat                      299996 non-null  int32  
 12  distance                      299996 non-null

Unnamed: 0,gender,region,trans_time_segment,cc_type,job_cat,trans_hour_x_is_weekend,category_x_trans_day_of_week,age_group_x_category
70705,0,0,1,6,10,28,55,47
69722,0,0,2,6,10,34,4,39
69411,0,0,2,6,10,33,40,40
70857,0,0,0,6,10,14,65,48
69729,0,0,1,6,10,20,65,48


In [22]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.linear_model import Ridge

In [23]:
# Separate the predictors (every column except the label) from the fraud label.
X = df_encoded.drop('is_fraud', axis=1)
y = df_encoded['is_fraud']

In [26]:

# Fit five models on the label-encoded features and record, per feature, whether each model
# "selects" it (1) or not (0). The stochastic tree ensembles are seeded so the selection table
# is reproducible across kernel restarts.
# NOTE(review): X is not standardized before fitting, so the penalized linear models (Lasso, Ridge,
# LinearSVC) see columns on very different scales — confirm this is intended.
SEED = 42

# Lasso (L1 penalty): a feature counts as selected when its coefficient is non-zero
lasso = Lasso(alpha=0.01).fit(X, y)
lasso_selected = (np.abs(lasso.coef_) > 0).astype(int)

# Ridge (L2 penalty), same non-zero-coefficient criterion
# NOTE(review): an L2 penalty shrinks coefficients but does not set them exactly to zero, so this
# indicator is expected to be 1 for essentially every feature and adds little discrimination to 'Sum'.
ridge = Ridge(alpha=0.01).fit(X, y)
ridge_selected = (np.abs(ridge.coef_) > 0).astype(int)

# Linear SVM with L1 penalty (dual=False is required for penalty="l1"); coef_ is 2-D, row 0 is the single binary problem
svm = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
svm_selected = (np.abs(svm.coef_[0]) > 0).astype(int)

# Tree ensembles: a feature counts as selected when its impurity-based importance is non-zero
gb = GradientBoostingClassifier(random_state=SEED).fit(X, y)
gb_selected = (gb.feature_importances_ > 0).astype(int)

rf = RandomForestClassifier(random_state=SEED).fit(X, y)
rf_selected = (rf.feature_importances_ > 0).astype(int)

# Create a DataFrame to store results (one row per feature, one 0/1 column per model)
selection_df = pd.DataFrame({
    'Feature': X.columns,
    'Lasso': lasso_selected,
    'SVM': svm_selected,
    'GradientBoost': gb_selected,
    'RandomForest': rf_selected,
    'Ridge': ridge_selected
})

# Sum the number of selections for each feature (0-5 "votes")
selection_df['Sum'] = selection_df[['Lasso', 'SVM', 'GradientBoost', 'RandomForest', 'Ridge']].sum(axis=1)

# Output the results, most-agreed-upon features first
print(selection_df.sort_values('Sum', ascending=False))


                         Feature  Lasso  SVM  GradientBoost  RandomForest  \
2                       category      1    1              1             1   
17  category_x_trans_day_of_week      1    1              1             1   
16       trans_hour_x_is_weekend      1    1              1             1   
6                     trans_hour      1    1              1             1   
13                       log_amt      1    1              1             1   
1                      unix_time      1    1              1             1   
9                        cc_type      0    1              1             1   
11                      distance      0    1              1             1   
15                  log_city_pop      0    1              1             1   
14     log_time_since_last_trans      0    1              1             1   
12                       job_cat      0    1              1             1   
18          age_group_x_category      0    1              1             1   

In [31]:
# Keep the features that at least 4 of the 5 models selected.
# 'is_fraud' is not part of this list: it was dropped from X before the models were fitted.
selected_features = selection_df.loc[selection_df['Sum'] >= 4, 'Feature'].tolist()
print(selected_features, len(selected_features), sep='\n')

['unix_time', 'category', 'region', 'trans_hour', 'trans_time_segment', 'cc_type', 'distance', 'job_cat', 'log_amt', 'log_time_since_last_trans', 'log_city_pop', 'trans_hour_x_is_weekend', 'category_x_trans_day_of_week', 'age_group_x_category']
14


**🧠 Feature Selection Consensus Across Models**

We compared feature importance across five different models — **Lasso**, **Linear SVM**, **Gradient Boosting**, **Random Forest**, and **Ridge** — using binary indicators (`1` = selected, `0` = not selected). The goal is to identify features that are consistently valuable for fraud prediction.

---

**🏆 Universally Selected Features (5/5 Models)**  
These features were selected by **all five models**, indicating **high and stable predictive power**:

- `category`
- `category_x_trans_day_of_week`
- `trans_hour_x_is_weekend`
- `trans_hour`
- `log_amt`
- `unix_time`

✅ These are strong candidates for core features in the final model.

---

**🔶 Strong Features (Selected by 4 Models)**  
Selected by **4/5 models**, these features also show reliable importance:

- `cc_type`
- `distance`
- `log_city_pop`
- `log_time_since_last_trans`
- `job_cat`
- `age_group_x_category`
- `trans_time_segment`
- `region`

➡️ These should be retained and explored further during modeling.

---

**⚖️ Moderately Important (Selected by 3 Models)**  
- `age_group`
- `trans_day_of_week`
- `gender`

🔍 May add value depending on the model type or in combination with other features.

---

**📉 Lower Agreement (Selected by 2 Models)**  
- `area_cat`
- `is_weekend`

These features show lower consensus and may have **marginal standalone value**, but could still be useful in interactions or for specific fraud patterns.

---

**✅ Conclusion**

- Focus your core feature set on features with a **selection consensus of at least 4 of the 5 models**.
- Retain the next-tier predictors for further experimentation or model-specific tuning.


In [32]:
# Print the column dtypes and non-null counts of the label-encoded frame
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 299996 entries, 70705 to 127819
Data columns (total 20 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   gender                        299996 non-null  int64  
 1   unix_time                     299996 non-null  int32  
 2   category                      299996 non-null  int32  
 3   is_fraud                      299996 non-null  int32  
 4   region                        299996 non-null  int64  
 5   trans_day_of_week             299996 non-null  int32  
 6   is_weekend                    299996 non-null  int32  
 7   trans_hour                    299996 non-null  int32  
 8   trans_time_segment            299996 non-null  int64  
 9   age_group                     299996 non-null  int32  
 10  cc_type                       299996 non-null  int64  
 11  area_cat                      299996 non-null  int32  
 12  distance                      299996 non-null

**📦 Find Columns suitable for One-Hot Encoding**

One-hot encoding is not needed here: after label encoding, all selected features are already numeric.

In [47]:
# Identify object columns with low cardinality suitable for one-hot encoding
def identify_1hot_candidates(df, max_unique=10):
    """Find object-dtype columns with few enough distinct values to one-hot encode.

    Prints a summary table (column name and distinct-value count, highest count
    first) and returns the candidate column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Only columns whose dtype is ``object`` are considered.
    max_unique : int, default 10
        Largest number of distinct (non-null) values a column may have and still
        count as a candidate.

    Returns
    -------
    list of str
        Candidate column names, in the order they appear in ``df``.
    """
    # Count distinct values once per object column and reuse the result for both
    # the filter and the summary table.
    unique_counts = {
        col: df[col].nunique()
        for col in df.columns
        if df[col].dtype == 'object'
    }
    one_hot_candidates = [
        col for col, n_unique in unique_counts.items() if n_unique <= max_unique
    ]

    # Create a summary of these columns with their unique value counts
    one_hot_summary = pd.DataFrame({
        'column': one_hot_candidates,
        'unique_values': [unique_counts[col] for col in one_hot_candidates],
    })
    print(one_hot_summary.sort_values(by='unique_values', ascending=False))

    return one_hot_candidates

**📦 Found Columns suitable for One-Hot Encoding**

These categorical features have a small number of unique values and are ideal candidates for one-hot encoding:

| Column               | Unique Values |
|----------------------|----------------|
| `cc_type`            | 8              |
| `region`             | 4              |
| `trans_time_segment` | 4              |
| `gender`             | 2              |


In [None]:
def one_hot_encode(df, cols):
    """Return a new frame in which each column named in ``cols`` is replaced by indicator columns.

    Every category level gets its own indicator column (no level is dropped);
    ``df`` itself is not modified.
    """
    encoded = pd.get_dummies(df, columns=cols, drop_first=False)
    return encoded

In [52]:
# Keep only the consensus-selected features plus the target column, as an independent copy.
df_4_model_selected_feat = df_encoded[selected_features + ['is_fraud']].copy()

# No need for one-hot encoding: all selected features are numerical after label encoding.
df_4_model_selected_feat.to_pickle(fname + '_enc_feat_selected.pkl')

# Placed last so the per-column distinct-value counts are actually rendered as the cell output
# (a bare expression in the middle of a cell is evaluated and then discarded).
df_4_model_selected_feat.nunique()


In [50]:
# Independent (deep) copy of the full label-encoded frame, all columns included, followed by its dtype summary.
df_4_model_full_feat = df_encoded.copy(deep=True)
df_4_model_full_feat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 299996 entries, 70705 to 127819
Data columns (total 20 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   gender                        299996 non-null  int64  
 1   unix_time                     299996 non-null  int32  
 2   category                      299996 non-null  int32  
 3   is_fraud                      299996 non-null  int32  
 4   region                        299996 non-null  int64  
 5   trans_day_of_week             299996 non-null  int32  
 6   is_weekend                    299996 non-null  int32  
 7   trans_hour                    299996 non-null  int32  
 8   trans_time_segment            299996 non-null  int64  
 9   age_group                     299996 non-null  int32  
 10  cc_type                       299996 non-null  int64  
 11  area_cat                      299996 non-null  int32  
 12  distance                      299996 non-null