# üîç Feature Selection ‚Äî Fraud Detection Project

This notebook performs a complete feature selection workflow using:

‚úÖ Chi‚ÄëSquare Test (categorical features)  
‚úÖ ANOVA F‚Äëtest (numerical features)  
‚úÖ Mutual Information  
‚úÖ Correlation Filtering  
‚úÖ Model‚ÄëBased Feature Importance (ExtraTrees)  
‚úÖ Combined Feature Selection Scorecard  

Feature selection improves model performance, reduces noise, and enhances interpretability.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.ensemble import ExtraTreesClassifier

sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

In [2]:
df = pd.read_csv("../data/insurance_synthetic.csv")   # update path if needed
df.head()

Unnamed: 0,Customer ID,Age,Gender,Marital Status,Occupation,Income Level,Education Level,Geographic Information,Location,Behavioral Data,...,Customer Preferences,Preferred Communication Channel,Preferred Contact Time,Preferred Language,Risk Profile,Previous Claims History,Credit Score,Driving Record,Life Events,Segmentation Group
0,84966,23,Female,Married,Entrepreneur,70541,Associate Degree,Mizoram,37534,policy5,...,Email,In-Person Meeting,Afternoon,English,1,3,728,DUI,Job Change,Segment5
1,95568,26,Male,Widowed,Manager,54168,Doctorate,Goa,63304,policy5,...,Mail,In-Person Meeting,Morning,French,1,2,792,Clean,Retirement,Segment5
2,10544,29,Female,Single,Entrepreneur,73899,Associate Degree,Rajasthan,53174,policy5,...,Email,Mail,Evening,German,2,1,719,Accident,Childbirth,Segment3
3,77033,20,Male,Divorced,Entrepreneur,63381,Bachelor's Degree,Sikkim,22803,policy5,...,Text,In-Person Meeting,Anytime,French,3,0,639,DUI,Job Change,Segment3
4,88160,25,Female,Separated,Manager,38794,Bachelor's Degree,West Bengal,92858,policy1,...,Email,Text,Weekends,English,0,3,720,Major Violations,Childbirth,Segment2


In [5]:
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

target_col = "Segmentation Group" 

numerical_cols, categorical_cols

(['Customer ID',
  'Age',
  'Income Level',
  'Location',
  'Claim History',
  'Coverage Amount',
  'Premium Amount',
  'Deductible',
  'Risk Profile',
  'Previous Claims History',
  'Credit Score'],
 ['Gender',
  'Marital Status',
  'Occupation',
  'Education Level',
  'Geographic Information',
  'Behavioral Data',
  'Purchase History',
  'Policy Start Date',
  'Policy Renewal Date',
  'Interactions with Customer Service',
  'Insurance Products Owned',
  'Policy Type',
  'Customer Preferences',
  'Preferred Communication Channel',
  'Preferred Contact Time',
  'Preferred Language',
  'Driving Record',
  'Life Events',
  'Segmentation Group'])

In [6]:
df_fs = df.copy()

# Encode categorical features
for col in categorical_cols:
    df_fs[col] = LabelEncoder().fit_transform(df_fs[col])

# Encode target if needed
df_fs[target_col] = LabelEncoder().fit_transform(df_fs[target_col])

X = df_fs.drop(target_col, axis=1)
y = df_fs[target_col]

X_num = df_fs[numerical_cols]
X_cat = df_fs[categorical_cols]