In [54]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler

from sklearn.compose import ColumnTransformer

In [56]:
data = pd.read_csv(r"C:\Users\infan\train.csv")

In [57]:
data.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [58]:
data.shape

(381109, 12)

In [62]:
# classify numerical and categorical columns
num_col =[]
cat_col = []
target_col = ['Response']
ignore_col = ['id']
for col in data.columns:
    if col not in (target_col+ignore_col):
        if data[col].dtypes == 'object':
            cat_col.append(col)
        else:
            num_col.append(col)

In [64]:
cat_col_encode = Pipeline(steps=[
    ("one_hot_encoding",OneHotEncoder(handle_unknown='ignore'))
])
num_col_encode = Pipeline(steps=[
    ("MinMaxScaler",MinMaxScaler())
])
preprocess=ColumnTransformer(
    transformers=[
        ('cat_encode',cat_col_encode,cat_col),
        ('num_encode',num_col_encode,num_col)
    ]
)

In [68]:
X= data.drop(target_col+ignore_col , axis=1)
y = data['Response']

# Featureselection
## supervised Learning 
### ---> We are having three methods i) Filter ii)wrapper iii)Embedded method

## filter 
### ---> In filter method we have four techniques . chi-square , mutual information , fisher score , missing value ratio , variance threshold , 

## Chi-square

In [73]:
#Chi-square
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [145]:
#chi_square accept only numeric and non-negative data 

#encode categorical columns and numerical columns
transform_data = preprocess.fit_transform(X)

#finding the best feature in numerical column alone
numerical_data = transform_data[: , -len(num_col):]

#finding the best feature
chi_best = SelectKBest(score_func=chi2 , k=3)
k_best=chi_best.fit(numerical_data,y)

#select the best indices
select_feature_indices = chi_best.get_support(indices=True)

#select the feature
select_feature = [num_col[i] for i in select_feature_indices]

#select the scores
select_score = k_best.scores_[select_feature_indices]

np.set_printoptions(precision=3)
for feature, score in zip(select_feature,select_score):
    print(f"{feature} : {score:.3f}")

Age : 925.900
Previously_Insured : 24033.828
Policy_Sales_Channel : 1203.448


## mutual information

In [164]:
from sklearn.feature_selection import mutual_info_classif, SelectKBest

selector = SelectKBest(mutual_info_classif,k=3)

mul_info = selector.fit(numerical_data,y)

In [168]:
select_feature_indices = selector.get_support(indices=True)
select_feature = [num_col[i] for i in select_feature_indices]
select_score = mul_info.scores_[select_feature_indices]

np.set_printoptions(precision=3)
for feature, score in zip(select_feature,select_score):
    print(f"{feature} : {score:.3f}")

Driving_License : 0.073
Previously_Insured : 0.132
Policy_Sales_Channel : 0.058


## variance threshold

In [193]:
from sklearn.feature_selection import VarianceThreshold

# Initialize VarianceThreshold with a custom threshold (default = 0.0)
var_thresh = VarianceThreshold(threshold=0.0)

# Fit and transform the numerical data
selected_data = var_thresh.fit_transform(numerical_data)

# Get the variances of all features
variances = var_thresh.variances_

# Sort the indices of the features by variance in descending order
top_indices = np.argsort(variances)[-3:][::-1]  # Select top 3 features

# Get the corresponding feature names
select_feature = [num_col[i] for i in top_indices]

print("Selected Features:", select_feature)


Selected Features: ['Previously_Insured', 'Policy_Sales_Channel', 'Vintage']


## ANOVA test

In [196]:
from sklearn.feature_selection import f_classif
selector= SelectKBest(f_classif,k=3)
f_class = selector.fit(numerical_data,y)

select_feature_indices = selector.get_support(indices=True)
select_feature = [num_col[i] for i in select_feature_indices]
select_score = mul_info.scores_[select_feature_indices]

np.set_printoptions(precision=3)
for feature, score in zip(select_feature,select_score):
    print(f"{feature} : {score:.3f}")

Age : 0.032
Previously_Insured : 0.132
Policy_Sales_Channel : 0.058


## wrapper 
### ---> In wrapper method we have four techniques . Forward-selection , Backward-selection , Exhaustive feature selection , Recursive feature selection 

## Recursive feature selection

In [151]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

In [97]:
model = DecisionTreeClassifier()

transformed_data = preprocess.fit_transform(X)
numerical_data = transformed_data[: , -len(num_col):]

recursive_feature = RFE(estimator=model,n_features_to_select=3)
feature = recursive_feature.fit(numerical_data,y)

print(feature.n_features_)

3


In [98]:
print(feature.support_)
print(feature.ranking_)

[ True False False False  True False  True]
[1 5 3 2 1 4 1]


In [99]:
X.columns

Index(['Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage'],
      dtype='object')

In [100]:
for col,val in zip(X.columns, feature.support_):
    print(col , val)

Gender True
Age False
Driving_License False
Region_Code False
Previously_Insured True
Vehicle_Age False
Vehicle_Damage True


## Embedded method 
### ---> In Embedded method we have four techniques .  Tree methods , regularization methods

##   regularization methods

In [143]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectFromModel

# Step 1: Preprocess the data
transformed_data = preprocess.fit_transform(X)
numerical_data = transformed_data[: , -len(num_col):]

# Step 2: Fit the Ridge regression model
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(numerical_data, y)

# Step 3: Use SelectFromModel for feature selection
selector = SelectFromModel(estimator=ridge_reg, prefit=True, threshold="mean")
x_selected = selector.transform(numerical_data)

# Step 4: Get feature names
selected_feature_indices = selector.get_support(indices=True)
selected_features = np.array(num_col)[selected_feature_indices]

print("Selected Features:", selected_features)


Selected Features: ['Driving_License' 'Previously_Insured' 'Annual_Premium']


## tree based model

In [204]:
from sklearn.ensemble import RandomForestClassifier

rf= RandomForestClassifier()
rf.fit(numerical_data,y)

feature_importance = rf.feature_importances_


In [214]:
# Pair each feature with its importance
feature_importance_pairs = list(zip(num_col, feature_importance))

# Sort the features by importance in descending order
feature_importance_pairs.sort(key=lambda x: x[1], reverse=True)

# Get the top 3 most important features
top_3_features = feature_importance_pairs[:3]

# Print the top 3 features and their importance scores
for feature, importance in top_3_features:
    print(f"{feature}: {importance:.3f}")


Vintage: 0.307
Annual_Premium: 0.282
Age: 0.147
