<a href="https://colab.research.google.com/github/ITU-Business-Analytics-Team/Business_Analytics_for_Professionals/blob/main/Part%20I%20%3A%20Methods%20%26%20Technologies%20for%20Business%20Analytics/Chapter%206%3A%20Feature%20Engineering/6_3_6_4_Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Feature Selection**
## Filtering & Wrapper

In [None]:
import pandas as pd
import numpy as np 

from sklearn.feature_selection import chi2,mutual_info_classif,SelectKBest,RFE,SelectFromModel

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,RidgeClassifier

from sklearn.model_selection import train_test_split

!pip install shap
import shap



In [None]:
# Google Sheets share link; the spreadsheet id is the second-to-last path segment.
url = 'https://docs.google.com/spreadsheets/d/1sRzCCzWZ5NfEduSHnLPEh8GbiPxuQson/edit?usp=sharing&ouid=108589602591644119588&rtpof=true&sd=true'
file_id = url.split('/')[-2]
# Build the direct-download URL for that id and read the workbook from it.
path = 'https://drive.google.com/uc?export=download&id=' + file_id
df = pd.read_excel(path)

In [None]:
# Preview the first rows of the loaded sheet to confirm the columns look as expected.
df.head()

Unnamed: 0,Purpose,Loan Duration,Account Balance,Pension Funds,Account Length,Sex,Marriage Status,Age,House,Job,Employed Duration,Credit Risk
0,9,3,0,739,13,1,1,23,1,1,12,1
1,3,1,0,1230,25,1,2,32,1,3,0,1
2,5,4,0,389,19,1,1,38,1,2,119,1
3,3,2,638,347,13,1,1,36,1,1,14,0
4,2,3,963,4754,40,1,1,31,2,3,45,0


Random Forest is used as the prediction method.

In [None]:
#Prediction Function to test Feature Selection Methods
def predict(X_train, y_train, X_test, y_test, method, features):
    """Train a random forest on the given columns and report its test accuracy.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; only the columns listed in ``features`` are used.
    y_train, y_test : pandas.DataFrame or pandas.Series
        Targets; flattened with ``.values.ravel()`` before use.
    method : str
        Label of the feature-selection method, copied into the result row.
    features : list of str
        Names of the columns to train and score on.

    Returns
    -------
    dict
        One result row with the keys ``Method``, ``Num_Features``,
        ``Features`` (comma-joined column names) and ``Accuracy``.
    """
    forest = RandomForestClassifier(n_estimators = 50,random_state = 0,min_samples_split=2)
    forest.fit(X_train[features], y_train.values.ravel())
    accuracy = forest.score(X_test[features], y_test.values.ravel())

    return {
        "Method": method,
        "Num_Features": len(features),
        "Features": ",".join(features),
        "Accuracy": accuracy,
    }

Every possible number of selected features (1 through 10) will be tested.

In [None]:
# Subset sizes to evaluate for every selection method.
num_features = range(1, 11)
# Accumulates one row per evaluated (method, feature subset) pair.
result = pd.DataFrame(columns=["Method", "Features", "Accuracy"])

# The target stays a one-column DataFrame; downstream code flattens it with .values.ravel().
y = df[['Credit Risk']]
X = df.drop(columns=['Credit Risk'])
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

Firstly, the result will be found using all the variables.

In [None]:
# Baseline: score the model on every column, with no feature selection.
# DataFrame.append was removed in pandas 2.0, so the row dict is wrapped in a
# one-row frame and concatenated instead.
baseline_row = predict(X_train, y_train, X_test, y_test, "No Selection", X.columns.tolist())
result = pd.concat([result, pd.DataFrame([baseline_row])], ignore_index=True)

In [None]:
# Fit the same forest configuration on all training columns, then show the
# predicted probability of the second class label for the first test row.
estimator = RandomForestClassifier(n_estimators = 50,random_state = 0,min_samples_split=2).fit(
    X_train, y_train.values.ravel()
)
first_row_proba = estimator.predict_proba(X_test.iloc[0:1])
first_row_proba[0][1]

0.98

### **1. Filter Based Methods**

#### **Pearson Correlation**
Variables are ordered according to the absolute value of the coefficient between the variable and the target variable.

In [None]:
def correlation_selector(X,y,num_feats):
    """Pick the columns of ``X`` most strongly correlated with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : pandas.DataFrame or pandas.Series
        Target; flattened with ``.values.ravel()``.
    num_feats : int
        Number of columns to keep.

    Returns
    -------
    list of str
        The ``num_feats`` column names with the largest absolute Pearson
        coefficient, ordered from weakest to strongest. When ``num_feats``
        equals the number of columns, every column is returned in its
        original order and no correlation is computed.
    """
    columns = X.columns.tolist()
    if len(columns) == num_feats:
        return columns

    target = y.values.ravel()
    # Off-diagonal entry of the 2x2 correlation matrix, one value per column.
    coefficients = [np.corrcoef(X[name], target)[0, 1] for name in columns]
    # An undefined coefficient (NaN) is ranked as if it were zero.
    coefficients = [0 if np.isnan(value) else value for value in coefficients]
    # argsort is ascending, so the tail holds the strongest correlations.
    strongest = np.argsort(np.abs(coefficients))[-num_feats:]

    return X.iloc[:, strongest].columns.tolist()

In [None]:
# Score the correlation ranking at every subset size.
# Rows are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies it each time.
cor_rows = []
for n in num_features:
    cor_features = correlation_selector(X_train,y_train,n)
    cor_rows.append(predict(X_train, y_train, X_test, y_test, "Correlation", cor_features))
result = pd.concat([result, pd.DataFrame(cor_rows)], ignore_index=True)

#### **Chi-Square Test**
The chi-square test detects whether each variable is related to the target variable, and how strongly.

In [None]:
def chiSquare_selector(X,y,num_feats):
    """Keep the ``num_feats`` columns of ``X`` with the highest chi-square score.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : pandas.DataFrame or pandas.Series
        Target; flattened with ``.values.ravel()``.
    num_feats : int
        Number of columns to keep.

    Returns
    -------
    list of str
        Selected column names in their original column order. When
        ``num_feats`` equals the number of columns, all columns are returned
        without running the test.
    """
    all_columns = X.columns.tolist()
    if len(all_columns) == num_feats:
        return all_columns

    # Features are rescaled to [0, 1] before scoring; chi2 expects non-negative inputs.
    scaled = MinMaxScaler().fit_transform(X)
    selector = SelectKBest(chi2, k=num_feats)
    selector.fit(scaled, y.values.ravel())
    keep_mask = selector.get_support()

    return X.loc[:, keep_mask].columns.tolist()

In [None]:
# Score the chi-square selection at every subset size.
# Rows are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies it each time.
chi_rows = []
for n in num_features:
    chi_features = chiSquare_selector(X_train,y_train,n)
    chi_rows.append(predict(X_train, y_train, X_test, y_test, "Chi-Square", chi_features))
result = pd.concat([result, pd.DataFrame(chi_rows)], ignore_index=True)

#### **Mutual Information**
Mutual information measures how much knowing one variable tells us about another (Battiti, 1994). A value of 0 indicates that the two variables are unrelated to one another.

In [None]:
def mutualInfo_selector(X,y,num_feats,random_state=1):
    """Keep the ``num_feats`` columns of ``X`` with the highest mutual information with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : pandas.DataFrame or pandas.Series
        Target; flattened with ``.values.ravel()``.
    num_feats : int
        Number of columns to keep.
    random_state : int or None, default 1
        Seed forwarded to ``mutual_info_classif``. The estimator is randomised,
        so without a fixed seed the selected columns can change between runs.
        Pass ``None`` to restore unseeded behaviour.

    Returns
    -------
    list of str
        Selected column names in their original column order. When
        ``num_feats`` equals the number of columns, all columns are returned
        without computing any score.
    """
    from functools import partial

    mi_feature = X.columns.tolist()

    if len(mi_feature) != num_feats:
        # SelectKBest calls score_func(X, y); partial pins the seed for that call.
        seeded_score = partial(mutual_info_classif, random_state=random_state)
        mi_selector = SelectKBest(seeded_score, k=num_feats)
        mi_selector.fit(X,y.values.ravel())
        mi_support = mi_selector.get_support()
        mi_feature = X.loc[:,mi_support].columns.tolist()

    return mi_feature

In [None]:
# Score the mutual-information selection at every subset size.
# Rows are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies it each time.
mi_rows = []
for n in num_features:
    mi_features = mutualInfo_selector(X_train,y_train,n)
    mi_rows.append(predict(X_train, y_train, X_test, y_test, "Mutual Info", mi_features))
result = pd.concat([result, pd.DataFrame(mi_rows)], ignore_index=True)

### **2.	Embedded Methods**

#### **LASSO**
LASSO penalizes the absolute values of the coefficients.

In [None]:
def lasso_selector(X,y,num_feats):
    """Keep the ``num_feats`` columns ranked highest by an L1-penalised logistic regression.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; min-max scaled before the model is fitted.
    y : pandas.DataFrame or pandas.Series
        Target; flattened with ``.values.ravel()``.
    num_feats : int
        Number of columns to keep.

    Returns
    -------
    list of str
        Selected column names in their original column order. When
        ``num_feats`` equals the number of columns, all columns are returned
        without fitting a model.
    """
    embeded_lr_feature = X.columns.tolist()

    if len(embeded_lr_feature) != num_feats:
        X_norm = MinMaxScaler().fit_transform(X)

        # We use Logistic Regression to perform L1 norm in a classification task.
        # `multi_class` is intentionally not passed: 'auto' was already the
        # default, and the parameter is deprecated since scikit-learn 1.5.
        l1_model = LogisticRegression(penalty="l1", solver='liblinear')
        # threshold=-inf disables the importance cut-off, so max_features alone
        # decides how many columns survive.
        embeded_lr_selector = SelectFromModel(l1_model, threshold=-np.inf, max_features=num_feats)
        embeded_lr_selector.fit(X_norm, y.values.ravel())

        embeded_lr_support = embeded_lr_selector.get_support()
        embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()

    return embeded_lr_feature

In [None]:
# Score the L1 (lasso) selection at every subset size.
# Rows are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies it each time.
lasso_rows = []
for n in num_features:
    lasso_features = lasso_selector(X_train,y_train,n)
    lasso_rows.append(predict(X_train, y_train, X_test, y_test, "Lasso", lasso_features))
result = pd.concat([result, pd.DataFrame(lasso_rows)], ignore_index=True)

#### **Ridge**
The square of the coefficients is penalized in ridge. 

In [None]:
def ridge_selector(X,y,num_feats):
    """Keep the ``num_feats`` columns ranked highest by a ridge classifier.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; coerced with ``pd.to_numeric`` and min-max scaled
        before the model is fitted.
    y : pandas.DataFrame or pandas.Series
        Target; flattened with ``.values.ravel()``.
    num_feats : int
        Number of columns to keep.

    Returns
    -------
    list of str
        Selected column names in their original column order. When
        ``num_feats`` equals the number of columns, all columns are returned
        without fitting a model.
    """
    all_columns = X.columns.tolist()
    if len(all_columns) == num_feats:
        return all_columns

    scaled = MinMaxScaler().fit_transform(X.apply(pd.to_numeric))

    #The classifier first converts the target values into {-1, 1} and then treats the problem as a regression task
    ridge_model = RidgeClassifier(alpha=0.5,random_state=1)
    # threshold=-inf disables the importance cut-off; max_features caps the count.
    selector = SelectFromModel(ridge_model, threshold=-np.inf, max_features=num_feats)
    selector.fit(scaled, y.values.ravel())
    keep_mask = selector.get_support()

    return X.loc[:, keep_mask].columns.tolist()

In [None]:
# Score the ridge selection at every subset size.
# Rows are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies it each time.
ridge_rows = []
for n in num_features:
    ridge_features = ridge_selector(X_train,y_train,n)
    ridge_rows.append(predict(X_train, y_train, X_test, y_test, "Ridge", ridge_features))
result = pd.concat([result, pd.DataFrame(ridge_rows)], ignore_index=True)

#### **Decision Tree Feature Importance**
The relative importance of features can be calculated using the Gini index or information gain for classification problems, and variance reduction for regression problems. The importance of a variable therefore measures how much it reduces the impurity or variance in the data.

In [None]:
def tree_selector(X,y,num_feats):
    """Keep the ``num_feats`` columns ranked highest by a random forest's feature importances.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : pandas.DataFrame or pandas.Series
        Target; flattened with ``.values.ravel()``.
    num_feats : int
        Number of columns to keep.

    Returns
    -------
    list of str
        Selected column names in their original column order. When
        ``num_feats`` equals the number of columns, all columns are returned
        without fitting a model.
    """
    all_columns = X.columns.tolist()
    if len(all_columns) == num_feats:
        return all_columns

    forest = RandomForestClassifier(n_estimators = 50,random_state = 1)
    # threshold=-inf disables the importance cut-off; max_features caps the count.
    selector = SelectFromModel(forest, threshold=-np.inf, max_features=num_feats)
    selector.fit(X, y.values.ravel())
    keep_mask = selector.get_support()

    return X.loc[:, keep_mask].columns.tolist()

In [None]:
# Score the forest-importance selection at every subset size.
# Rows are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies it each time.
tree_rows = []
for n in num_features:
    tree_features = tree_selector(X_train,y_train,n)
    tree_rows.append(predict(X_train, y_train, X_test, y_test, "Tree", tree_features))
result = pd.concat([result, pd.DataFrame(tree_rows)], ignore_index=True)

### **3. Wrapper Methods**

#### **Recursive Feature Elimination**
Recursive feature elimination is an iterative approach. After the predictive performance is calculated using all variables, the weakest feature is eliminated at each iteration.

In [None]:
def recursive_selector(X,y,num_feats,step=1):
    """Select ``num_feats`` columns of ``X`` by recursive feature elimination with a random forest.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : pandas.DataFrame or pandas.Series
        Target; flattened with ``.values.ravel()``.
    num_feats : int
        Number of columns to keep.
    step : int or float, default 1
        Forwarded to ``RFE``: how many features are dropped per iteration.
        The default of 1 refits the forest after every single removal. A step
        as large as the number of surplus features removes them all after one
        fit, which reduces RFE to a one-shot importance ranking.

    Returns
    -------
    list of str
        Selected column names in their original column order. When
        ``num_feats`` equals the number of columns, all columns are returned
        without fitting a model.
    """
    rfe_feature = X.columns.tolist()

    if len(rfe_feature) != num_feats:
        estimator = RandomForestClassifier(n_estimators = 50,random_state = 1)

        rfe_selector = RFE(estimator=estimator, n_features_to_select=num_feats, step=step, verbose=0)
        rfe_selector.fit(X, y.values.ravel())
        rfe_support = rfe_selector.get_support()
        rfe_feature = X.loc[:,rfe_support].columns.tolist()

    return rfe_feature

In [None]:
# Score the recursive-elimination selection at every subset size.
# Rows are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies it each time.
recursive_rows = []
for n in num_features:
    recursive_features = recursive_selector(X_train,y_train,n)
    recursive_rows.append(predict(X_train, y_train, X_test, y_test, "Recursive", recursive_features))
result = pd.concat([result, pd.DataFrame(recursive_rows)], ignore_index=True)

#### **SHAP Values**
SHAP values are an alternative to permutation feature importance. Permutation feature importance is determined by the decline in model performance, whereas the SHAP score is determined by the magnitude of feature attributions. The Shapley value of a feature is the average of its marginal contributions across all possible coalitions.

In [None]:
def _mean_abs_shap(shap_values):
    """Reduce raw SHAP output to one non-negative importance score per feature."""
    if isinstance(shap_values, list):
        # List form: one (n_samples, n_features) array per class.
        per_class = np.stack([np.asarray(v) for v in shap_values], axis=-1)
    else:
        per_class = np.asarray(shap_values)
        if per_class.ndim == 2:
            # Single-output form: add a trailing class axis of length 1.
            per_class = per_class[:, :, np.newaxis]
        # NOTE(review): a 3-D array is assumed to be laid out as
        # (n_samples, n_features, n_classes), as newer shap releases document.
    # Take magnitudes BEFORE combining classes. For a two-class model the two
    # classes' attributions are mirror images, so summing signed values first
    # cancels them to ~0 and leaves a ranking made of rounding noise.
    return np.abs(per_class).sum(axis=2).mean(axis=0)


def shap_selector(X,y,num_feats):
    """Keep the ``num_feats`` columns of ``X`` with the largest mean absolute SHAP value.

    A random forest is fitted on ``X``/``y`` and explained with
    ``shap.TreeExplainer``; features are ranked by the mean over samples of
    their absolute SHAP values, summed over classes.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : pandas.DataFrame or pandas.Series
        Target; flattened with ``.values.ravel()``.
    num_feats : int
        Number of columns to keep.

    Returns
    -------
    list of str
        Selected column names ordered from most to least important. When
        ``num_feats`` equals the number of columns, all columns are returned
        in their original order without fitting a model.

    Raises
    ------
    ValueError
        If the SHAP output cannot be reduced to one score per column of ``X``.
    """
    shap_feature = X.columns.tolist()

    if len(shap_feature) != num_feats:
        estimator = RandomForestClassifier(n_estimators = 50 ,random_state = 1)
        estimator.fit(X, y.values.ravel())

        explainer = shap.TreeExplainer(estimator)
        importance = _mean_abs_shap(explainer.shap_values(X))

        # Fail loudly rather than rank on a misaligned array.
        if importance.shape != (X.shape[1],):
            raise ValueError(
                "Unexpected SHAP output: got %d importance scores for %d features"
                % (importance.size, X.shape[1])
            )

        rankings = pd.DataFrame({'Variable':X.columns.tolist(),'SHAP_Value':importance}).sort_values(by=['SHAP_Value'],ascending=False)
        shap_feature=rankings.iloc[:num_feats]['Variable'].tolist()

    return shap_feature

In [None]:
# Score the SHAP-based selection at every subset size.
# Rows are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies it each time.
shap_rows = []
for n in num_features:
    shap_features = shap_selector(X_train,y_train,n)
    shap_rows.append(predict(X_train, y_train, X_test, y_test, "Shap", shap_features))
result = pd.concat([result, pd.DataFrame(shap_rows)], ignore_index=True)

In [None]:
# For each method keep the first row that reaches that method's best accuracy.
# The aggregation is named by string: passing the builtin `max` to transform is
# deprecated in pandas >= 2.1.
best_accuracy_per_method = result.groupby(['Method'])['Accuracy'].transform('max')
result = result[best_accuracy_per_method == result['Accuracy']].drop_duplicates(['Method'])
result

Unnamed: 0,Method,Features,Accuracy,Num_Features
0,No Selection,"Purpose,Loan Duration,Account Balance,Pension ...",0.917647,11.0
6,Correlation,"Loan Duration,Purpose,Marriage Status,Employed...",0.964706,6.0
16,Chi-Square,"Purpose,Loan Duration,Account Balance,Pension ...",0.988235,6.0
27,Mutual Info,"Purpose,Loan Duration,Account Balance,Pension ...",0.952941,7.0
37,Lasso,"Purpose,Account Balance,Pension Funds,Account ...",0.976471,7.0
47,Ridge,"Purpose,Account Balance,Pension Funds,Account ...",0.976471,7.0
60,Tree,"Purpose,Loan Duration,Account Balance,Pension ...",0.964706,10.0
70,Recursive,"Purpose,Loan Duration,Account Balance,Pension ...",0.964706,10.0
79,Shap,"Account Balance,Pension Funds,Age,Employed Dur...",0.964706,9.0


In [None]:
# Rank the rows by accuracy, highest first, and keep only the top one.
ranked_by_accuracy = result.sort_values(by='Accuracy', ascending=False)
best = ranked_by_accuracy.head(1)
best

Unnamed: 0,Method,Features,Accuracy,Num_Features
16,Chi-Square,"Purpose,Loan Duration,Account Balance,Pension ...",0.988235,6.0


In [None]:
# Print the full comma-joined feature list of the winning row (the table view truncates it).
print(best['Features'].values)

['Purpose,Loan Duration,Account Balance,Pension Funds,Marriage Status,Employed Duration']
