<img src = https://www.evalueserve.com/wp-content/themes/wpapp/assets/img/logo.png width= 200 align = right>
<img src=https://www.python.org/static/community_logos/python-logo.png width= 200 align= left>

<h1><center> Kaggle: Santander Customer Satisfaction </center></h1>
<h3><center> Section: Feature Engineering </center></h3>
<h3><center> Author: Juan Carlos Urrutia </center></h3>
<h3><center> Date: July 2021 </center></h3>

### Feature Engineering ###

By using Python Pipelines (from sklearn), we will follow the next procedure:

    + 1.- Data processing with Standard Scaler
    + 2.- Reduce dimmension by removing ID and zero variance features
    + 3.- Reduce dimmensionality by using Principal Component Analysis (PCA)
    + 4.- Apply classifier
    + 5.- Comments: Evaluate results and possible enhancements

    

### 1.- Retrieving Libraries and Data ###

In [1]:
# Base libraries
import numpy as np
import pandas as pd
import os
import time

# Wrappers
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

#Visual Resources
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, FunctionTransformer

# Training and validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Modelling
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#from xgboost.sklearn import XGBClassifier

#Scoring
from sklearn.metrics import roc_auc_score

In [4]:
raw_df = pd.read_csv('/home/shared/santander/EVS_KAGGLE_TEAM/santander-customer-satisfaction/data/train.csv')
raw_df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


### 2.- Data Pre-processing

#### 2.1.- Deleting the ID column

In [5]:
list(raw_df.columns)

['ID',
 'var3',
 'var15',
 'imp_ent_var16_ult1',
 'imp_op_var39_comer_ult1',
 'imp_op_var39_comer_ult3',
 'imp_op_var40_comer_ult1',
 'imp_op_var40_comer_ult3',
 'imp_op_var40_efect_ult1',
 'imp_op_var40_efect_ult3',
 'imp_op_var40_ult1',
 'imp_op_var41_comer_ult1',
 'imp_op_var41_comer_ult3',
 'imp_op_var41_efect_ult1',
 'imp_op_var41_efect_ult3',
 'imp_op_var41_ult1',
 'imp_op_var39_efect_ult1',
 'imp_op_var39_efect_ult3',
 'imp_op_var39_ult1',
 'imp_sal_var16_ult1',
 'ind_var1_0',
 'ind_var1',
 'ind_var2_0',
 'ind_var2',
 'ind_var5_0',
 'ind_var5',
 'ind_var6_0',
 'ind_var6',
 'ind_var8_0',
 'ind_var8',
 'ind_var12_0',
 'ind_var12',
 'ind_var13_0',
 'ind_var13_corto_0',
 'ind_var13_corto',
 'ind_var13_largo_0',
 'ind_var13_largo',
 'ind_var13_medio_0',
 'ind_var13_medio',
 'ind_var13',
 'ind_var14_0',
 'ind_var14',
 'ind_var17_0',
 'ind_var17',
 'ind_var18_0',
 'ind_var18',
 'ind_var19',
 'ind_var20_0',
 'ind_var20',
 'ind_var24_0',
 'ind_var24',
 'ind_var25_cte',
 'ind_var26_0',
 '

In [6]:
df = raw_df.drop('ID', axis = 1)
df.head()

Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


#### 2.2.- Remove features with a correlation above selected threshold

In [None]:
corr = df.corr()
plt.figure(figsize = (12,10))

sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.6)],
           cmap = 'viridis' = vmax = 1.0, vmin = 1.0, linewidths = 0.05,
           annot = True, annot_kws = {'size': 2}, square)

In [21]:
# 5.- Correlation function
def corr_feature_sel(corr_df, threshold):
    columns = np.full(corr_df.shape[0], True, dtype=bool)

    for i in range(corr_df.shape[0]):
        for j in range(i+1,corr_df.shape[0]):
            if corr_df.iloc[i,j] >= threshold:
                if columns[j]:
                    columns[j]=False
    return columns

In [22]:
# 6.- Apply function to corrMatrix and choose a threshold
columns = corr_feature_sel(df,.6)
selColumns = df.columns[columns] # Selected features
newX = df[selColumns] # X subset with selected features.

IndexError: single positional indexer is out-of-bounds

#### 2.3.- Remove features with a variance under selected threshold

In [7]:
threshold = 0.01
selector = VarianceThreshold(threshold = threshold)
selector.fit(df)

variances = selector.variances_
dropped_features = df.columns.values[variances < threshold]
print(dropped_features)


['ind_var1' 'ind_var2_0' 'ind_var2' 'ind_var6_0' 'ind_var6'
 'ind_var13_largo' 'ind_var13_medio_0' 'ind_var13_medio' 'ind_var14'
 'ind_var17_0' 'ind_var17' 'ind_var18_0' 'ind_var18' 'ind_var19'
 'ind_var20_0' 'ind_var20' 'ind_var27_0' 'ind_var28_0' 'ind_var28'
 'ind_var27' 'ind_var29_0' 'ind_var29' 'ind_var30_0' 'ind_var31_0'
 'ind_var31' 'ind_var32_cte' 'ind_var32_0' 'ind_var32' 'ind_var33_0'
 'ind_var33' 'ind_var34_0' 'ind_var34' 'ind_var40' 'ind_var41' 'ind_var39'
 'ind_var44_0' 'ind_var44' 'ind_var46_0' 'ind_var46' 'num_var6_0'
 'num_var6' 'num_var13_medio_0' 'num_var13_medio' 'num_var18_0'
 'num_var18' 'num_var27_0' 'num_var28_0' 'num_var28' 'num_var27'
 'num_var29_0' 'num_var29' 'num_var33' 'num_var34_0' 'num_var34'
 'num_var41' 'num_var46_0' 'num_var46' 'saldo_var28' 'saldo_var27'
 'saldo_var41' 'saldo_var46' 'imp_amort_var18_hace3'
 'imp_amort_var34_hace3' 'imp_reemb_var13_hace3' 'imp_reemb_var33_hace3'
 'imp_trasp_var17_out_hace3' 'imp_trasp_var33_out_hace3'
 'ind_var7_emit_ul

In [8]:
df = df.drop(dropped_features, axis = 1)
df.head()

Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


#### 2.4.- Dealing with outliers

If we plan to use a parametric statistical method, the normality assumption should be asssessed. As we have less than thousands samples (observations), it would be reliable to use the Shapiro - Wilk Test of Normality. If the variable does not have a normal distribution, instead of a Z-Score method, we should go with a non parametric method, such as the IQR.

In [9]:
from scipy.stats import shapiro
A['stat', 'p'] = shapiro(df['var15'])
alpha = 0.05 
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0: normal)')
else:
    print('Sample does not looks Gaussian (reject H0: not normal)')

Sample does not looks Gaussian (reject H0: not normal)




In [15]:
shapiro_stats = df.apply(shapiro, axis = 0)

dfs = shapiro_stats.loc[1, :]
dfs.head(30)

var3                       0.0
var15                      0.0
imp_ent_var16_ult1         0.0
imp_op_var39_comer_ult1    0.0
imp_op_var39_comer_ult3    0.0
imp_op_var40_comer_ult1    0.0
imp_op_var40_comer_ult3    0.0
imp_op_var40_efect_ult1    0.0
imp_op_var40_efect_ult3    0.0
imp_op_var40_ult1          0.0
imp_op_var41_comer_ult1    0.0
imp_op_var41_comer_ult3    0.0
imp_op_var41_efect_ult1    0.0
imp_op_var41_efect_ult3    0.0
imp_op_var41_ult1          0.0
imp_op_var39_efect_ult1    0.0
imp_op_var39_efect_ult3    0.0
imp_op_var39_ult1          0.0
imp_sal_var16_ult1         0.0
ind_var1_0                 0.0
ind_var5_0                 0.0
ind_var5                   0.0
ind_var8_0                 0.0
ind_var8                   0.0
ind_var12_0                0.0
ind_var12                  0.0
ind_var13_0                0.0
ind_var13_corto_0          0.0
ind_var13_corto            0.0
ind_var13_largo_0          0.0
Name: 1, dtype: float64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('TARGET', axis = 1), df['TARGET'], test_size = 0.3, random_state = 31)

In [12]:
pipeline_lr_s = Pipeline([('scalar1', StandardScaler()),
                        ('pca1', PCA(n_components = 2)),
                        ('lr_classifier1', LogisticRegression(random_state = 0))])
                        

In [16]:
pipeline_lr_n = Pipeline([('normal1', MinMaxScaler()),
                        ('pca1', PCA(n_components = 2)),
                        ('lr_classifier2', LogisticRegression(random_state = 0))])
                        

In [17]:
pipeline_dt = Pipeline([('scalar2', StandardScaler()),
                        ('pca2', PCA(n_components = 2)),
                        ('dt_classifier', DecisionTreeClassifier())])

In [18]:
pipeline_rf = Pipeline([('scalar3', StandardScaler()),
                        ('pca3', PCA(n_components = 2)),
                        ('rf_classifier', RandomForestClassifier())])

In [19]:
#Creating the list of pipelines
pipelines = [pipeline_lr_s, pipeline_lr_n, pipeline_dt, pipeline_rf]

In [26]:
best_accuracy = 0
best_classifier = 0
best_pipeline = ""

In [20]:
#Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression with Standarization',1: 'Logistic Regression with Normalization', 2: 'Decision Tree', 3: 'Random Forest'}

#Fitting the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [24]:
for i, model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i], model.score(X_test, y_test)))

Logistic Regression with Standarization Test Accuracy: 0.9598351311058494
Logistic Regression with Normalization Test Accuracy: 0.9598351311058494
Decision Tree Test Accuracy: 0.9298430237656757
Random Forest Test Accuracy: 0.9539594843462247


In [27]:
for i, model in enumerate(pipelines):
    if model.score(X_test, y_test) > best_accuracy:
        best_accuracy = model.score(X_test, y_test)
        best_pipeline = model
        best_classifier = i
print('Classifier with the best accuracy: {}'.format(pipe_dict[best_classifier]))

Classifier with the best accuracy: Logistic Regression with Standarization
