# Bulding ML model

## Import libraries

In [229]:
#Standard libraries for data analysis:
    
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import norm, skew, shapiro
from scipy import stats
import statsmodels.api as sm
import re #regex

# sklearn modules for data preprocessing:
import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

#sklearn modules for Model Selection:
from sklearn import svm, tree, linear_model, neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV


#Standard libraries for data visualization:
import seaborn as sns
from scipy.stats import boxcox 
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib 
%matplotlib inline
color = sns.color_palette()
import matplotlib.ticker as mtick
from pandas.plotting import scatter_matrix
from sklearn.metrics import roc_curve


pd.set_option('display.max_columns', None)

## Load dataset

csv

In [230]:
categorical = pd.read_csv('categorical.csv')
numerical = pd.read_csv('numerical.csv')
target = pd.read_csv('target.csv')

In [231]:
data = pd.concat([numerical, categorical, target], axis = 1)
data.shape

(95412, 339)

## Data Wrangling

## Exploratory Data Analysis (EDA)

## X/Y Split 

In [232]:
def x_y(df):
    X = df.drop(['TARGET_B','TARGET_D'], axis = 1)
    y = df['TARGET_B']
    return X,y

In [233]:
X,y = x_y(data)


In [234]:
# split into numerical and categorical

In [235]:
def Xnum_Xcat(X):
    X_cat = X.select_dtypes(include = object)
    X_num =X.select_dtypes(include = np.number)
    
    display(X_cat.shape)
    display(X_num.shape)
    
    return X_cat,X_num

In [236]:
X_cat,X_num = Xnum_Xcat(X)


(95412, 7)

(95412, 330)

## Train / Test Split

## Feature Selection

Variance Threshold Feature Selection

In [237]:
# to look at VarianceThresholds we need all the variable to be on the same scale
# Variance Threshold Feature Selection only works with numerical data, encoding categorical data is required 

In [238]:
def scaling_encoding(X_num,X_cat):
    

    # MinMaxScale numerical features to ensure that all variables are on the same scale
    scaler = MinMaxScaler().fit(X_num)

    X_num_scaled = scaler.transform(X_num)
    X_num_scaled = pd.DataFrame(X_num_scaled)
    X_num_scaled.columns = X_num.columns
    X_num_scaled.reset_index(drop = True, inplace = True)
    
    
    
    # Encode to ensure that all variables are on the same scale

    encoder = OneHotEncoder(drop='first').fit(X_cat)
    
    cols = encoder.get_feature_names_out(input_features=X_cat.columns)
    X_cat_encoded = pd.DataFrame(encoder.transform(X_cat).toarray(),columns=cols)
    X_cat_encoded.reset_index(drop = True, inplace = True)

    
    
    # Put back together numerical and categorical variables after scaling / encoding

    X_normalized = pd.concat([X_num_scaled,X_cat_encoded], axis=1)
    
    print(X_normalized.shape)
    
    return X_normalized

In [239]:
X_normalized = scaling_encoding(X_num,X_cat)


(95412, 354)


Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,FEDGOV,WEALTH2,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,ETH3,ETH4,ETH5,ETH6,ETH7,ETH8,ETH9,ETH10,ETH11,ETH12,ETH13,ETH14,ETH15,ETH16,AGE901,AGE902,AGE903,AGE904,AGE905,AGE906,AGE907,CHIL1,CHIL2,CHIL3,AGEC1,AGEC2,AGEC3,AGEC4,AGEC5,AGEC6,AGEC7,CHILC1,CHILC2,CHILC3,CHILC4,CHILC5,HHAGE1,HHAGE2,HHAGE3,HHN1,HHN2,HHN3,HHN4,HHN5,HHN6,MARR1,MARR2,MARR3,MARR4,HHP1,HHP2,DW1,DW2,DW3,DW4,DW5,DW6,DW7,DW8,DW9,HV1,HV2,HV3,HV4,HU1,HU2,HU3,HU4,HU5,HHD1,HHD2,HHD3,HHD4,HHD5,HHD6,HHD7,HHD8,HHD9,HHD10,HHD11,HHD12,ETHC1,ETHC2,ETHC3,ETHC4,ETHC5,ETHC6,HVP1,HVP2,HVP3,HVP4,HVP5,HVP6,HUR1,HUR2,RHP1,RHP2,RHP3,RHP4,HUPA1,HUPA2,HUPA3,HUPA4,HUPA5,HUPA6,HUPA7,RP1,RP2,RP3,RP4,MSA,ADI,DMA,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,IC16,IC17,IC18,IC19,IC20,IC21,IC22,IC23,HHAS1,HHAS2,HHAS3,HHAS4,MC1,MC2,MC3,TPE1,TPE2,TPE3,TPE4,TPE5,TPE6,TPE7,TPE8,TPE9,PEC1,PEC2,TPE10,TPE11,TPE12,TPE13,LFC1,LFC2,LFC3,LFC4,LFC5,LFC6,LFC7,LFC8,LFC9,LFC10,OCC1,OCC2,OCC3,OCC4,OCC5,OCC6,OCC7,OCC8,OCC9,OCC10,OCC11,OCC12,OCC13,EIC1,EIC2,EIC3,EIC4,EIC5,EIC6,EIC7,EIC8,EIC9,EIC10,EIC11,EIC12,EIC13,EIC14,EIC15,EIC16,OEDC1,OEDC2,OEDC3,OEDC4,OEDC5,OEDC6,OEDC7,EC1,EC2,EC3,EC4,EC5,EC6,EC7,EC8,SEC1,SEC2,SEC3,SEC4,SEC5,AFC1,AFC2,AFC3,AFC4,AFC5,AFC6,VC1,VC2,VC3,VC4,ANC1,ANC2,ANC3,ANC4,ANC5,ANC6,ANC7,ANC8,ANC9,ANC10,ANC11,ANC12,ANC13,ANC14,ANC15,POBC1,POBC2,LSC1,LSC2,LSC3,LSC4,VOC1,VOC2,VOC3,HC1,HC2,HC3,HC4,HC5,HC6,HC7,HC8,HC9,HC10,HC11,HC12,HC13,HC14,HC15,HC16,HC17,HC18,HC19,HC20,HC21,MHUC1,MHUC2,AC1,AC2,CARDPROM,NUMPROM,CARDPM12,NUMPRM12,RAMNTALL,NGIFTALL,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,CLUSTER,DATASRCE,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,STATE_WI,STATE_other,HOMEOWNR_U,GENDER_M,GENDER_other,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.000000,0.608247,0.666667,1.000000,0.000000,0.000000,0.393939,0.343434,0.181818,0.101010,0.020202,0.011494,0.555556,0.010051,0.011108,0.009378,0.000000,0.353535,0.656566,0.474747,0.535354,0.929293,0.010101,0.000000,0.000000,0.111111,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.113402,0.000000,0.000000,0.000000,0.464286,0.571429,0.607143,0.476190,0.595238,0.642857,0.333333,0.313131,0.424242,0.272727,0.111111,0.141414,0.181818,0.171717,0.131313,0.111111,0.151515,0.121212,0.111111,0.343434,0.252525,0.181818,0.262626,0.101010,0.232323,0.181818,0.333333,0.494949,0.282828,0.121212,0.040404,0.616162,0.070707,0.164384,0.191919,0.304615,0.394286,0.979798,0.959596,0.020202,0.020202,0.000000,0.000000,0.070707,0.070707,0.000000,0.079833,0.105833,0.230769,0.153846,0.868687,0.141414,0.969697,0.040404,0.070707,0.383838,0.808081,0.707071,0.323232,0.848485,0.161616,0.060606,0.04,0.050505,0.090909,0.151515,0.030303,0.226667,0.505051,0.252525,0.000000,0.000000,0.000000,0.020202,0.070707,0.131313,0.272727,0.474747,0.000000,0.010101,0.616162,0.682353,0.677778,0.245902,0.100,0.020202,0.000000,0.000000,0.141414,0.010101,0.000000,0.000000,0.020202,0.050505,0.171717,0.737374,0.000000,0.271889,0.774120,0.204667,0.212000,0.232667,0.252000,0.073818,0.131313,0.232323,0.232323,0.232323,0.151515,0.010101,0.00,0.000000,0.010101,0.040404,0.252525,0.242424,0.262626,0.171717,0.020202,0.00,0.000000,0.020202,0.282828,0.040404,0.515152,0.010101,0.464646,0.545455,0.030303,0.888889,0.080808,0.000000,0.000000,0.0,0.0,0.00,0.000000,0.040404,0.010101,0.131313,0.155556,0.210526,0.020202,0.454545,0.565657,0.646465,0.505051,0.646465,0.444444,0.626263,0.535354,1.000000,0.000000,0.000000,0.090909,0.030303,0.080808,0.131313,0.090909,0.000000,0.054545,0.090909,0.030303,0.151515,0.191919,0.050505,0.040404,0.030303,0.000000,0.030303,0.414141,0.010101,0.000000,0.070707,0.131313,0.060606,0.050505,0.000000,0.059701,0.090909,0.040404,0.010101,0.030303,0.101010,0.020202,0.010101,0.070707,0.787879,0.020202,0.000000,0.705882,0.161616,0.101010,0.393939,0.212121,0.216216,0.040404,0.030303,0.051546,0.202020,0.100000,0.263889,0.040404,0.000000,0.000000,0.000000,0.181818,0.393939,0.000000,0.343434,0.232323,0.181818,0.161616,0.012048,0.040404,0.000000,0.250000,0.000000,0.000000,0.050505,0.018182,0.000000,0.000000,0.000000,0.000000,0.00,0.074074,0.0,0.030303,0.747475,0.888889,0.080808,0.000000,0.040404,0.969697,0.777778,0.191919,0.419355,0.596154,0.050505,0.141414,0.141414,0.313131,0.545455,0.464646,0.000000,0.000000,0.909091,0.000000,0.101010,0.000000,0.0,0.000000,0.333333,0.656566,0.404040,1.000000,1.000000,0.285714,0.4,0.101010,0.070707,0.433333,0.366492,0.315789,0.168831,0.023965,0.127119,0.341463,0.005,0.001401,0.010,0.003676,0.006465,0.498045,0.0,1.000000,0.622951,0.673077,1.0,0.333333,0.428571,0.0,0.381443,1.000000,0.772727,0.636364,0.863636,0.090909,0.0,1.000000,0.927083,0.909091,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.000014,0.463918,0.833333,1.000000,0.066390,0.000000,0.151515,0.555556,0.111111,0.060606,0.020202,0.011494,1.000000,0.036585,0.039552,0.028190,1.000000,0.000000,0.000000,0.505051,0.505051,0.676768,0.000000,0.000000,0.313131,0.060606,0.181818,0.027778,0.060606,0.059701,0.304348,0.0,0.0,0.020619,0.000000,0.012346,0.046512,0.404762,0.488095,0.511905,0.380952,0.500000,0.535714,0.426667,0.333333,0.464646,0.212121,0.131313,0.141414,0.333333,0.232323,0.101010,0.040404,0.020202,0.111111,0.161616,0.363636,0.222222,0.151515,0.121212,0.010101,0.050505,0.040404,0.212121,0.757576,0.555556,0.232323,0.090909,0.696970,0.040404,0.041096,0.242424,0.487692,0.514286,1.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.911333,0.869667,0.923077,0.769231,0.969697,0.040404,0.979798,0.030303,0.090909,0.595960,0.949495,0.888889,0.555556,0.959596,0.050505,0.040404,0.02,0.030303,0.050505,0.040404,0.020202,0.240000,0.444444,0.050505,0.000000,0.000000,0.000000,0.979798,0.989899,0.989899,0.989899,1.000000,0.949495,0.000000,0.838384,0.894118,0.811111,0.344262,0.125,0.000000,0.000000,0.000000,0.040404,0.000000,0.000000,0.000000,0.919192,0.919192,0.919192,0.949495,0.478632,0.019969,0.911464,0.725333,0.730667,0.684000,0.691333,0.207279,0.020202,0.060606,0.020202,0.050505,0.151515,0.141414,0.26,0.163934,0.333333,0.020202,0.050505,0.020202,0.050505,0.151515,0.141414,0.28,0.101010,0.323232,0.060606,0.020202,0.666667,0.030303,0.565657,0.444444,0.090909,0.808081,0.141414,0.000000,0.000000,0.0,0.0,0.00,0.000000,0.060606,0.000000,0.020202,0.266667,0.421053,0.121212,0.717172,0.707071,0.838384,0.585859,0.818182,0.575758,0.646465,0.575758,1.000000,1.000000,0.000000,0.222222,0.242424,0.040404,0.212121,0.131313,0.046512,0.018182,0.060606,0.000000,0.040404,0.010101,0.000000,0.030303,0.010101,0.000000,0.060606,0.131313,0.010101,0.031250,0.080808,0.181818,0.111111,0.040404,0.030303,0.059701,0.101010,0.070707,0.111111,0.010101,0.060606,0.020202,0.010101,0.161616,0.696970,0.050505,0.020202,0.941176,0.050505,0.050505,0.121212,0.212121,0.189189,0.303030,0.202020,0.144330,0.242424,0.133333,0.333333,0.101010,0.000000,0.000000,0.000000,0.080808,0.151515,0.000000,0.555556,0.101010,0.111111,0.000000,0.000000,0.020202,0.000000,0.032609,0.021277,0.071429,0.020202,0.054545,0.014706,0.010101,0.000000,0.057692,0.00,0.000000,0.0,0.424242,0.393939,0.505051,0.070707,0.272727,0.161616,1.000000,0.929293,0.535354,0.161290,0.192308,0.020202,0.262626,0.565657,0.979798,1.000000,0.000000,0.000000,0.000000,0.969697,0.000000,0.040404,0.000000,0.0,0.000000,1.000000,0.000000,1.000000,1.000000,1.000000,0.952381,0.8,0.060606,0.050505,0.183333,0.146597,0.315789,0.155844,0.003590,0.008475,0.024390,0.010,0.004004,0.025,0.016544,0.014399,0.774510,0.0,0.333333,0.000000,0.250000,1.0,0.000000,0.785714,0.0,0.536082,0.090909,0.818182,0.818182,0.909091,1.000000,0.0,1.000000,0.968750,0.818182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.000014,0.624862,0.333333,0.111111,0.008299,0.000000,0.202020,0.292929,0.333333,0.060606,0.080808,0.011494,0.111111,0.070931,0.085837,0.075389,0.000000,0.020202,0.989899,0.494949,0.515152,0.969697,0.020202,0.000000,0.000000,0.020202,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.020619,0.000000,0.000000,0.000000,0.416667,0.511905,0.547619,0.440476,0.535714,0.583333,0.306667,0.353535,0.404040,0.252525,0.131313,0.202020,0.191919,0.161616,0.131313,0.101010,0.080808,0.151515,0.141414,0.303030,0.222222,0.191919,0.252525,0.101010,0.232323,0.212121,0.353535,0.444444,0.222222,0.060606,0.020202,0.636364,0.090909,0.123288,0.191919,0.281538,0.362857,0.696970,0.696970,0.010101,0.060606,0.050505,0.030303,0.030303,0.030303,0.000000,0.082833,0.091000,0.153846,0.076923,0.787879,0.222222,0.939394,0.070707,0.181818,0.363636,0.767677,0.656566,0.303030,0.868687,0.141414,0.070707,0.04,0.050505,0.111111,0.171717,0.030303,0.226667,0.606061,0.181818,0.000000,0.010101,0.000000,0.000000,0.010101,0.060606,0.181818,0.505051,0.000000,0.040404,0.363636,0.576471,0.566667,0.229508,0.125,0.040404,0.020202,0.242424,0.111111,0.020202,0.030303,0.060606,0.000000,0.020202,0.090909,0.444444,0.000000,0.431644,0.587968,0.167333,0.194667,0.194667,0.226667,0.066329,0.323232,0.181818,0.202020,0.151515,0.121212,0.020202,0.00,0.000000,0.010101,0.202020,0.191919,0.242424,0.181818,0.161616,0.020202,0.00,0.000000,0.010101,0.282828,0.080808,0.313131,0.111111,0.383838,0.626263,0.080808,0.747475,0.222222,0.000000,0.000000,0.0,0.0,0.00,0.020202,0.020202,0.010101,0.212121,0.211111,0.315789,0.060606,0.616162,0.656566,0.737374,0.595960,0.707071,0.565657,0.787879,0.626263,0.828283,1.000000,0.040404,0.101010,0.050505,0.020202,0.060606,0.121212,0.000000,0.018182,0.090909,0.050505,0.181818,0.202020,0.050505,0.070707,0.060606,0.000000,0.111111,0.333333,0.040404,0.046875,0.020202,0.121212,0.030303,0.030303,0.020202,0.000000,0.070707,0.080808,0.030303,0.030303,0.060606,0.070707,0.010101,0.080808,0.747475,0.030303,0.010101,0.705882,0.222222,0.202020,0.282828,0.161616,0.162162,0.050505,0.030303,0.010309,0.232323,0.033333,0.222222,0.060606,0.000000,0.000000,0.000000,0.101010,0.212121,0.000000,0.282828,0.232323,0.323232,0.080808,0.012048,0.141414,0.032258,0.054348,0.000000,0.000000,0.070707,0.000000,0.000000,0.000000,0.000000,0.000000,0.02,0.000000,0.0,0.020202,0.848485,0.969697,0.030303,0.000000,0.000000,0.929293,0.656566,0.292929,0.290323,0.423077,0.030303,0.121212,0.232323,0.505051,0.696970,0.313131,0.000000,0.000000,0.000000,0.060606,0.353535,0.444444,0.0,0.151515,0.222222,0.777778,0.171717,0.979798,0.929293,0.428571,0.4,0.060606,0.050505,0.416667,0.308901,0.315789,0.168831,0.019954,0.110169,0.341463,0.002,0.002202,0.005,0.011029,0.006204,0.078617,1.0,1.000000,0.967213,0.807692,1.0,0.333333,0.500000,0.0,0.000000,0.090909,0.727273,0.909091,0.772727,0.545455,0.0,1.000000,0.937500,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.000000,0.711340,0.000000,0.444444,0.008299,0.000000,0.232323,0.141414,0.313131,0.030303,0.000000,0.034483,0.000000,0.006484,0.006732,0.006186,0.000000,0.080808,0.929293,0.545455,0.464646,0.616162,0.000000,0.000000,0.111111,0.323232,0.272727,0.027778,0.000000,0.000000,0.000000,0.0,0.0,0.319588,0.000000,0.000000,0.011628,0.380952,0.476190,0.523810,0.404762,0.511905,0.559524,0.333333,0.454545,0.353535,0.202020,0.151515,0.252525,0.171717,0.171717,0.121212,0.070707,0.070707,0.202020,0.171717,0.303030,0.141414,0.191919,0.252525,0.111111,0.232323,0.232323,0.272727,0.505051,0.303030,0.151515,0.080808,0.636364,0.090909,0.082192,0.232323,0.306154,0.404286,0.858586,0.838384,0.030303,0.040404,0.010101,0.000000,0.020202,0.000000,0.020202,0.166667,0.210500,0.153846,0.076923,0.484848,0.525253,0.939394,0.070707,0.060606,0.363636,0.737374,0.616162,0.303030,0.848485,0.161616,0.060606,0.06,0.030303,0.212121,0.121212,0.040404,0.173333,0.363636,0.131313,0.000000,0.000000,0.000000,0.101010,0.252525,0.505051,0.696970,0.929293,0.101010,0.151515,0.424242,0.647059,0.555556,0.245902,0.125,0.040404,0.000000,0.090909,0.424242,0.040404,0.000000,0.050505,0.010101,0.080808,0.171717,0.343434,0.997863,0.102919,0.978434,0.257333,0.258667,0.264000,0.282000,0.086693,0.272727,0.121212,0.040404,0.262626,0.222222,0.050505,0.00,0.000000,0.040404,0.353535,0.050505,0.060606,0.121212,0.303030,0.060606,0.00,0.000000,0.050505,0.222222,0.141414,0.262626,0.202020,0.464646,0.545455,0.030303,0.585859,0.363636,0.000000,0.000000,0.0,0.0,0.00,0.060606,0.000000,0.000000,0.171717,0.144444,0.197368,0.000000,0.434343,0.696970,0.818182,0.535354,0.686869,0.454545,0.333333,0.313131,0.000000,1.000000,0.232323,0.171717,0.030303,0.000000,0.060606,0.060606,0.000000,0.000000,0.131313,0.424242,0.121212,0.000000,0.000000,0.000000,0.424242,0.000000,0.060606,0.030303,0.000000,0.000000,0.000000,0.232323,0.030303,0.030303,0.060606,0.000000,0.030303,0.030303,0.030303,0.030303,0.030303,0.000000,0.030303,0.060606,0.878788,0.000000,0.000000,0.705882,0.282828,0.121212,0.141414,0.272727,0.270270,0.030303,0.050505,0.000000,0.191919,0.033333,0.236111,0.000000,0.000000,0.000000,0.000000,0.131313,0.232323,0.000000,0.141414,0.404040,0.313131,0.161616,0.000000,0.010101,0.000000,0.141304,0.000000,0.000000,0.040404,0.000000,0.000000,0.000000,0.069767,0.000000,0.00,0.000000,0.0,0.292929,0.676768,0.565657,0.414141,0.030303,0.000000,0.949495,0.434343,0.272727,0.129032,0.730769,0.000000,0.101010,0.191919,0.393939,0.454545,0.555556,0.000000,0.000000,0.454545,0.222222,0.171717,0.000000,0.0,0.161616,0.232323,0.777778,0.222222,0.939394,0.898990,0.761905,0.4,0.060606,0.060606,0.433333,0.324607,0.315789,0.168831,0.010135,0.063559,0.170732,0.002,0.001201,0.010,0.008272,0.005534,0.899764,1.0,1.000000,0.655738,0.826923,1.0,0.333333,0.285714,0.0,0.288660,0.000000,0.545455,0.909091,0.863636,0.909091,0.0,1.000000,0.906250,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.000000,0.793814,0.333333,0.222222,0.248963,0.010101,0.282828,0.090909,0.535354,0.262626,0.030303,0.022989,1.000000,0.025532,0.026382,0.021495,1.000000,0.000000,0.000000,0.464646,0.545455,0.020202,0.989899,0.000000,0.000000,0.010101,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.392857,0.535714,0.595238,0.428571,0.547619,0.595238,0.360000,0.343434,0.434343,0.232323,0.141414,0.212121,0.131313,0.151515,0.202020,0.121212,0.050505,0.131313,0.151515,0.343434,0.191919,0.191919,0.313131,0.070707,0.272727,0.161616,0.262626,0.575758,0.363636,0.242424,0.141414,0.424242,0.171717,0.123288,0.333333,0.361538,0.461429,1.000000,0.989899,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.096000,0.099000,0.307692,0.230769,0.909091,0.101010,0.979798,0.030303,0.000000,0.424242,0.828283,0.494949,0.222222,0.929293,0.080808,0.202020,0.06,0.171717,0.090909,0.232323,0.010101,0.013333,0.010101,0.000000,0.381818,0.585859,0.191919,0.000000,0.010101,0.020202,0.161616,0.676768,0.000000,0.020202,0.454545,0.611765,0.588889,0.262295,0.150,0.000000,0.000000,0.000000,0.090909,0.000000,0.000000,0.000000,0.252525,0.585859,0.747475,0.838384,0.534188,0.195084,0.599319,0.160000,0.166667,0.195333,0.214000,0.056359,0.242424,0.292929,0.232323,0.131313,0.040404,0.040404,0.00,0.000000,0.020202,0.212121,0.303030,0.222222,0.161616,0.040404,0.050505,0.00,0.000000,0.030303,0.353535,0.080808,0.111111,0.141414,0.202020,0.808081,0.040404,0.737374,0.222222,0.010101,0.010101,0.0,0.0,0.00,0.030303,0.010101,0.020202,0.010101,0.266667,0.355263,0.030303,0.767677,0.616162,0.737374,0.515152,0.656566,0.494949,0.808081,0.313131,0.818182,1.000000,0.101010,0.171717,0.080808,0.020202,0.060606,0.151515,0.069767,0.127273,0.222222,0.020202,0.090909,0.000000,0.070707,0.020202,0.020202,0.000000,0.060606,0.010101,0.050505,0.031250,0.020202,0.121212,0.020202,0.070707,0.060606,0.059701,0.151515,0.292929,0.040404,0.030303,0.262626,0.030303,0.020202,0.070707,0.494949,0.121212,0.010101,0.705882,0.161616,0.202020,0.303030,0.131313,0.081081,0.121212,0.050505,0.020619,0.262626,0.033333,0.277778,0.070707,0.010309,0.010101,0.012821,0.151515,0.282828,0.133333,0.090909,0.161616,0.535354,0.202020,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.010101,0.656566,1.000000,0.000000,0.000000,0.000000,0.909091,0.454545,0.181818,0.806452,0.653846,0.000000,0.010101,0.030303,0.060606,0.333333,0.676768,0.000000,0.000000,0.090909,0.141414,0.727273,0.030303,0.0,0.000000,1.000000,0.010101,0.212121,1.000000,0.969697,0.285714,0.4,0.070707,0.111111,0.700000,0.570681,0.526316,0.311688,0.025443,0.152542,0.195122,0.003,0.002002,0.015,0.012868,0.005586,0.037079,1.0,0.333333,0.409836,0.288462,1.0,0.333333,0.214286,0.0,0.206186,0.000000,0.818182,0.818182,0.954545,0.000000,0.5,0.000000,0.822917,0.181818,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0.000014,0.624862,0.666667,1.000000,0.000000,0.141414,0.363636,0.474747,0.111111,0.070707,0.080808,0.149425,1.000000,0.277403,0.305142,0.283507,1.000000,0.000000,0.000000,0.505051,0.505051,0.787879,0.101010,0.060606,0.040404,0.050505,0.000000,0.000000,0.000000,0.014925,0.021739,0.0,0.0,0.030928,0.017544,0.000000,0.023256,0.333333,0.416667,0.452381,0.345238,0.452381,0.488095,0.400000,0.454545,0.373737,0.181818,0.161616,0.313131,0.252525,0.151515,0.080808,0.030303,0.010101,0.202020,0.181818,0.313131,0.181818,0.131313,0.070707,0.030303,0.050505,0.202020,0.323232,0.484848,0.282828,0.101010,0.040404,0.585859,0.151515,0.041096,0.242424,0.300000,0.387143,0.545455,0.383838,0.080808,0.323232,0.242424,0.141414,0.000000,0.000000,0.000000,0.164667,0.170833,0.461538,0.461538,0.565657,0.444444,0.898990,0.111111,0.030303,0.444444,0.727273,0.565657,0.323232,0.838384,0.171717,0.121212,0.06,0.101010,0.161616,0.151515,0.080808,0.253333,0.555556,0.050505,0.054545,0.060606,0.000000,0.020202,0.101010,0.494949,0.737374,0.929293,0.000000,0.040404,0.404040,0.611765,0.588889,0.245902,0.100,0.242424,0.080808,0.131313,0.141414,0.151515,0.121212,0.030303,0.696970,0.848485,0.929293,0.979798,0.040598,0.000000,0.843360,0.288667,0.320667,0.332667,0.356667,0.107762,0.111111,0.131313,0.131313,0.212121,0.222222,0.131313,0.08,0.032787,0.020202,0.090909,0.111111,0.111111,0.212121,0.242424,0.161616,0.08,0.020202,0.020202,0.090909,0.060606,0.707071,0.060606,0.636364,0.373737,0.272727,0.767677,0.151515,0.020202,0.020202,0.0,0.0,0.00,0.050505,0.020202,0.010101,0.020202,0.200000,0.263158,0.020202,0.696970,0.818182,0.898990,0.737374,0.838384,0.696970,0.696970,0.575758,0.616162,0.949495,0.070707,0.151515,0.161616,0.050505,0.101010,0.212121,0.000000,0.054545,0.111111,0.010101,0.111111,0.020202,0.030303,0.030303,0.010101,0.061538,0.060606,0.040404,0.070707,0.046875,0.030303,0.171717,0.070707,0.050505,0.030303,0.014925,0.090909,0.080808,0.070707,0.141414,0.070707,0.080808,0.131313,0.060606,0.595960,0.070707,0.000000,0.800000,0.020202,0.070707,0.282828,0.333333,0.216216,0.151515,0.080808,0.030928,0.262626,0.066667,0.263889,0.080808,0.082474,0.151515,0.025641,0.202020,0.353535,0.166667,0.484848,0.151515,0.111111,0.252525,0.012048,0.050505,0.032258,0.097826,0.000000,0.000000,0.040404,0.018182,0.014706,0.010101,0.000000,0.000000,0.02,0.037037,0.0,0.040404,0.262626,0.929293,0.030303,0.020202,0.040404,0.959596,0.606061,0.191919,0.096774,0.269231,0.000000,0.070707,0.323232,0.787879,0.919192,0.090909,0.066667,0.080645,0.868687,0.010101,0.121212,0.000000,0.0,0.010101,0.939394,0.070707,0.989899,1.000000,0.989899,0.761905,0.8,0.040404,0.030303,0.083333,0.052356,0.263158,0.142857,0.001267,0.000000,0.000000,0.025,0.004004,0.025,0.008272,0.023745,0.962399,0.0,0.000000,0.180328,0.500000,1.0,0.333333,0.928571,0.0,0.000000,0.090909,0.954545,0.090909,0.954545,0.090909,0.5,0.090909,1.000000,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
95408,0.000014,0.484536,1.000000,1.000000,0.004149,0.000000,0.313131,0.434343,0.191919,0.040404,0.010101,0.000000,1.000000,0.012705,0.013549,0.010197,0.969697,0.000000,0.040404,0.515152,0.494949,0.919192,0.030303,0.000000,0.020202,0.060606,0.045455,0.000000,0.010101,0.000000,0.000000,0.0,0.0,0.051546,0.000000,0.000000,0.011628,0.357143,0.476190,0.476190,0.333333,0.488095,0.511905,0.520000,0.333333,0.424242,0.252525,0.090909,0.191919,0.434343,0.171717,0.070707,0.040404,0.020202,0.101010,0.161616,0.353535,0.232323,0.161616,0.090909,0.020202,0.070707,0.101010,0.202020,0.707071,0.525253,0.252525,0.060606,0.737374,0.040404,0.027397,0.202020,0.472308,0.494286,0.898990,0.888889,0.010101,0.010101,0.000000,0.000000,0.000000,0.000000,0.000000,0.279833,0.287167,0.230769,0.230769,0.888889,0.121212,0.979798,0.030303,0.000000,0.636364,0.898990,0.858586,0.606061,0.969697,0.040404,0.020202,0.02,0.010101,0.070707,0.050505,0.010101,0.373333,0.585859,0.050505,0.036364,0.020202,0.000000,0.181818,0.717172,0.888889,0.919192,0.979798,0.050505,0.010101,0.777778,0.964706,0.833333,0.327869,0.100,0.010101,0.000000,0.101010,0.070707,0.010101,0.000000,0.050505,0.161616,0.262626,0.444444,0.797980,0.358974,0.308756,0.701476,0.537333,0.557333,0.534667,0.566000,0.152060,0.080808,0.090909,0.070707,0.060606,0.111111,0.292929,0.26,0.032787,0.151515,0.101010,0.000000,0.080808,0.020202,0.131313,0.353535,0.32,0.030303,0.131313,0.080808,0.050505,0.616162,0.070707,0.838384,0.171717,0.363636,0.808081,0.040404,0.040404,0.040404,0.0,0.0,0.00,0.060606,0.050505,0.030303,0.030303,0.277778,0.421053,0.101010,0.616162,0.737374,0.888889,0.565657,0.878788,0.525253,0.484848,0.434343,1.000000,0.000000,0.000000,0.181818,0.313131,0.000000,0.131313,0.171717,0.000000,0.018182,0.020202,0.040404,0.060606,0.000000,0.030303,0.050505,0.010101,0.123077,0.080808,0.090909,0.030303,0.109375,0.090909,0.131313,0.090909,0.060606,0.000000,0.000000,0.040404,0.070707,0.131313,0.030303,0.040404,0.010101,0.000000,0.040404,0.787879,0.121212,0.000000,0.941176,0.010101,0.060606,0.121212,0.242424,0.189189,0.363636,0.141414,0.092784,0.353535,0.166667,0.444444,0.070707,0.000000,0.000000,0.000000,0.212121,0.313131,0.266667,0.434343,0.050505,0.191919,0.151515,0.012048,0.121212,0.032258,0.152174,0.000000,0.000000,0.040404,0.000000,0.000000,0.010101,0.000000,0.000000,0.00,0.037037,0.0,0.020202,0.515152,0.949495,0.030303,0.000000,0.020202,1.000000,0.848485,0.292929,0.129032,0.134615,0.020202,0.555556,0.909091,0.949495,0.949495,0.060606,0.000000,0.000000,0.828283,0.020202,0.161616,0.000000,0.0,0.000000,0.696970,0.313131,0.676768,1.000000,0.979798,0.857143,1.0,0.030303,0.020202,0.050000,0.031414,0.157895,0.090909,0.000739,0.000000,0.000000,0.020,0.003003,0.020,0.008272,0.018738,0.639828,1.0,0.000000,0.016393,0.442308,1.0,0.000000,0.928571,0.0,0.515464,0.000000,0.954545,0.181818,0.954545,0.181818,0.5,0.181818,1.000000,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95409,0.000014,0.608247,0.666667,1.000000,0.000000,0.000000,0.181818,0.464646,0.202020,0.070707,0.232323,0.000000,1.000000,0.005593,0.005512,0.005790,1.000000,0.000000,0.000000,0.535354,0.474747,0.828283,0.141414,0.000000,0.010101,0.090909,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.092784,0.000000,0.000000,0.000000,0.333333,0.416667,0.440476,0.357143,0.488095,0.523810,0.426667,0.464646,0.383838,0.171717,0.131313,0.343434,0.212121,0.090909,0.090909,0.090909,0.040404,0.212121,0.171717,0.323232,0.202020,0.101010,0.181818,0.070707,0.171717,0.272727,0.292929,0.444444,0.313131,0.141414,0.050505,0.454545,0.191919,0.068493,0.313131,0.275385,0.382857,0.969697,0.959596,0.010101,0.020202,0.010101,0.000000,0.000000,0.000000,0.000000,0.062667,0.062833,0.307692,0.230769,0.666667,0.343434,0.959596,0.050505,0.101010,0.373737,0.646465,0.434343,0.212121,0.808081,0.202020,0.161616,0.04,0.141414,0.212121,0.202020,0.090909,0.266667,0.494949,0.121212,0.127273,0.070707,0.010101,0.000000,0.000000,0.000000,0.010101,0.090909,0.000000,0.020202,0.454545,0.600000,0.600000,0.229508,0.125,0.020202,0.000000,0.000000,0.313131,0.020202,0.000000,0.000000,0.030303,0.343434,0.787879,0.919192,0.431624,0.093702,0.625426,0.175333,0.176000,0.212667,0.230000,0.069779,0.212121,0.262626,0.202020,0.181818,0.121212,0.000000,0.06,0.000000,0.000000,0.262626,0.181818,0.171717,0.111111,0.212121,0.000000,0.12,0.000000,0.000000,0.101010,0.131313,0.262626,0.262626,0.434343,0.575758,0.030303,0.838384,0.171717,0.000000,0.000000,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.252525,0.188889,0.223684,0.000000,0.696970,0.696970,0.707071,0.696970,0.707071,0.696970,0.777778,0.242424,0.626263,0.000000,0.252525,0.050505,0.131313,0.090909,0.050505,0.222222,0.000000,0.036364,0.141414,0.000000,0.131313,0.090909,0.050505,0.020202,0.000000,0.000000,0.040404,0.141414,0.030303,0.171875,0.000000,0.101010,0.050505,0.020202,0.000000,0.074627,0.060606,0.191919,0.030303,0.191919,0.070707,0.232323,0.000000,0.000000,0.525253,0.181818,0.000000,0.705882,0.050505,0.030303,0.515152,0.232323,0.189189,0.111111,0.000000,0.061856,0.323232,0.133333,0.375000,0.070707,0.000000,0.000000,0.000000,0.090909,0.181818,0.000000,0.464646,0.000000,0.202020,0.202020,0.024096,0.080808,0.000000,0.152174,0.000000,0.000000,0.000000,0.018182,0.000000,0.000000,0.000000,0.000000,0.02,0.000000,0.0,0.060606,0.828283,0.929293,0.050505,0.030303,0.000000,0.939394,0.424242,0.121212,0.193548,0.980769,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.979798,0.000000,0.000000,0.000000,0.0,0.040404,1.000000,0.000000,1.000000,1.000000,1.000000,0.238095,0.4,0.030303,0.111111,0.216667,0.151832,0.368421,0.207792,0.004751,0.025424,0.097561,0.003,0.001001,0.010,0.002757,0.007009,0.988852,1.0,0.666667,0.540984,0.557692,1.0,0.666667,0.857143,0.0,0.391753,0.000000,0.954545,0.181818,0.909091,0.000000,0.5,0.818182,0.979167,0.818182,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
95410,0.000000,0.587629,1.000000,1.000000,0.000000,0.000000,0.282828,0.353535,0.202020,0.090909,0.010101,0.011494,0.777778,0.017690,0.018177,0.014349,1.000000,0.000000,0.000000,0.474747,0.535354,0.929293,0.010101,0.010101,0.050505,0.080808,0.000000,0.013889,0.020202,0.000000,0.021739,0.0,0.0,0.051546,0.000000,0.000000,0.034884,0.404762,0.500000,0.535714,0.428571,0.535714,0.583333,0.333333,0.383838,0.404040,0.222222,0.121212,0.212121,0.212121,0.181818,0.121212,0.070707,0.090909,0.131313,0.161616,0.343434,0.202020,0.171717,0.202020,0.040404,0.161616,0.090909,0.262626,0.656566,0.414141,0.171717,0.060606,0.565657,0.090909,0.109589,0.272727,0.403077,0.462857,1.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.050505,0.040404,0.010101,0.403500,0.409833,0.846154,0.769231,0.888889,0.121212,1.000000,0.010101,0.000000,0.444444,0.858586,0.717172,0.363636,0.848485,0.161616,0.080808,0.04,0.060606,0.090909,0.121212,0.060606,0.253333,0.565657,0.161616,0.000000,0.000000,0.000000,0.898990,0.969697,1.000000,1.000000,1.000000,0.090909,0.000000,0.909091,0.764706,0.755556,0.295082,0.125,0.000000,0.000000,0.000000,0.121212,0.000000,0.000000,0.000000,0.888889,0.888889,0.909091,0.919192,0.933226,0.019969,0.911464,0.368000,0.362667,0.378667,0.370667,0.091381,0.070707,0.040404,0.111111,0.181818,0.383838,0.151515,0.10,0.049180,0.000000,0.040404,0.060606,0.151515,0.191919,0.383838,0.131313,0.08,0.030303,0.000000,0.252525,0.020202,0.464646,0.030303,0.434343,0.575758,0.090909,0.808081,0.111111,0.000000,0.000000,0.0,0.0,0.04,0.020202,0.060606,0.000000,0.242424,0.200000,0.368421,0.111111,0.525253,0.737374,0.888889,0.606061,0.858586,0.575758,0.707071,0.545455,1.000000,1.000000,0.000000,0.141414,0.161616,0.060606,0.161616,0.171717,0.000000,0.036364,0.121212,0.010101,0.111111,0.020202,0.000000,0.020202,0.010101,0.000000,0.020202,0.222222,0.040404,0.093750,0.040404,0.191919,0.040404,0.070707,0.020202,0.059701,0.060606,0.070707,0.090909,0.040404,0.090909,0.010101,0.010101,0.070707,0.727273,0.080808,0.020202,0.823529,0.070707,0.060606,0.202020,0.353535,0.324324,0.151515,0.050505,0.061856,0.292929,0.133333,0.291667,0.101010,0.000000,0.000000,0.000000,0.131313,0.282828,0.033333,0.353535,0.181818,0.202020,0.080808,0.000000,0.030303,0.032258,0.097826,0.000000,0.000000,0.020202,0.109091,0.014706,0.020202,0.000000,0.000000,0.00,0.000000,0.0,0.141414,0.505051,0.838384,0.080808,0.040404,0.050505,1.000000,0.858586,0.434343,0.290323,0.480769,0.000000,0.000000,0.060606,0.171717,1.000000,0.010101,0.000000,0.000000,1.000000,0.000000,0.010101,0.000000,0.0,0.000000,1.000000,0.000000,1.000000,1.000000,1.000000,0.571429,0.6,0.060606,0.030303,0.583333,0.643979,0.473684,0.389610,0.051204,0.169492,0.439024,0.005,0.003203,0.018,0.003676,0.010875,0.024466,1.0,1.000000,0.163934,0.442308,0.5,0.000000,0.214286,0.0,0.412371,0.363636,0.681818,0.909091,0.954545,0.636364,1.0,0.000000,0.895833,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [240]:

def variance_threshold(var_threshold,X_normalized):
    
    # use the estimator with a low threshold (minimum value of variance we want in out dataset)
    selection = VarianceThreshold(threshold=(var_threshold))

    # Fit
    selection = selection.fit(X_normalized)
    
    # Subset the DataFrame
    data_variance = selection.transform(X_normalized)
    data_variance = pd.DataFrame(data_variance)

    print('Original set: ', X_normalized.shape)
    print('Variance Threshold set: ', data_variance.shape)
    
    # Get list of features removed
    var_list = list(selection.get_support())
    
    drop_columns = [col[0] for col in zip(X_normalized.columns, var_list) if col[1] == False]
    
    return drop_columns

In [241]:
drop_columns = variance_threshold(0.02,X_normalized)
#drop_columns

Original set:  (95412, 354)
Variance Threshold set:  (95412, 114)


In [242]:
def drop_features(df, drop_columns):
    X_normalized_cleaned = df.drop(drop_columns, axis = 1)
    
    print('Original set: ', X_normalized.shape)
    print('New set: ', X_normalized_cleaned.shape)
    
    return X_normalized_cleaned

In [243]:
X_normalized_cleaned = drop_features(X_normalized, drop_columns)

Original set:  (95412, 354)
New set:  (95412, 114)


In [244]:
def KBest(X,target,k):
    
    K_best = SelectKBest(chi2, k=k).fit_transform(X, target)
    
    print('Original set: ',X_normalized.shape) # to be adjust according to the project
    print('Variance Threshold set: ',X_normalized_cleaned.shape)
    print('Kbest set: ',K_best.shape)
    
    # feature extraction
    model = SelectKBest(chi2, k=25).fit(X, target)
    df = pd.DataFrame(data = model.scores_, columns=['score'])
    df['column_name']= X.columns

    # summarize selected features
    display(df.sort_values(by = ['score'],ascending = False).head(k))
    
    # Add columns to drop to a list
    cols = df.sort_values(by = ['score'],ascending = False).head(k)['column_name']
    drop_columns = list(cols)
    
    
    return drop_columns

In [245]:
drop_columns = KBest(X_normalized_cleaned,y,10)

Original set:  (95412, 354)
Variance Threshold set:  (95412, 114)
Kbest set:  (95412, 10)


Unnamed: 0,score,column_name
77,210.29081,RFA_2F
106,92.214021,RFA_2A_G
104,87.261743,RFA_2A_E
87,50.43915,LASTDATE_YR
105,47.49662,RFA_2A_F
28,26.474583,HVP1
29,25.757492,HVP2
30,21.54508,HVP3
33,19.261394,HVP6
31,14.874056,HVP4


In [246]:
X_normalized_cleaned_2 = drop_features(X_normalized_cleaned, drop_columns)
X_normalized_cleaned_2.shape

Original set:  (95412, 354)
New set:  (95412, 104)


(95412, 104)

In [247]:
# Recursive Feature Elimination

def RFE_regression(model,n,X,y):
    
    # define the method
    method = model(max_iter = 400)
    rfe = RFE(method, n_features_to_select=n, verbose=False)
    
    # fit the model
    rfe.fit(X, y)
    
    # Summarize features
    df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
    df['Column_name'] = pd.DataFrame(X).columns
    df = df[df['Rank']==1]
    
    # Create a list with columns to keep
    columns_lst = list(df['Column_name'].values)
    
    # Show only most important features
    df2 = X[[*columns_lst]]
    
    
    print('Original set: ',X_normalized.shape) # to be adjust according to the project
    print('Variance Threshold set: ',X_normalized_cleaned.shape)
    print('Kbest set: ',X_normalized_cleaned_2.shape)
    print('RFE set: ',df2.shape)
    
    
    return df2
    

In [249]:
X_normalized_cleaned_RFE = RFE_regression(LogisticRegression,50,X_normalized_cleaned_2,y)


(95412, 50)

## Training and test data