In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
pd.set_option('display.max_columns',None)
categorical = pd.read_csv('categorical.csv')
numerical = pd.read_csv('numerical.csv')
target = pd.read_csv('target.csv')


In [3]:
X = pd.concat([categorical, numerical], axis=1)
y = target['TARGET_B']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [5]:
X_train_num = X_train.select_dtypes(np.number)
X_train_cat = X_train.select_dtypes(object)
X_test_num = X_test.select_dtypes(np.number)
X_test_cat = X_test.select_dtypes(object)

In [6]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

In [7]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(drop='first')
X_train_cat_encoded = enc.fit_transform(X_train_cat).toarray()
X_test_cat_encoded = enc.transform(X_test_cat).toarray()
column_names = enc.get_feature_names_out(X_train_cat.columns)

In [8]:
X_train_scaled = pd.concat([pd.DataFrame(X_train_num_scaled, columns=X_train_num.columns), pd.DataFrame(X_train_cat_encoded, columns = column_names)], axis=1)
X_test_scaled = pd.concat([pd.DataFrame(X_test_num_scaled, columns=X_test_num.columns), pd.DataFrame(X_test_cat_encoded, columns = column_names)], axis=1)

# Managing imbalance in the dataset

In [9]:
y_train = y_train.reset_index().drop(['index'], axis=1)

In [10]:
trainset= pd.concat([X_train_scaled, y_train], axis=1)
trainset

Unnamed: 0,CLUSTER,DATASRCE,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,FEDGOV,WEALTH2,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,ETH3,ETH4,ETH5,ETH6,ETH7,ETH8,ETH9,ETH10,ETH11,ETH12,ETH13,ETH14,ETH15,ETH16,AGE901,AGE902,AGE903,AGE904,AGE905,AGE906,AGE907,CHIL1,CHIL2,CHIL3,AGEC1,AGEC2,AGEC3,AGEC4,AGEC5,AGEC6,AGEC7,CHILC1,CHILC2,CHILC3,CHILC4,CHILC5,HHAGE1,HHAGE2,HHAGE3,HHN1,HHN2,HHN3,HHN4,HHN5,HHN6,MARR1,MARR2,MARR3,MARR4,HHP1,HHP2,DW1,DW2,DW3,DW4,DW5,DW6,DW7,DW8,DW9,HV1,HV2,HV3,HV4,HU1,HU2,HU3,HU4,HU5,HHD1,HHD2,HHD3,HHD4,HHD5,HHD6,HHD7,HHD8,HHD9,HHD10,HHD11,HHD12,ETHC1,ETHC2,ETHC3,ETHC4,ETHC5,ETHC6,HVP1,HVP2,HVP3,HVP4,HVP5,HVP6,HUR1,HUR2,RHP1,RHP2,RHP3,RHP4,HUPA1,HUPA2,HUPA3,HUPA4,HUPA5,HUPA6,HUPA7,RP1,RP2,RP3,RP4,MSA,ADI,DMA,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,IC16,IC17,IC18,IC19,IC20,IC21,IC22,IC23,HHAS1,HHAS2,HHAS3,HHAS4,MC1,MC2,MC3,TPE1,TPE2,TPE3,TPE4,TPE5,TPE6,TPE7,TPE8,TPE9,PEC1,PEC2,TPE10,TPE11,TPE12,TPE13,LFC1,LFC2,LFC3,LFC4,LFC5,LFC6,LFC7,LFC8,LFC9,LFC10,OCC1,OCC2,OCC3,OCC4,OCC5,OCC6,OCC7,OCC8,OCC9,OCC10,OCC11,OCC12,OCC13,EIC1,EIC2,EIC3,EIC4,EIC5,EIC6,EIC7,EIC8,EIC9,EIC10,EIC11,EIC12,EIC13,EIC14,EIC15,EIC16,OEDC1,OEDC2,OEDC3,OEDC4,OEDC5,OEDC6,OEDC7,EC1,EC2,EC3,EC4,EC5,EC6,EC7,EC8,SEC1,SEC2,SEC3,SEC4,SEC5,AFC1,AFC2,AFC3,AFC4,AFC5,AFC6,VC1,VC2,VC3,VC4,ANC1,ANC2,ANC3,ANC4,ANC5,ANC6,ANC7,ANC8,ANC9,ANC10,ANC11,ANC12,ANC13,ANC14,ANC15,POBC1,POBC2,LSC1,LSC2,LSC3,LSC4,VOC1,VOC2,VOC3,HC1,HC2,HC3,HC4,HC5,HC6,HC7,HC8,HC9,HC10,HC11,HC12,HC13,HC14,HC15,HC16,HC17,HC18,HC19,HC20,HC21,MHUC1,MHUC2,AC1,AC2,CARDPROM,NUMPROM,CARDPM12,NUMPRM12,RAMNTALL,NGIFTALL,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,STATE_WI,STATE_other,HOMEOWNR_U,GENDER_M,GENDER_other,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,TARGET_B
0,0.442308,1.0,0.000000,0.214286,0.0,0.237113,0.000000,0.636364,0.090909,0.681818,1.000000,0.5,0.000000,0.916667,0.181818,0.000017,0.762887,0.500000,0.666667,0.008299,0.000000,0.313131,0.101010,0.686869,0.060606,0.121212,0.034483,1.000000,0.007153,0.009129,0.007683,1.000000,0.000000,0.000000,0.474747,0.535354,0.757576,0.212121,0.000000,0.020202,0.020202,0.0,0.000000,0.000000,0.000000,0.000000,0.021277,0.000000,0.000000,0.017544,0.0,0.023256,0.404762,0.500000,0.547619,0.428571,0.535714,0.583333,0.333333,0.363636,0.474747,0.171717,0.131313,0.212121,0.191919,0.131313,0.151515,0.121212,0.060606,0.141414,0.141414,0.353535,0.212121,0.161616,0.282828,0.090909,0.252525,0.212121,0.383838,0.424242,0.232323,0.101010,0.040404,0.616162,0.101010,0.095890,0.222222,0.272308,0.371429,0.777778,0.757576,0.028571,0.222222,0.202020,0.101010,0.000000,0.000000,0.0,0.105167,0.107333,0.230769,0.230769,0.717172,0.292929,0.939394,0.070707,0.000000,0.343434,0.757576,0.595960,0.242424,0.888889,0.121212,0.101010,0.04,0.080808,0.090909,0.212121,0.040404,0.186667,0.434343,0.191919,0.162791,0.141414,0.012346,0.000000,0.010101,0.040404,0.232323,0.757576,0.000000,0.000000,0.525253,0.635294,0.633333,0.229508,0.100,0.161616,0.060606,0.000000,0.111111,0.090909,0.090909,0.000000,0.030303,0.080808,0.808081,0.929293,0.337607,0.327189,0.643587,0.225333,0.220667,0.238000,0.228667,0.068232,0.101010,0.313131,0.131313,0.212121,0.212121,0.050505,0.00,0.000000,0.000000,0.080808,0.333333,0.161616,0.212121,0.161616,0.060606,0.00,0.000000,0.000000,0.414141,0.070707,0.575758,0.070707,0.343434,0.666667,0.111111,0.757576,0.202020,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.050505,0.000000,0.050505,0.155556,0.250000,0.050505,0.454545,0.676768,0.666667,0.686869,0.636364,0.686869,0.909091,0.595960,1.000000,1.000000,0.000000,0.272727,0.202020,0.020202,0.101010,0.171717,0.000000,0.000000,0.070707,0.000000,0.040404,0.080808,0.028571,0.030303,0.000000,0.000000,0.020202,0.141414,0.100000,0.078125,0.020202,0.222222,0.030303,0.000000,0.000000,0.000000,0.030303,0.282828,0.070707,0.090909,0.060606,0.121212,0.030303,0.040404,0.595960,0.161616,0.0,0.823529,0.050505,0.101010,0.292929,0.121212,0.162162,0.252525,0.141414,0.072165,0.222222,0.033333,0.263889,0.080808,0.000000,0.000000,0.000000,0.211268,0.313131,0.066667,0.101010,0.303030,0.686869,0.000000,0.000000,0.060606,0.000000,0.012048,0.0,0.0,0.122449,0.018182,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.011111,0.676768,0.989899,0.020202,0.000000,0.000000,0.969697,0.666667,0.131313,0.322581,0.615385,0.000000,0.050505,0.050505,0.141414,0.464646,0.545455,0.0,0.000000,0.212121,0.030303,0.262626,0.474747,0.0,0.030303,1.000000,0.000000,1.000000,1.000000,0.909091,0.285714,0.4,0.090909,0.080808,0.366667,0.230366,0.263158,0.129870,0.008658,0.063559,0.195122,0.0010,0.001001,0.007,0.003676,0.004720,0.127215,1.0,0.666667,0.508197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1.000000,0.5,0.666667,0.928571,0.0,0.463918,0.000000,0.954545,0.090909,0.954545,0.090909,0.5,0.090909,1.000000,0.090909,0.000000,0.536082,0.666667,1.000000,0.000000,0.000000,0.292929,0.242424,0.383838,0.070707,0.080808,0.045977,1.000000,0.045856,0.055403,0.044968,0.000000,0.000000,1.000000,0.494949,0.515152,0.757576,0.161616,0.090909,0.000000,0.010101,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.010309,0.000000,0.0,0.000000,0.416667,0.559524,0.607143,0.452381,0.583333,0.630952,0.373333,0.393939,0.404040,0.202020,0.111111,0.181818,0.161616,0.151515,0.141414,0.121212,0.141414,0.141414,0.181818,0.343434,0.191919,0.151515,0.353535,0.141414,0.323232,0.212121,0.333333,0.464646,0.303030,0.141414,0.050505,0.626263,0.080808,0.150685,0.181818,0.290769,0.395714,0.787879,0.777778,0.000000,0.030303,0.030303,0.010101,0.020202,0.020202,0.0,0.042833,0.052667,0.153846,0.076923,0.797980,0.212121,0.898990,0.111111,0.070707,0.393939,0.777778,0.636364,0.303030,0.888889,0.121212,0.090909,0.04,0.070707,0.101010,0.181818,0.020202,0.213333,0.404040,0.181818,0.093023,0.080808,0.049383,0.000000,0.010101,0.020202,0.040404,0.161616,0.000000,0.010101,0.373737,0.576471,0.577778,0.229508,0.125,0.030303,0.000000,0.191919,0.141414,0.020202,0.010101,0.040404,0.000000,0.010101,0.050505,0.303030,0.000000,0.413210,0.761635,0.117333,0.142667,0.148667,0.166000,0.044796,0.454545,0.202020,0.141414,0.141414,0.050505,0.010101,0.00,0.000000,0.000000,0.363636,0.222222,0.171717,0.171717,0.060606,0.010101,0.00,0.000000,0.000000,0.414141,0.141414,0.252525,0.232323,0.353535,0.656566,0.040404,0.767677,0.212121,0.000000,0.000000,0.000000,0.0,0.00,0.010101,0.020202,0.000000,0.444444,0.344444,0.447368,0.131313,0.595960,0.484848,0.585859,0.404040,0.555556,0.363636,0.555556,0.444444,0.575758,0.909091,0.101010,0.070707,0.060606,0.030303,0.121212,0.141414,0.000000,0.036364,0.101010,0.050505,0.131313,0.131313,0.100000,0.070707,0.040404,0.049180,0.080808,0.242424,0.083333,0.031250,0.030303,0.151515,0.050505,0.030303,0.020202,0.016393,0.080808,0.080808,0.030303,0.060606,0.070707,0.080808,0.040404,0.080808,0.696970,0.040404,0.0,0.705882,0.202020,0.212121,0.393939,0.111111,0.081081,0.030303,0.030303,0.010309,0.242424,0.033333,0.277778,0.040404,0.000000,0.000000,0.000000,0.197183,0.292929,0.033333,0.242424,0.171717,0.383838,0.101010,0.000000,0.030303,0.000000,0.048193,0.0,0.0,0.061224,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.000000,0.808081,0.989899,0.010101,0.000000,0.010101,0.888889,0.606061,0.232323,0.290323,0.442308,0.010101,0.050505,0.202020,0.454545,0.595960,0.414141,0.0,0.016129,0.666667,0.121212,0.101010,0.010101,0.0,0.111111,0.969697,0.010101,0.565657,0.979798,0.868687,0.380952,0.4,0.070707,0.070707,0.083333,0.052356,0.263158,0.142857,0.000634,0.000000,0.024390,0.0200,0.003003,0.020,0.008272,0.018738,0.599688,0.0,0.000000,0.967213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
2,0.615385,1.0,0.666667,0.357143,0.0,0.381443,0.727273,0.727273,0.818182,0.909091,0.909091,0.0,0.909091,0.916667,0.727273,0.000017,0.608247,0.666667,0.111111,0.020747,0.000000,0.424242,0.161616,0.626263,0.101010,0.020202,0.011494,0.111111,0.067304,0.091154,0.087309,1.000000,0.000000,0.000000,0.454545,0.555556,0.989899,0.000000,0.000000,0.010101,0.020202,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.017544,0.0,0.011628,0.595238,0.702381,0.738095,0.559524,0.642857,0.678571,0.200000,0.505051,0.353535,0.151515,0.101010,0.161616,0.111111,0.080808,0.151515,0.252525,0.151515,0.232323,0.191919,0.303030,0.151515,0.131313,0.464646,0.141414,0.444444,0.292929,0.494949,0.232323,0.101010,0.040404,0.010101,0.606061,0.111111,0.178082,0.151515,0.221538,0.298571,0.454545,0.383838,0.042857,0.404040,0.373737,0.323232,0.020202,0.020202,0.0,0.102667,0.108167,0.307692,0.230769,0.696970,0.313131,0.838384,0.171717,0.575758,0.202020,0.666667,0.545455,0.131313,0.777778,0.232323,0.070707,0.02,0.060606,0.131313,0.262626,0.060606,0.173333,0.434343,0.424242,0.000000,0.000000,0.000000,0.010101,0.010101,0.060606,0.242424,0.757576,0.000000,0.030303,0.191919,0.494118,0.511111,0.196721,0.100,0.171717,0.232323,0.141414,0.040404,0.080808,0.181818,0.010101,0.020202,0.292929,0.777778,0.969697,0.884615,0.201229,0.611805,0.141333,0.156000,0.165333,0.188000,0.068255,0.333333,0.272727,0.191919,0.131313,0.050505,0.010101,0.02,0.000000,0.000000,0.242424,0.282828,0.222222,0.161616,0.070707,0.020202,0.02,0.000000,0.000000,0.454545,0.050505,0.474747,0.151515,0.676768,0.333333,0.323232,0.868687,0.111111,0.000000,0.000000,0.000000,0.0,0.00,0.010101,0.010101,0.012048,0.353535,0.200000,0.328947,0.080808,0.555556,0.464646,0.525253,0.414141,0.484848,0.404040,0.656566,0.484848,0.676768,1.000000,0.070707,0.151515,0.070707,0.060606,0.141414,0.151515,0.000000,0.036364,0.151515,0.040404,0.111111,0.010101,0.071429,0.060606,0.040404,0.000000,0.090909,0.060606,0.083333,0.031250,0.020202,0.232323,0.070707,0.040404,0.050505,0.032787,0.131313,0.070707,0.080808,0.050505,0.101010,0.020202,0.010101,0.060606,0.757576,0.060606,0.0,0.705882,0.080808,0.191919,0.383838,0.181818,0.162162,0.070707,0.030303,0.030928,0.161616,0.033333,0.152778,0.070707,0.000000,0.000000,0.000000,0.281690,0.424242,0.066667,0.161616,0.141414,0.626263,0.090909,0.012048,0.050505,0.066667,0.132530,0.0,0.0,0.122449,0.181818,0.014706,0.050505,0.000000,0.0,0.052632,0.037037,0.03125,0.055556,0.161616,0.909091,0.020202,0.000000,0.080808,0.959596,0.333333,0.060606,0.096774,0.153846,0.070707,0.343434,0.595960,0.848485,0.929293,0.080808,0.1,0.064516,0.000000,0.020202,0.969697,0.010101,0.0,0.010101,0.949495,0.060606,0.909091,1.000000,0.979798,0.285714,0.4,0.040404,0.090909,0.416667,0.314136,0.315789,0.155844,0.006652,0.059322,0.146341,0.0020,0.001001,0.010,0.004596,0.003853,0.237264,1.0,0.666667,0.786885,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.076923,1.0,0.333333,0.428571,0.0,0.216495,0.000000,0.590909,0.909091,0.818182,0.727273,0.0,1.000000,0.916667,0.909091,0.000017,0.783505,0.833333,0.666667,0.037344,0.010101,0.404040,0.232323,0.414141,0.080808,0.030303,0.045977,0.666667,0.014063,0.017853,0.013728,1.000000,0.000000,0.000000,0.494949,0.515152,0.787879,0.010101,0.010101,0.151515,0.111111,0.0,0.013889,0.050505,0.089552,0.021739,0.021277,0.013889,0.072165,0.017544,0.0,0.034884,0.476190,0.583333,0.630952,0.476190,0.571429,0.619048,0.280000,0.343434,0.444444,0.232323,0.101010,0.161616,0.171717,0.141414,0.222222,0.141414,0.060606,0.151515,0.131313,0.353535,0.212121,0.161616,0.343434,0.080808,0.292929,0.151515,0.383838,0.474747,0.313131,0.141414,0.060606,0.636364,0.080808,0.109589,0.212121,0.295385,0.407143,1.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.342000,0.340333,0.692308,0.615385,0.888889,0.121212,1.000000,0.010101,0.000000,0.323232,0.828283,0.717172,0.272727,0.909091,0.101010,0.050505,0.02,0.030303,0.090909,0.151515,0.030303,0.160000,0.444444,0.212121,0.023256,0.010101,0.000000,0.545455,0.898990,0.979798,1.000000,1.000000,0.010101,0.000000,0.606061,0.635294,0.644444,0.245902,0.125,0.000000,0.000000,0.000000,0.121212,0.000000,0.000000,0.000000,0.898990,0.919192,0.939394,0.939394,0.616987,0.099846,0.916005,0.292667,0.310000,0.333333,0.350667,0.114323,0.050505,0.050505,0.232323,0.272727,0.262626,0.101010,0.04,0.000000,0.020202,0.010101,0.050505,0.222222,0.272727,0.323232,0.090909,0.06,0.000000,0.030303,0.323232,0.060606,0.575758,0.020202,0.313131,0.696970,0.060606,0.818182,0.101010,0.040404,0.020202,0.028169,0.0,0.08,0.010101,0.010101,0.000000,0.131313,0.233333,0.315789,0.030303,0.666667,0.646465,0.707071,0.585859,0.686869,0.585859,0.727273,0.636364,0.767677,1.000000,0.050505,0.121212,0.141414,0.050505,0.070707,0.252525,0.000000,0.072727,0.060606,0.010101,0.121212,0.040404,0.028571,0.070707,0.020202,0.000000,0.060606,0.141414,0.133333,0.093750,0.090909,0.191919,0.060606,0.020202,0.040404,0.032787,0.080808,0.070707,0.000000,0.070707,0.080808,0.030303,0.040404,0.030303,0.787879,0.040404,0.0,0.705882,0.060606,0.131313,0.363636,0.212121,0.297297,0.090909,0.030303,0.030928,0.212121,0.033333,0.208333,0.080808,0.010309,0.010101,0.000000,0.281690,0.404040,0.000000,0.232323,0.272727,0.414141,0.070707,0.000000,0.020202,0.000000,0.060241,0.0,0.0,0.122449,0.145455,0.000000,0.010101,0.219512,0.0,0.052632,0.074074,0.00000,0.188889,0.545455,0.828283,0.040404,0.090909,0.050505,1.000000,0.777778,0.393939,0.612903,0.673077,0.000000,0.000000,0.000000,0.010101,0.141414,0.868687,0.0,0.000000,0.888889,0.030303,0.050505,0.000000,0.0,0.030303,1.000000,0.000000,1.000000,1.000000,1.000000,0.428571,0.4,0.141414,0.090909,0.433333,0.314136,0.315789,0.155844,0.015099,0.033898,0.170732,0.0100,0.004004,0.020,0.003676,0.016180,0.863723,1.0,0.000000,0.131148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,0.500000,1.0,0.333333,0.785714,0.0,0.443299,0.000000,0.863636,0.000000,0.909091,0.272727,0.0,0.272727,0.979167,0.000000,0.000052,0.556701,0.666667,0.222222,0.087137,0.333333,0.272727,0.292929,0.181818,0.121212,0.020202,0.080460,1.000000,0.010040,0.012052,0.010988,0.939394,0.000000,0.070707,0.505051,0.505051,0.888889,0.080808,0.000000,0.030303,0.040404,0.0,0.000000,0.000000,0.014925,0.021739,0.000000,0.000000,0.020619,0.017544,0.0,0.011628,0.309524,0.380952,0.416667,0.321429,0.428571,0.476190,0.400000,0.545455,0.343434,0.121212,0.202020,0.383838,0.161616,0.131313,0.080808,0.040404,0.010101,0.222222,0.242424,0.343434,0.131313,0.070707,0.060606,0.020202,0.060606,0.242424,0.313131,0.454545,0.252525,0.080808,0.010101,0.626263,0.131313,0.041096,0.222222,0.281538,0.364286,0.222222,0.222222,0.014286,0.272727,0.262626,0.252525,0.000000,0.000000,0.0,0.102500,0.100167,0.307692,0.230769,0.686869,0.323232,0.969697,0.040404,0.191919,0.444444,0.696970,0.555556,0.333333,0.838384,0.171717,0.111111,0.02,0.101010,0.212121,0.131313,0.070707,0.320000,0.595960,0.050505,0.069767,0.050505,0.000000,0.000000,0.000000,0.000000,0.070707,0.838384,0.000000,0.030303,0.222222,0.517647,0.533333,0.229508,0.125,0.060606,0.202020,0.515152,0.040404,0.020202,0.242424,0.030303,0.010101,0.161616,0.919192,0.959596,0.632479,0.462366,0.740068,0.164667,0.172000,0.179333,0.182667,0.064267,0.222222,0.292929,0.282828,0.131313,0.050505,0.010101,0.02,0.000000,0.000000,0.212121,0.252525,0.363636,0.080808,0.080808,0.000000,0.04,0.000000,0.000000,0.101010,0.040404,0.434343,0.151515,0.777778,0.232323,0.353535,0.808081,0.151515,0.000000,0.000000,0.000000,0.0,0.00,0.000000,0.060606,0.000000,0.272727,0.155556,0.250000,0.000000,0.505051,0.868687,0.929293,0.797980,0.878788,0.717172,0.636364,0.555556,0.919192,1.000000,0.020202,0.111111,0.040404,0.050505,0.101010,0.232323,0.000000,0.054545,0.202020,0.020202,0.090909,0.030303,0.128571,0.020202,0.020202,0.000000,0.040404,0.070707,0.116667,0.046875,0.060606,0.181818,0.070707,0.101010,0.000000,0.032787,0.111111,0.101010,0.090909,0.050505,0.121212,0.020202,0.070707,0.090909,0.646465,0.070707,0.0,0.723529,0.050505,0.121212,0.313131,0.343434,0.108108,0.141414,0.000000,0.041237,0.424242,0.100000,0.277778,0.232323,0.195876,0.333333,0.051282,0.225352,0.272727,0.200000,0.292929,0.111111,0.181818,0.545455,0.024096,0.030303,0.000000,0.216867,0.0,0.0,0.102041,0.054545,0.000000,0.020202,0.000000,0.0,0.000000,0.000000,0.00000,0.066667,0.393939,0.898990,0.010101,0.030303,0.060606,0.979798,0.484848,0.090909,0.096774,0.307692,0.030303,0.171717,0.505051,0.797980,0.919192,0.090909,0.0,0.000000,0.898990,0.020202,0.101010,0.000000,0.0,0.000000,1.000000,0.000000,1.000000,1.000000,0.989899,0.285714,0.0,0.060606,0.040404,0.183333,0.141361,0.210526,0.103896,0.001795,0.008475,0.024390,0.0050,0.002002,0.015,0.004596,0.009059,0.555564,1.0,0.000000,0.573770,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76324,0.211538,1.0,0.000000,0.214286,0.0,0.278351,0.636364,0.909091,1.000000,0.863636,0.818182,1.0,0.090909,0.895833,1.000000,0.000000,0.711340,0.333333,1.000000,0.020747,0.020202,0.373737,0.101010,0.323232,0.262626,0.010101,0.022989,1.000000,0.011732,0.015919,0.011214,1.000000,0.000000,0.000000,0.505051,0.505051,0.989899,0.000000,0.000000,0.010101,0.040404,0.0,0.000000,0.010101,0.000000,0.000000,0.000000,0.000000,0.020619,0.000000,0.0,0.011628,0.476190,0.547619,0.583333,0.464286,0.559524,0.607143,0.266667,0.333333,0.353535,0.323232,0.131313,0.161616,0.171717,0.191919,0.171717,0.131313,0.050505,0.141414,0.111111,0.292929,0.222222,0.242424,0.282828,0.050505,0.262626,0.090909,0.373737,0.545455,0.333333,0.101010,0.030303,0.656566,0.040404,0.068493,0.262626,0.336923,0.417143,0.898990,0.848485,0.128571,0.090909,0.000000,0.000000,0.000000,0.000000,0.0,0.150333,0.155167,0.384615,0.384615,0.929293,0.080808,1.000000,0.010101,0.000000,0.333333,0.898990,0.808081,0.313131,0.959596,0.050505,0.020202,0.02,0.010101,0.060606,0.121212,0.020202,0.200000,0.636364,0.212121,0.000000,0.000000,0.000000,0.000000,0.010101,0.282828,0.848485,0.989899,0.000000,0.000000,0.808081,0.729412,0.722222,0.262295,0.100,0.090909,0.000000,0.000000,0.030303,0.040404,0.000000,0.000000,0.595960,0.757576,0.979798,1.000000,0.542735,0.170507,0.700341,0.332000,0.344000,0.350000,0.365333,0.097970,0.030303,0.080808,0.161616,0.232323,0.363636,0.101010,0.00,0.065574,0.000000,0.030303,0.050505,0.141414,0.242424,0.383838,0.111111,0.00,0.040404,0.000000,0.262626,0.040404,0.676768,0.020202,0.202020,0.808081,0.010101,0.858586,0.101010,0.000000,0.000000,0.000000,0.0,0.00,0.050505,0.000000,0.024096,0.030303,0.166667,0.223684,0.010101,0.515152,0.717172,0.797980,0.636364,0.767677,0.626263,0.737374,0.696970,1.000000,0.000000,0.000000,0.131313,0.131313,0.040404,0.131313,0.202020,0.000000,0.218182,0.080808,0.000000,0.060606,0.050505,0.028571,0.040404,0.000000,0.000000,0.020202,0.222222,0.083333,0.015625,0.010101,0.161616,0.030303,0.050505,0.020202,0.049180,0.161616,0.070707,0.020202,0.181818,0.262626,0.010101,0.020202,0.000000,0.636364,0.090909,0.0,0.741176,0.040404,0.080808,0.333333,0.313131,0.135135,0.121212,0.060606,0.092784,0.161616,0.033333,0.194444,0.090909,0.010309,0.020202,0.000000,0.281690,0.373737,0.000000,0.101010,0.343434,0.323232,0.222222,0.012048,0.000000,0.000000,0.156627,0.0,0.0,0.000000,0.090909,0.029412,0.303030,0.000000,0.0,0.000000,0.000000,0.00000,0.066667,0.878788,0.828283,0.040404,0.000000,0.141414,1.000000,0.787879,0.343434,0.451613,0.403846,0.020202,0.030303,0.101010,0.454545,0.888889,0.121212,0.0,0.000000,0.969697,0.000000,0.020202,0.020202,0.0,0.000000,1.000000,0.000000,1.000000,1.000000,1.000000,0.428571,0.8,0.080808,0.101010,0.516667,0.497382,0.315789,0.480519,0.018477,0.084746,0.219512,0.0050,0.003003,0.010,0.000919,0.007724,0.061154,1.0,0.333333,0.049180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
76325,0.634615,1.0,0.000000,0.928571,0.0,0.329897,0.000000,0.954545,0.090909,0.954545,0.090909,0.5,0.090909,1.000000,0.090909,0.000034,0.670103,0.333333,0.666667,0.000000,0.000000,0.363636,0.111111,0.626263,0.040404,0.161616,0.000000,1.000000,0.010344,0.014660,0.011722,0.000000,0.000000,1.000000,0.505051,0.505051,0.959596,0.050505,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.464286,0.535714,0.571429,0.464286,0.547619,0.583333,0.266667,0.333333,0.414141,0.262626,0.121212,0.181818,0.181818,0.212121,0.151515,0.121212,0.040404,0.141414,0.151515,0.313131,0.232323,0.171717,0.232323,0.050505,0.212121,0.191919,0.444444,0.373737,0.202020,0.060606,0.010101,0.696970,0.101010,0.054795,0.181818,0.261538,0.350000,0.666667,0.666667,0.000000,0.202020,0.202020,0.121212,0.000000,0.000000,0.0,0.169000,0.194500,0.230769,0.230769,0.858586,0.151515,0.757576,0.252525,0.202020,0.292929,0.797980,0.696970,0.252525,0.898990,0.111111,0.040404,0.02,0.030303,0.121212,0.151515,0.030303,0.186667,0.626263,0.191919,0.023256,0.030303,0.000000,0.101010,0.222222,0.515152,0.727273,0.878788,0.020202,0.050505,0.444444,0.635294,0.600000,0.229508,0.075,0.202020,0.000000,0.141414,0.040404,0.040404,0.020202,0.050505,0.050505,0.202020,0.414141,0.828283,0.000000,0.336406,0.570942,0.245333,0.257333,0.328667,0.364000,0.107023,0.090909,0.141414,0.191919,0.262626,0.141414,0.080808,0.12,0.081967,0.000000,0.050505,0.080808,0.222222,0.252525,0.161616,0.101010,0.14,0.060606,0.000000,0.282828,0.020202,0.575758,0.000000,0.282828,0.727273,0.010101,0.767677,0.212121,0.000000,0.000000,0.000000,0.0,0.00,0.010101,0.020202,0.060241,0.080808,0.177778,0.263158,0.030303,0.545455,0.707071,0.797980,0.626263,0.777778,0.575758,0.919192,0.888889,0.737374,0.000000,0.020202,0.181818,0.161616,0.040404,0.191919,0.171717,0.000000,0.000000,0.050505,0.000000,0.171717,0.000000,0.028571,0.020202,0.000000,0.000000,0.050505,0.212121,0.000000,0.125000,0.000000,0.181818,0.070707,0.030303,0.020202,0.016393,0.121212,0.131313,0.060606,0.040404,0.040404,0.161616,0.000000,0.121212,0.646465,0.040404,0.0,0.823529,0.030303,0.121212,0.212121,0.191919,0.216216,0.232323,0.131313,0.020619,0.212121,0.033333,0.208333,0.080808,0.000000,0.000000,0.000000,0.253521,0.363636,0.000000,0.111111,0.141414,0.626263,0.050505,0.000000,0.141414,0.000000,0.036145,0.0,0.0,0.224490,0.018182,0.000000,0.000000,0.000000,0.0,0.263158,0.000000,0.00000,0.000000,0.747475,0.989899,0.000000,0.000000,0.020202,1.000000,0.818182,0.414141,0.354839,0.307692,0.020202,0.151515,0.303030,0.676768,0.949495,0.060606,0.0,0.032258,0.030303,0.303030,0.636364,0.020202,0.0,0.020202,0.585859,0.424242,0.222222,1.000000,1.000000,0.380952,0.4,0.111111,0.090909,0.083333,0.052356,0.263158,0.142857,0.000634,0.000000,0.024390,0.0200,0.003003,0.020,0.008272,0.018738,0.157140,1.0,0.000000,0.245902,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
76326,0.865385,1.0,0.333333,0.285714,0.0,0.000000,0.090909,0.545455,0.000000,0.863636,1.000000,0.5,0.000000,0.906250,0.000000,0.000017,0.624862,0.666667,1.000000,0.000000,0.000000,0.353535,0.292929,0.424242,0.171717,0.070707,0.034483,0.444444,0.045511,0.055583,0.051606,0.000000,0.717172,0.292929,0.494949,0.515152,0.979798,0.000000,0.030303,0.000000,0.010101,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.010309,0.000000,0.0,0.000000,0.452381,0.571429,0.595238,0.476190,0.595238,0.619048,0.346667,0.313131,0.434343,0.262626,0.070707,0.171717,0.202020,0.151515,0.141414,0.131313,0.131313,0.111111,0.141414,0.353535,0.242424,0.161616,0.313131,0.151515,0.303030,0.303030,0.363636,0.353535,0.212121,0.080808,0.020202,0.606061,0.121212,0.150685,0.161616,0.241538,0.337143,0.656566,0.646465,0.057143,0.151515,0.111111,0.060606,0.040404,0.030303,0.0,0.088833,0.097667,0.153846,0.153846,0.676768,0.333333,0.818182,0.191919,0.070707,0.333333,0.686869,0.575758,0.252525,0.818182,0.191919,0.080808,0.04,0.060606,0.141414,0.212121,0.030303,0.266667,0.525253,0.252525,0.000000,0.000000,0.000000,0.010101,0.010101,0.070707,0.191919,0.565657,0.000000,0.070707,0.363636,0.588235,0.566667,0.213115,0.100,0.111111,0.040404,0.191919,0.161616,0.070707,0.050505,0.040404,0.010101,0.050505,0.222222,0.505051,0.000000,0.723502,0.870602,0.164667,0.201333,0.194667,0.232000,0.070850,0.282828,0.222222,0.202020,0.171717,0.080808,0.030303,0.02,0.000000,0.010101,0.151515,0.222222,0.242424,0.222222,0.111111,0.030303,0.04,0.000000,0.010101,0.333333,0.050505,0.363636,0.111111,0.525253,0.484848,0.121212,0.717172,0.151515,0.000000,0.000000,0.000000,0.0,0.00,0.090909,0.040404,0.000000,0.060606,0.088889,0.184211,0.030303,0.242424,0.646465,0.727273,0.565657,0.696970,0.535354,0.767677,0.606061,0.838384,1.000000,0.030303,0.121212,0.131313,0.030303,0.090909,0.121212,0.023256,0.018182,0.161616,0.070707,0.121212,0.030303,0.114286,0.020202,0.080808,0.163934,0.050505,0.040404,0.066667,0.015625,0.020202,0.171717,0.050505,0.020202,0.060606,0.032787,0.111111,0.121212,0.101010,0.030303,0.161616,0.070707,0.030303,0.111111,0.525253,0.111111,0.0,0.705882,0.101010,0.141414,0.353535,0.212121,0.162162,0.101010,0.050505,0.000000,0.262626,0.100000,0.277778,0.040404,0.000000,0.000000,0.000000,0.239437,0.353535,0.033333,0.282828,0.161616,0.434343,0.070707,0.012048,0.080808,0.033333,0.168675,0.0,0.0,0.102041,0.036364,0.029412,0.010101,0.000000,0.0,0.052632,0.037037,0.00000,0.011111,0.494949,0.969697,0.020202,0.000000,0.030303,0.939394,0.585859,0.202020,0.193548,0.519231,0.000000,0.020202,0.151515,0.434343,0.555556,0.454545,0.0,0.000000,0.686869,0.070707,0.151515,0.010101,0.0,0.080808,0.858586,0.111111,0.767677,0.989899,0.929293,0.380952,0.4,0.070707,0.080808,0.466667,0.361257,0.315789,0.155844,0.011403,0.067797,0.268293,0.0030,0.001201,0.008,0.010110,0.005898,0.692061,0.0,1.000000,0.950820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
76327,0.730769,1.0,0.333333,0.214286,0.0,0.216495,0.545455,0.545455,0.909091,0.909091,1.000000,0.5,0.181818,0.895833,0.727273,0.000017,0.773196,0.666667,0.333333,0.004149,0.000000,0.595960,0.222222,0.313131,0.030303,0.030303,0.000000,0.333333,0.008044,0.010073,0.007965,0.000000,0.000000,1.000000,0.474747,0.535354,0.767677,0.232323,0.000000,0.000000,0.040404,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.030928,0.017544,0.0,0.000000,0.380952,0.488095,0.523810,0.392857,0.511905,0.559524,0.400000,0.373737,0.373737,0.262626,0.141414,0.191919,0.252525,0.171717,0.121212,0.101010,0.030303,0.151515,0.151515,0.292929,0.222222,0.191919,0.212121,0.080808,0.181818,0.171717,0.373737,0.464646,0.272727,0.141414,0.060606,0.565657,0.161616,0.082192,0.222222,0.290769,0.397143,0.656566,0.656566,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.077167,0.084500,0.307692,0.230769,0.878788,0.131313,0.959596,0.050505,0.333333,0.424242,0.797980,0.585859,0.272727,0.898990,0.111111,0.161616,0.06,0.131313,0.111111,0.161616,0.040404,0.200000,0.494949,0.121212,0.186047,0.121212,0.024691,0.010101,0.010101,0.010101,0.040404,0.434343,0.010101,0.020202,0.363636,0.576471,0.577778,0.229508,0.125,0.000000,0.000000,0.343434,0.080808,0.000000,0.000000,0.060606,0.030303,0.191919,0.545455,0.848485,0.618590,0.505376,0.606129,0.184667,0.182000,0.177333,0.170667,0.056399,0.252525,0.181818,0.292929,0.222222,0.050505,0.000000,0.00,0.000000,0.000000,0.252525,0.191919,0.333333,0.202020,0.030303,0.000000,0.00,0.000000,0.000000,0.292929,0.020202,0.232323,0.131313,0.434343,0.575758,0.050505,0.828283,0.080808,0.000000,0.000000,0.000000,0.0,0.00,0.060606,0.030303,0.000000,0.030303,0.188889,0.250000,0.000000,0.636364,0.636364,0.767677,0.515152,0.767677,0.515152,0.606061,0.303030,0.898990,0.000000,0.050505,0.030303,0.090909,0.020202,0.212121,0.141414,0.000000,0.000000,0.121212,0.000000,0.161616,0.030303,0.142857,0.101010,0.000000,0.000000,0.080808,0.090909,0.116667,0.000000,0.070707,0.353535,0.040404,0.070707,0.020202,0.000000,0.070707,0.060606,0.070707,0.000000,0.030303,0.030303,0.000000,0.141414,0.808081,0.000000,0.0,0.705882,0.121212,0.303030,0.404040,0.090909,0.081081,0.010101,0.040404,0.061856,0.181818,0.100000,0.236111,0.040404,0.000000,0.000000,0.000000,0.422535,0.595960,0.066667,0.222222,0.131313,0.313131,0.222222,0.000000,0.030303,0.000000,0.036145,0.0,0.0,0.204082,0.000000,0.000000,0.000000,0.000000,0.0,0.052632,0.000000,0.00000,0.022222,0.474747,0.979798,0.010101,0.000000,0.010101,1.000000,0.565657,0.090909,0.193548,0.192308,0.000000,0.232323,0.565657,0.888889,0.888889,0.121212,0.0,0.000000,0.000000,0.171717,0.666667,0.070707,0.0,0.090909,0.000000,1.000000,0.000000,1.000000,0.959596,0.238095,0.4,0.060606,0.070707,0.483333,0.350785,0.315789,0.168831,0.013462,0.093220,0.292683,0.0015,0.007007,0.005,0.002757,0.004873,0.233499,1.0,1.000000,0.606557,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0


In [11]:
from sklearn.utils import resample

category_0 = trainset[trainset['TARGET_B'] == 0]
category_1 = trainset[trainset['TARGET_B'] == 1]

## Apply the Random Forests algorithm but this time only by upscaling the data.

In [12]:
category_1_oversampled = resample(category_1, 
                                  replace=True, 
                                  n_samples = len(category_0))

In [13]:
data_upsampled = pd.concat([category_0, category_1_oversampled], axis=0)

In [14]:
X_train_upsampled = data_upsampled.drop(['TARGET_B'], axis =1)
y_train_upsampled = data_upsampled['TARGET_B'] 

In [15]:
def model_eval(model,X_train,y_train,X_test,y_test):
    # Fitting
    model.fit(X_train, y_train)
    # Predicting
    predictions = model.predict(X_test)
    # Calculating confusion matrix
    cm =confusion_matrix (y_test, predictions)
    # Printing different evaluation metrics
    print ( str(model.base_estimator_)[:-2] + " score: ", model.score(X_test, y_test))
    print ( str(model.base_estimator_)[:-2] + " precision: ", precision_score(y_test, predictions))
    print ( str(model.base_estimator_)[:-2] + " recall: ", recall_score(y_test, predictions))
    print ( str(model.base_estimator_)[:-2] + " score: ", f1_score(y_test, predictions))
    print ('\n')
    print (cm)
    return model

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, recall_score
from sklearn.model_selection import cross_val_score
# Fit the Random Forests algorithm on the training data
rfc = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2,
                             random_state = 0)
# rfc.fit(X_train_upsampled, y_train_upsampled)
# print(rfc.score(X_train_upsampled,y_train_upsampled))
# print(rfc.score(X_test,y_test))
# # Make predictions on the test data
# y_pred = rfc.predict(X_test)

In [17]:
model_eval(rfc,X_train_upsampled,y_train_upsampled,X_test_scaled,y_test)

DecisionTreeClassifier score:  0.5897395587695855
DecisionTreeClassifier precision:  0.07023285084959094
DecisionTreeClassifier recall:  0.558
DecisionTreeClassifier score:  0.1247624371157071


[[10696  7387]
 [  442   558]]


RandomForestClassifier(max_depth=5, max_samples=0.2, min_samples_leaf=20,
                       min_samples_split=20, random_state=0)

## Use Feature Selections that you have learned in class to decide if you want to use all of the features (Variance Threshold, RFE, PCA, etc.)

In [18]:
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

In [19]:
X_added_constant = sm.add_constant(X_train_upsampled)
X_added_constant
model = sm.OLS(y_train_upsampled,X_added_constant).fit()
model.summary()
# Drop columns with pvalue >= 0.8 

0,1,2,3
Dep. Variable:,TARGET_B,R-squared:,0.074
Model:,OLS,Adj. R-squared:,0.072
Method:,Least Squares,F-statistic:,32.76
Date:,"Fri, 07 Apr 2023",Prob (F-statistic):,0.0
Time:,17:33:31,Log-Likelihood:,-99628.0
No. Observations:,144972,AIC:,200000.0
Df Residuals:,144617,BIC:,203500.0
Df Model:,354,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.9205,0.213,9.035,0.000,1.504,2.337
CLUSTER,0.2717,0.043,6.363,0.000,0.188,0.355
DATASRCE,-0.0148,0.005,-3.217,0.001,-0.024,-0.006
DOMAIN_B,-0.0950,0.013,-7.058,0.000,-0.121,-0.069
ODATEW_YR,0.0547,0.031,1.789,0.074,-0.005,0.115
ODATEW_MM,0.2538,0.056,4.538,0.000,0.144,0.363
DOB_YR,-0.0190,0.007,-2.646,0.008,-0.033,-0.005
DOB_MM,0.0408,0.005,8.091,0.000,0.031,0.051
MINRDATE_YR,0.0584,0.017,3.338,0.001,0.024,0.093

0,1,2,3
Omnibus:,612478.161,Durbin-Watson:,0.148
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17783.751
Skew:,-0.008,Prob(JB):,0.0
Kurtosis:,1.284,Cond. No.,39800.0


In [20]:
drop_list_sm = ['MAXRDATE_MM', 'MALEVET', 'FEDGOV','WEALTH2','HUPA4', 'HHN1', 'HUPA6', 'ADI', 'IC4', 'IC22', 'TPE11', 'OCC13','EIC4', 'EIC8', 'EIC16','EC2', 'EC5','ANC6', 'HC6', 'HC11', 'HC12','HC15']

In [21]:
# Use variance threshold to remove low-variance features
vt = VarianceThreshold(threshold=0.02)
X_vt = vt.fit(X_train_upsampled)
X_vt_transform = vt.transform(X_train_upsampled)

var_list = list(X_vt.get_support())
list(zip(X_train_upsampled.columns, var_list))
[col[0] for col in zip(X_train_upsampled.columns, var_list) if col[1] == False]

removed_columns_vt = pd.DataFrame(data=(X_train_upsampled.columns,X_vt.variances_,X_vt.get_support()), index=('column_name','variance','statement')).T
removed_columns_vt = removed_columns_vt.loc[(removed_columns_vt['statement'] == False),:]
drop_list_vt = list(removed_columns_vt['column_name'])

#### I have drop list so I will drop them from original data and then I will try random forest again

In [22]:
drop_list_end = drop_list_vt + drop_list_sm

In [23]:
drop_list_end = list(set(drop_list_end))
drop_list_end

['HHD10',
 'ETH6',
 'CHIL3',
 'IC10',
 'CHILC1',
 'RAMNTALL',
 'ETH16',
 'AGEC7',
 'ETH11',
 'EIC3',
 'EIC9',
 'HC12',
 'PEC1',
 'POP903',
 'EIC12',
 'ETHC4',
 'HHAS4',
 'ETH14',
 'POP90C4',
 'HHAGE2',
 'IC21',
 'HC9',
 'ETHC2',
 'IC5',
 'OCC1',
 'ANC13',
 'ANC10',
 'AGEC3',
 'ETHC5',
 'NUMPROM',
 'HC1',
 'EC1',
 'DW7',
 'OCC11',
 'HU4',
 'TPE6',
 'AGEC5',
 'SEC3',
 'AFC6',
 'TPE12',
 'IC17',
 'HHN2',
 'HHD9',
 'HHD12',
 'IC13',
 'TIMELAG',
 'TPE9',
 'LFC10',
 'TPE8',
 'OCC13',
 'SEC4',
 'ADI',
 'ANC11',
 'HHAGE3',
 'EIC5',
 'ANC3',
 'ANC8',
 'AGEC1',
 'HHN1',
 'HC15',
 'ANC5',
 'ANC12',
 'AGE906',
 'TPE11',
 'HC14',
 'FEDGOV',
 'OEDC2',
 'RHP2',
 'OCC8',
 'HHAS2',
 'IC3',
 'HC10',
 'RHP3',
 'EC5',
 'AGE905',
 'LSC4',
 'EIC4',
 'ETH15',
 'CHILC4',
 'HHN5',
 'HC3',
 'HU3',
 'IC1',
 'HC16',
 'HUPA4',
 'DW8',
 'EIC14',
 'ETH13',
 'CARDPM12',
 'WEALTH2',
 'MAXRDATE_MM',
 'TPE10',
 'HHD5',
 'AC1',
 'IC15',
 'IC11',
 'FIRSTDATE_YR',
 'EIC15',
 'OEDC3',
 'LASTGIFT',
 'HUPA7',
 'CHILC5',
 'ODA

In [24]:
X_train_upsampled_sm= X_train_upsampled.drop(columns = drop_list_end)

In [25]:
X_test_sm = X_test_scaled.drop(columns = drop_list_end)

In [26]:
model_eval(rfc,X_train_upsampled_sm,y_train_upsampled,X_test_sm,y_test)

DecisionTreeClassifier score:  0.6012157417596814
DecisionTreeClassifier precision:  0.07354838709677419
DecisionTreeClassifier recall:  0.57
DecisionTreeClassifier score:  0.13028571428571428


[[10903  7180]
 [  430   570]]


RandomForestClassifier(max_depth=5, max_samples=0.2, min_samples_leaf=20,
                       min_samples_split=20, random_state=0)

We found improvement by reducing the number of features which means that there are still irrelevant features. For this reason, we use different feature selection tools below

In [27]:
# Use PCA to reduce the dimensionality of the data
pca = PCA(n_components=5)
X_pca = pca.fit(X_train_upsampled)
X_pca_transform = pca.transform(X_train_upsampled)
X_test_pca = pca.transform(X_test_scaled)

In [49]:
final_mode = model_eval(rfc,X_pca_transform,y_train_upsampled,X_test_pca,y_test)

DecisionTreeClassifier score:  0.6215479746371115
DecisionTreeClassifier precision:  0.06743604004449388
DecisionTreeClassifier recall:  0.485
DecisionTreeClassifier score:  0.118408203125


[[11376  6707]
 [  515   485]]


##  I decided to use the model with PCA

In [36]:
numerical =  X.select_dtypes(np.number)
categorical = X.select_dtypes(object)

In [45]:
categorical

Unnamed: 0,STATE,HOMEOWNR,GENDER,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A
0,IL,H,F,L,E,C,T
1,CA,H,M,L,G,A,S
2,NC,U,M,L,E,C,R
3,CA,U,F,L,E,C,R
4,FL,H,F,L,F,A,S
...,...,...,...,...,...,...,...
95407,other,H,M,L,G,C,C
95408,TX,H,M,L,F,A,C
95409,MI,H,M,L,E,B,C
95410,CA,H,F,L,F,A,C


In [42]:
num_scaled = pd.DataFrame(scaler.transform(numerical), index = numerical.index)

In [46]:
encoded = pd.DataFrame(enc.transform(categorical).toarray(), columns = column_names, index = categorical.index)

In [47]:
X_all = pd.concat([encoded, num_scaled], axis=1)

In [52]:
clf = final_mode.fit ( X_pca_transform,y_train_upsampled )

In [54]:
clf.predict(X_all)



ValueError: X has 354 features, but RandomForestClassifier is expecting 5 features as input.

Using PCA I get my best score.

In [29]:
import pickle
# # Use RFE to select the top k features
# rfe = RFE(estimator=rfc, n_features_to_select=100, step=1)
# X_rfe = rfe.fit_transform(X_train_upsampled, y_train_upsampled)
X_rfe = pickle.load(open("X_rfe.pkl", "rb"))
# Model performance using RFE
scores_rfe = cross_val_score(rfc, X_rfe, y_train_upsampled, cv=5, scoring='accuracy')
print('Accuracy using RFE:', scores_rfe.mean())

Accuracy using RFE: 0.6113249523425075


After using different tools, the accurancy gets worse, so I stick with the previous model.

### Discuss the output and its impact in the bussiness scenario. Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm or data in order to maximize the return of the business?

in this case it would be very different if there was a false positive than if there was a false negative. A false positive means that resources would be wasted on someone who is not likely to donate, whereas, in the case of a false negative, we would be losing a potential donor.

The best way to improve the model would be to reduce false positives in order to maximise the utility of our resources.  