In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import VarianceThreshold

In [2]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
targets = pd.read_csv('target.csv')

data = pd.concat([numerical, categorical, targets], axis=1)

In [3]:
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [4]:
X = data.drop(['TARGET_B'], axis=1)
y = data['TARGET_B']

In [5]:
# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(76329, 338)
(19083, 338)
(76329,)
(19083,)


In [6]:
# Split in categorical and numerical
numericalX_train   = X_train.select_dtypes(np.number)
numericalX_test    = X_test.select_dtypes(np.number)
categoricalX_train = X_train.select_dtypes(object) 
categoricalX_test  = X_test.select_dtypes(object) 

In [7]:
from sklearn.preprocessing import OneHotEncoder  #encode categorical
#categoricalX_train encoding
encoder = OneHotEncoder(drop='first').fit(categoricalX_train)
encoded_categorical_train = encoder.transform(categoricalX_train).toarray()
encoded_categorical_train = pd.DataFrame(encoded_categorical_train, columns=encoder.get_feature_names_out())
#categoricalX_test encoding
encoded_categorical_test = encoder.transform(categoricalX_test).toarray()
encoded_categorical_test = pd.DataFrame(encoded_categorical_test, columns=encoder.get_feature_names_out())

In [8]:
from sklearn.preprocessing import MinMaxScaler   #scale numerical
#numericalX_train scaling
transformer = MinMaxScaler().fit(numericalX_train)
scaled_numerical_train = transformer.transform(numericalX_train)
scaled_numerical_train = pd.DataFrame(scaled_numerical_train, columns=numericalX_train.columns)
#numericalX_test scaling
scaled_numerical_test  = transformer.transform(numericalX_test)
scaled_numerical_test  = pd.DataFrame(scaled_numerical_test, columns=numericalX_train.columns)

In [9]:
#Concatenating the train and the test
X_train_treated = pd.concat([scaled_numerical_train, encoded_categorical_train], axis = 1)
X_test_treated  = pd.concat([scaled_numerical_test,  encoded_categorical_test],  axis = 1)

In [10]:
display(X_train_treated.head())
display(y_train)

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,1.7e-05,0.762887,0.5,0.666667,0.008299,0.0,0.313131,0.10101,0.686869,0.060606,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.536082,0.666667,1.0,0.0,0.0,0.292929,0.242424,0.383838,0.070707,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1.7e-05,0.608247,0.666667,0.111111,0.020747,0.0,0.424242,0.161616,0.626263,0.10101,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.7e-05,0.783505,0.833333,0.666667,0.037344,0.010101,0.40404,0.232323,0.414141,0.080808,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5.2e-05,0.556701,0.666667,0.222222,0.087137,0.333333,0.272727,0.292929,0.181818,0.121212,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


79401    0
86429    0
76729    1
38838    0
83012    0
        ..
21243    0
45891    0
42613    1
43567    0
68268    0
Name: TARGET_B, Length: 76329, dtype: int64

In [11]:
X_test.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_D
58053,28,65.0,6,9,0,0,24,42,22,9,...,8,91,10,95,11,95,11,91,9,0.0
9484,1,58.0,5,9,0,0,17,28,42,10,...,5,90,8,95,1,95,4,87,4,0.0
13395,1,46.0,7,8,1,0,28,44,22,2,...,3,91,10,95,10,95,10,91,10,0.0
1466,0,61.611649,5,9,0,0,33,37,49,8,...,2,91,2,89,2,96,2,86,11,0.0
2076,0,46.0,3,2,0,49,16,32,3,9,...,1,96,2,96,2,96,2,96,2,0.0


In [12]:
y_test

58053    0
9484     0
13395    0
1466     0
2076     0
        ..
94255    0
26449    0
1969     0
48574    0
73270    0
Name: TARGET_B, Length: 19083, dtype: int64

In [13]:
# Apply the Random Forest Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             max_samples=0.8, # fraction of X-train to use in each tree
                             random_state=42)
clf.fit(X_train_treated, y_train)
print(clf.score(X_train_treated, y_train))
print(clf.score(X_test_treated, y_test))

y_pred = clf.predict(X_test_treated)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.9496521636599458
0.9475973379447676


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[18083,     0],
       [ 1000,     0]], dtype=int64)

In [14]:
# Checking for imbalanced data
import matplotlib.pyplot as plt
import pandas as pd

print("Class Distribution:")
print(y_train.value_counts())

Class Distribution:
0    72486
1     3843
Name: TARGET_B, dtype: int64


In [15]:
#We have less data in 1, so we need to upsample
trainset = pd.concat([X_train_treated, y_train.reset_index(drop=True)], axis=1)
category_1_upsampled = trainset[trainset['TARGET_B'] == 1].sample(len(trainset[trainset['TARGET_B'] == 0]), replace=True) #the 1 needs to be the same length as the 0
print(category_1_upsampled.shape)
category_0 = trainset[trainset['TARGET_B'] == 0]
print(category_0.shape)

# Concatenate the 1 upsample with the 0
trainset_new_upsampled = pd.concat([category_0, category_1_upsampled], axis=0)
trainset_new_upsampled = trainset_new_upsampled.sample(frac=1)  # randomize the rows
X_train_treated_upsampled = trainset_new_upsampled.drop(['TARGET_B'], axis=1)
y_train_upsampled = trainset_new_upsampled['TARGET_B']
print(X_train_treated_upsampled.shape)

(72486, 356)
(72486, 356)
(144972, 355)


In [16]:
# Now we can remove the column target d from the set of features
# reverting back to the name X_train
X_train_treated_upsampled = X_train_treated_upsampled.drop(['TARGET_D'], axis = 1).copy()
X_test_treated  = X_test_treated.drop(['TARGET_D'], axis = 1)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
clf = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             max_samples=0.8, # fraction of X-train to use in each tree
                             random_state=42)
clf.fit(X_train_treated_upsampled, y_train_upsampled)
print(clf.score(X_train_treated_upsampled, y_train_upsampled))
print(clf.score(X_test_treated, y_test))
y_pred = clf.predict(X_test_treated)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.6214717324724774
0.6006393124770738


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[10909,  7174],
       [  447,   553]], dtype=int64)

### Feature Selection

In [18]:
from sklearn.preprocessing import MinMaxScaler
transformer = MinMaxScaler().fit(numerical)

numerical_scaled = transformer.transform(numerical)
numerical_scaled = pd.DataFrame(numerical_scaled, columns=numerical.columns)
numerical_scaled

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0.000000,0.608247,0.666667,1.000000,0.000000,0.000000,0.393939,0.343434,0.181818,0.101010,...,0.341463,0.005,0.001401,0.010,0.003676,0.006465,0.498045,0.0,1.000000,0.622951
1,0.000014,0.463918,0.833333,1.000000,0.066390,0.000000,0.151515,0.555556,0.111111,0.060606,...,0.024390,0.010,0.004004,0.025,0.016544,0.014399,0.774510,0.0,0.333333,0.000000
2,0.000014,0.624862,0.333333,0.111111,0.008299,0.000000,0.202020,0.292929,0.333333,0.060606,...,0.341463,0.002,0.002202,0.005,0.011029,0.006204,0.078617,1.0,1.000000,0.967213
3,0.000000,0.711340,0.000000,0.444444,0.008299,0.000000,0.232323,0.141414,0.313131,0.030303,...,0.170732,0.002,0.001201,0.010,0.008272,0.005534,0.899764,1.0,1.000000,0.655738
4,0.000000,0.793814,0.333333,0.222222,0.248963,0.010101,0.282828,0.090909,0.535354,0.262626,...,0.195122,0.003,0.002002,0.015,0.012868,0.005586,0.037079,1.0,0.333333,0.409836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0.000014,0.624862,0.666667,1.000000,0.000000,0.141414,0.363636,0.474747,0.111111,0.070707,...,0.000000,0.025,0.004004,0.025,0.008272,0.023745,0.962399,0.0,0.000000,0.180328
95408,0.000014,0.484536,1.000000,1.000000,0.004149,0.000000,0.313131,0.434343,0.191919,0.040404,...,0.000000,0.020,0.003003,0.020,0.008272,0.018738,0.639828,1.0,0.000000,0.016393
95409,0.000014,0.608247,0.666667,1.000000,0.000000,0.000000,0.181818,0.464646,0.202020,0.070707,...,0.097561,0.003,0.001001,0.010,0.002757,0.007009,0.988852,1.0,0.666667,0.540984
95410,0.000000,0.587629,1.000000,1.000000,0.000000,0.000000,0.282828,0.353535,0.202020,0.090909,...,0.439024,0.005,0.003203,0.018,0.003676,0.010875,0.024466,1.0,1.000000,0.163934


In [19]:
X = numerical_scaled
y = targets['TARGET_B']

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
kbest = SelectKBest(chi2, k=50).fit_transform(X, y)  #k=50 means that we are choosing 50 features

selected = pd.DataFrame(kbest)
selected.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.666667,0.010101,0.079833,0.105833,0.230769,0.153846,0.070707,0.0,0.0,0.0,...,0.464646,0.0,0.285714,0.433333,0.366492,0.127119,0.341463,0.498045,1.0,0.622951
1,0.833333,0.0,0.911333,0.869667,0.923077,0.769231,0.090909,0.0,0.0,0.0,...,0.0,0.0,0.952381,0.183333,0.146597,0.008475,0.02439,0.77451,0.333333,0.0
2,0.333333,0.020202,0.082833,0.091,0.153846,0.076923,0.181818,0.0,0.010101,0.0,...,0.313131,0.0,0.428571,0.416667,0.308901,0.110169,0.341463,0.078617,1.0,0.967213
3,0.0,0.0,0.166667,0.2105,0.153846,0.076923,0.060606,0.0,0.0,0.0,...,0.555556,0.0,0.761905,0.433333,0.324607,0.063559,0.170732,0.899764,1.0,0.655738
4,0.333333,0.989899,0.096,0.099,0.307692,0.230769,0.0,0.381818,0.585859,0.191919,...,0.676768,0.0,0.285714,0.7,0.570681,0.152542,0.195122,0.037079,0.333333,0.409836


In [20]:
model = SelectKBest(chi2, k=50).fit(X, y)  #fit x and y
#New DataFrame with score of each feature and column names
df = pd.DataFrame(data = model.scores_, columns = ['score'])
df['Column'] = numerical_scaled.columns
cols = df.sort_values(by = ['score'], ascending = False).head(50)['Column']
cols

313      RFA_2F
305    CARDGIFT
110        HVP1
111        HVP2
112        HVP3
115        HVP6
113        HVP4
22         ETH2
129         RP1
130         RP2
304    NGIFTALL
83          HV1
84          HV2
108       ETHC5
107       ETHC4
299    CARDPROM
300     NUMPROM
131         RP3
114        HVP5
314    CLUSTER2
2        INCOME
86          HV4
85          HV3
150        IC15
133         MSA
158        IC23
311    CONTROLN
162       HHAS4
161       HHAS3
279         HC6
266       POBC2
149        IC14
295       MHUC1
281         HC8
233         EC7
156        IC21
137         IC2
141         IC6
91          HU5
168        TPE3
139         IC4
228         EC2
234         EC8
109       ETHC6
249         VC4
278         HC5
169        TPE4
146        IC11
280         HC7
282         HC9
Name: Column, dtype: object

In [21]:
# Check multicollinearity
#Concat our numerical_scaled with the TARGET_B
numerical_data = pd.concat([numerical_scaled[cols], targets['TARGET_B']], axis=1)
corr = numerical_data.corr()
corr
best_corr= corr.sort_values(by = 'TARGET_B', ascending=False) #sort the values from best to worst 
best_corr_list = best_corr['TARGET_B']
best_corr_list

TARGET_B    1.000000
RFA_2F      0.072311
CARDGIFT    0.054027
NGIFTALL    0.050896
NUMPROM     0.033161
CARDPROM    0.032467
HV2         0.025607
HV1         0.025012
HVP4        0.024186
HVP3        0.024175
HVP2        0.023514
HVP1        0.022948
HVP5        0.022034
IC4         0.020535
HHAS3       0.020252
HV3         0.020191
HV4         0.020085
RP2         0.019928
HVP6        0.019869
IC2         0.019828
RP1         0.019158
MHUC1       0.018195
RP3         0.017939
EC7         0.017818
IC11        0.016961
INCOME      0.016508
EC8         0.016443
IC14        0.014884
IC23        0.014821
IC21        0.013339
CONTROLN    0.013165
MSA         0.011018
HC6         0.010803
HC7         0.010363
HC9         0.008515
HC5         0.008399
HU5         0.007898
HC8        -0.009124
TPE3       -0.012178
TPE4       -0.012396
VC4        -0.013304
ETHC6      -0.013369
IC6        -0.014362
EC2        -0.015423
CLUSTER2   -0.016390
IC15       -0.017081
HHAS4      -0.017448
POBC2      -0

In [22]:
# Heatmap of the top 50 features
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(25, 25))
correlation_matrix = best_corr.corr()
#sns.heatmap(correlation_matrix, annot=True, fmt=".2f")
#plt.show()

<Figure size 2500x2500 with 0 Axes>

In [24]:
columns_to_keep = ['RFA_2F', 'CARDGIFT', 'NGIFTALL', 'NUMPROM', 'CARDPROM', 'HV2', 'HVP4', 'IC4', 'HHAS3', 'RP1',
                'MHUC1', 'EC7', 'INCOME', 'CONTROLN', 'MSA', 'HC6', 'HU5', 'TPE3', 'ETHC6','CLUSTER2', 'POBC2', 'ETHC4']

numerical_scaled_df = transformer.transform(numerical)
numerical_scaled_df = pd.DataFrame(numerical_scaled_df, columns = numerical.columns)

numerical_scaled_feature = numerical_scaled_df[columns_to_keep]
numerical_scaled_feature.head()

Unnamed: 0,RFA_2F,CARDGIFT,NGIFTALL,NUMPROM,CARDPROM,HV2,HVP4,IC4,HHAS3,RP1,...,INCOME,CONTROLN,MSA,HC6,HU5,TPE3,ETHC6,CLUSTER2,POBC2,ETHC4
0,1.0,0.341463,0.127119,0.366492,0.433333,0.105833,0.272727,0.252,0.515152,0.020202,...,0.666667,0.498045,0.0,0.313131,0.070707,0.0,0.0,0.622951,0.747475,0.0
1,0.333333,0.02439,0.008475,0.146597,0.183333,0.869667,0.989899,0.691333,0.666667,0.919192,...,0.833333,0.77451,0.478632,0.979798,0.090909,0.0,0.0,0.0,0.393939,0.0
2,1.0,0.341463,0.110169,0.308901,0.416667,0.091,0.181818,0.226667,0.313131,0.0,...,0.333333,0.078617,0.0,0.505051,0.181818,0.0,0.0,0.967213,0.848485,0.0
3,1.0,0.170732,0.063559,0.324607,0.433333,0.2105,0.69697,0.282,0.262626,0.010101,...,0.0,0.899764,0.997863,0.393939,0.060606,0.0,0.0,0.655738,0.676768,0.0
4,0.333333,0.195122,0.152542,0.570681,0.7,0.099,0.161616,0.214,0.111111,0.252525,...,0.333333,0.037079,0.534188,0.060606,0.0,0.010101,0.191919,0.409836,0.656566,0.381818


In [26]:
print(numerical_scaled_feature.shape)
print(categorical.shape)

(95412, 22)
(95412, 22)


In [29]:
#Concat our numerical with our selected features, the categorical and TARGET_B
new_df_selected_features = pd.concat([numerical_scaled_feature, categorical, targets['TARGET_B']], axis=1).reset_index(drop=True)
new_df_selected_features

Unnamed: 0,RFA_2F,CARDGIFT,NGIFTALL,NUMPROM,CARDPROM,HV2,HVP4,IC4,HHAS3,RP1,...,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B
0,1.000000,0.341463,0.127119,0.366492,0.433333,0.105833,0.272727,0.252000,0.515152,0.020202,...,12,92,8,94,2,95,12,89,11,0
1,0.333333,0.024390,0.008475,0.146597,0.183333,0.869667,0.989899,0.691333,0.666667,0.919192,...,2,93,10,95,12,95,12,93,10,0
2,1.000000,0.341463,0.110169,0.308901,0.416667,0.091000,0.181818,0.226667,0.313131,0.000000,...,2,91,11,92,7,95,12,90,1,0
3,1.000000,0.170732,0.063559,0.324607,0.433333,0.210500,0.696970,0.282000,0.262626,0.010101,...,1,87,11,94,11,95,12,87,2,0
4,0.333333,0.195122,0.152542,0.570681,0.700000,0.099000,0.161616,0.214000,0.111111,0.252525,...,1,93,10,96,1,96,1,79,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0.000000,0.000000,0.000000,0.052356,0.083333,0.170833,0.737374,0.356667,0.707071,0.696970,...,2,96,2,96,2,96,2,96,2,0
95408,0.000000,0.000000,0.000000,0.031414,0.050000,0.287167,0.919192,0.566000,0.616162,0.161616,...,1,96,3,96,3,96,3,96,3,0
95409,0.666667,0.097561,0.025424,0.151832,0.216667,0.062833,0.010101,0.230000,0.262626,0.030303,...,1,96,3,95,1,96,10,94,10,0
95410,1.000000,0.439024,0.169492,0.643979,0.583333,0.409833,1.000000,0.370667,0.464646,0.888889,...,5,90,11,96,8,97,1,86,12,1


In [32]:
# X, y split using the new data with less numerical features
y_features = new_df_selected_features['TARGET_B']
X_features = new_df_selected_features.drop(['TARGET_B'], axis = 1)

In [33]:
#Checking the shape
X_features.shape
y_features.shape

(95412,)

In [34]:
# Getting my upsampled data (numerical features selected and scaled + categorical encoded) cooncatenated
new_train_data_oversampled=pd.concat([X_train_treated_upsampled[numerical_scaled_feature.columns],X_train_treated_upsampled[encoded_categorical_train.columns]], axis=1 )
# checking the shape
new_train_data_oversampled.shape

(144972, 46)

In [36]:
# Concat the test data (numerical features selected and scaled + categorical encoded)
X_test_new = pd.concat([X_test_treated[numerical_scaled_feature.columns], X_test_treated[encoded_categorical_test.columns]], axis=1)
X_test_new

Unnamed: 0,RFA_2F,CARDGIFT,NGIFTALL,NUMPROM,CARDPROM,HV2,HVP4,IC4,HHAS3,RP1,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.000000,0.121951,0.029661,0.256545,0.350000,0.359167,0.949495,0.376000,0.525253,0.888889,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.243902,0.059322,0.335079,0.483333,0.044333,0.010101,0.218667,0.121212,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.000000,0.121951,0.029661,0.282723,0.366667,0.386500,0.989899,0.397333,0.545455,0.979798,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.333333,0.292683,0.080508,0.329843,0.466667,0.178500,0.747475,0.366667,0.636364,0.323232,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.000000,0.024390,0.000000,0.052356,0.083333,0.061833,0.070707,0.201333,0.494949,0.121212,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.666667,0.000000,0.021186,0.167539,0.216667,0.088333,0.151515,0.253333,0.505051,0.010101,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
19079,0.333333,0.073171,0.021186,0.261780,0.333333,0.107333,0.373737,0.307333,0.646465,0.000000,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
19080,0.000000,0.000000,0.000000,0.057592,0.083333,0.070333,0.131313,0.200667,0.232323,0.000000,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
19081,0.000000,0.390244,0.110169,0.356021,0.516667,0.094333,0.202020,0.216000,0.292929,0.000000,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [37]:
# checking the shape of my y_train upsampled to see if matches to be able to use it in training my model
y_train_upsampled.shape

(144972,)

In [38]:
# Training the model with train oversampled
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             max_samples=0.8, # fraction of X-train to use in each tree
                             random_state=42)
clf.fit(new_train_data_oversampled, y_train_upsampled)
print(clf.score(new_train_data_oversampled, y_train_upsampled))
print(clf.score(X_test_new, y_test))
y_pred = clf.predict(X_test_new)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.6082208978285462
0.5600796520463239


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[10101,  7982],
       [  413,   587]], dtype=int64)

In [39]:
#We need to drop the duplicates from the upsampled train data
new_train_data_oversampled_2 = new_train_data_oversampled.drop_duplicates(keep='last')
new_train_data_oversampled_2

Unnamed: 0,RFA_2F,CARDGIFT,NGIFTALL,NUMPROM,CARDPROM,HV2,HVP4,IC4,HHAS3,RP1,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
63735,0.666667,0.146341,0.029661,0.272251,0.333333,0.187833,0.838384,0.342000,0.585859,0.252525,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
43575,0.000000,0.000000,0.012712,0.172775,0.266667,0.157500,0.797980,0.286667,0.393939,0.525253,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
45601,0.000000,0.000000,0.000000,0.062827,0.083333,0.586000,1.000000,0.765333,0.636364,0.202020,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
70729,0.000000,0.024390,0.000000,0.057592,0.083333,0.151667,0.767677,0.260667,0.343434,0.444444,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
19212,0.000000,0.048780,0.004237,0.099476,0.133333,0.166667,0.676768,0.340667,0.464646,0.161616,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2234,0.000000,0.024390,0.004237,0.089005,0.133333,0.069500,0.101010,0.123333,0.101010,0.000000,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
50485,1.000000,0.146341,0.059322,0.256545,0.316667,0.249500,0.898990,0.346000,0.303030,0.141414,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
7245,0.666667,0.268293,0.080508,0.371728,0.483333,0.209500,0.919192,0.233333,0.363636,0.080808,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
11487,0.666667,0.414634,0.114407,0.371728,0.500000,0.241833,1.000000,0.369333,0.444444,0.010101,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
all_data_treated = pd.concat([new_train_data_oversampled_2,X_test_new], axis =0)
all_data_treated

Unnamed: 0,RFA_2F,CARDGIFT,NGIFTALL,NUMPROM,CARDPROM,HV2,HVP4,IC4,HHAS3,RP1,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
63735,0.666667,0.146341,0.029661,0.272251,0.333333,0.187833,0.838384,0.342000,0.585859,0.252525,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
43575,0.000000,0.000000,0.012712,0.172775,0.266667,0.157500,0.797980,0.286667,0.393939,0.525253,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
45601,0.000000,0.000000,0.000000,0.062827,0.083333,0.586000,1.000000,0.765333,0.636364,0.202020,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
70729,0.000000,0.024390,0.000000,0.057592,0.083333,0.151667,0.767677,0.260667,0.343434,0.444444,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
19212,0.000000,0.048780,0.004237,0.099476,0.133333,0.166667,0.676768,0.340667,0.464646,0.161616,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.666667,0.000000,0.021186,0.167539,0.216667,0.088333,0.151515,0.253333,0.505051,0.010101,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
19079,0.333333,0.073171,0.021186,0.261780,0.333333,0.107333,0.373737,0.307333,0.646465,0.000000,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
19080,0.000000,0.000000,0.000000,0.057592,0.083333,0.070333,0.131313,0.200667,0.232323,0.000000,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
19081,0.000000,0.390244,0.110169,0.356021,0.516667,0.094333,0.202020,0.216000,0.292929,0.000000,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [41]:
#Making the predictions
y_pred = clf.predict(all_data_treated)
y_pred

array([1, 0, 0, ..., 0, 1, 1], dtype=int64)

In [42]:
# Adding the new PREDICTIONS columns to our dataframe
all_data_treated['PREDICTIONS'] = y_pred
all_data_treated

Unnamed: 0,RFA_2F,CARDGIFT,NGIFTALL,NUMPROM,CARDPROM,HV2,HVP4,IC4,HHAS3,RP1,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,PREDICTIONS
63735,0.666667,0.146341,0.029661,0.272251,0.333333,0.187833,0.838384,0.342000,0.585859,0.252525,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
43575,0.000000,0.000000,0.012712,0.172775,0.266667,0.157500,0.797980,0.286667,0.393939,0.525253,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
45601,0.000000,0.000000,0.000000,0.062827,0.083333,0.586000,1.000000,0.765333,0.636364,0.202020,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
70729,0.000000,0.024390,0.000000,0.057592,0.083333,0.151667,0.767677,0.260667,0.343434,0.444444,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
19212,0.000000,0.048780,0.004237,0.099476,0.133333,0.166667,0.676768,0.340667,0.464646,0.161616,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.666667,0.000000,0.021186,0.167539,0.216667,0.088333,0.151515,0.253333,0.505051,0.010101,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
19079,0.333333,0.073171,0.021186,0.261780,0.333333,0.107333,0.373737,0.307333,0.646465,0.000000,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
19080,0.000000,0.000000,0.000000,0.057592,0.083333,0.070333,0.131313,0.200667,0.232323,0.000000,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
19081,0.000000,0.390244,0.110169,0.356021,0.516667,0.094333,0.202020,0.216000,0.292929,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1


# Lab | Final regression model in "Health Care for All" Case

Instructions

At this point, we have created a model to predict who will make a donation and who won't (Classification Model). But, what about the ammount of money that each person will give?

In this lab, subset those that have made a donation (Target B) and use that subset to create a model to predict how much money will they give (Target D) (Regression Model).
-Only look at people who have donated (Target B = 1)

-Use this new dataframe to create a model to predict how much they will donate (Target D)

-Using the regression model, make predictions on all of the people our classification model predicted will donate.

-See the pdf file for a schema of the process.

Evaluate the result of your model and estimate how much better the result are for the business in comparison with the naive scenario we discuss on Monday. (Just sending donation cards to everyone)

At this point, we have created a model to predict who will make a donation and who won't (Classification Model). But, what about the ammount of money that each person will give?

In this lab, subset those that have made a donation (Target B) and use that subset to create a model to predict how much money will they give (Target D) (Regression Model).

Only look at people who have donated (Target B = 1)

Use this new dataframe to create a model to predict how much they will donate (Target D)

Using the regression model, make predictions on all of the people our classification model predicted will donate.

See the pdf file for a schema of the process.

Evaluate the result of your model and estimate how much better the result are for the business in comparison with the naive scenario we discuss on Monday. (Just sending donation cards to everyone)

In [None]:
# save it to the dataframe the main one with the pred columns
# load all from csv = target B= 1
# feature selection first but TARGET B =1
# x/y split
# train/test split
# Splitting data in categorical and numerical
# scalling
# encoding
# concatenating
# run model trainig set
# model
# load the last dataframe saved 'whole_data_treated'
# match the columns of both
# and then predict again

### Regression Model for TARGET_D

In [43]:
#Loading data from the csv
numerical_df2 = pd.read_csv('numerical.csv')
categorical_df2 = pd.read_csv('categorical.csv')
targets_df2 = pd.read_csv('target.csv')
data2 = pd.concat([numerical_df2, categorical_df2, targets_df2], axis = 1)

In [44]:
donors= data2[data2['TARGET_B'] == 1]
donors.shape

(4843, 339)

In [46]:
#Scale numericals
scaler = MinMaxScaler()
numerical_data = donors[numerical_df2.columns]
numerical_data_scaled = scaler.fit_transform(numerical_data)
numerical_data_scaled.shape

(4843, 315)

In [51]:
#Using RFE for feature selection
#We cant use k-best for feature selections because its only use for classification and not regression.
X_reg = numerical_data_scaled
y_reg = donors['TARGET_D']


from sklearn.feature_selection import RFE
from sklearn import linear_model
lm = linear_model.LinearRegression()
rfe = RFE(lm, n_features_to_select=22, verbose=False)  #selecting 22 features
rfe.fit(X_reg, y_reg)

In [52]:
rfe.ranking_

array([165, 203, 280, 268,  79,  83, 292, 232, 128, 127, 228,  82, 294,
        23, 105,  24,   1,   1,   1,  26,  25,   1,  53, 107, 108,   1,
       109, 258, 188,  49, 230,  65, 259,   1,   1,   1,   1,  80,  51,
        52,  50,  34, 169, 177, 117, 255, 124, 119, 120,  75, 270, 269,
       271,  37, 116, 118, 195, 115, 267,  44,  84,  45, 178,   7,   9,
        43,  42,  41,  29,  28,  30,  27, 110, 184,  20, 288, 180,  19,
       112, 113,  17,  18,  16, 207, 251, 204, 290,   3,   2, 209, 210,
       220,   1,   8,   4,   1,  35,  36,   1,  92,  91,   5,   6,  10,
         1,   1,   1,  69,  54,  55, 278, 211, 245, 244, 243, 252,  60,
        88, 208,  87, 263, 194, 114, 196,  21,  11,  13,  12,  14, 206,
       205, 179, 218, 284, 227,  85,  38,  76,  93, 102,  31,  97,  99,
        98, 100,  94, 201,  95, 283,  32, 154, 239, 174,  96, 103, 101,
       265, 158,  33, 152, 233, 213, 153,   1,   1, 222, 139, 155, 225,
       223, 224, 226, 106, 156, 138, 276, 282, 272, 129, 130, 17

In [53]:
#The most importante features ar ranked as 1
df_reg = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
df_reg['Column_name'] = numerical_df2.columns
df_reg

print(df_reg.sort_values(by = ['Rank'], ascending = True).head(50))

     Rank Column_name
92      1        HHD1
303     1    RAMNTALL
33      1       ETH13
18      1     POP90C3
17      1     POP90C2
16      1     POP90C1
34      1       ETH14
35      1       ETH15
36      1       ETH16
304     1    NGIFTALL
21      1        ETH1
25      1        ETH5
105     1       ETHC2
163     1         MC1
306     1    MINRAMNT
104     1       ETHC1
164     1         MC2
98      1        HHD7
308     1    LASTGIFT
95      1        HHD4
310     1     AVGGIFT
106     1       ETHC3
88      2         HU2
87      3         HU1
94      4        HHD3
101     5       HHD10
102     6       HHD11
63      7        HHN2
93      8        HHD2
64      9        HHN3
103    10       HHD12
125    11       HUPA4
127    12       HUPA6
126    13       HUPA5
128    14       HUPA7
307    15    MAXRAMNT
82     16         DW9
80     17         DW7
81     18         DW8
77     19         DW4
74     20         DW1
124    21       HUPA3
293    22        HC20
13     23      POP901
15     24 