# Lab | Final regression model in "Health Care for All" Case

In [1]:
# Instructions
# At this point, we have created a model to predict who will make a donation and who won't.
# But what about the amount of money that each person will give?
# In this lab, subset those who made a donation and use that subset to create a model to predict how much money they will give.

# Evaluate the result of your model and estimate how much better the results are for the business
# in comparison with the naive scenario we discussed on Monday.

In [47]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import math
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [3]:
# 5. Load the numerical and categorical feature tables.
# Each CSV carries an 'Unnamed: 0' column (presumably a saved row index —
# TODO confirm) that is dropped right after reading.
numerical = pd.read_csv('numerical.csv').drop(columns=['Unnamed: 0'])
categorical = pd.read_csv('categorical.csv').drop(columns=['Unnamed: 0'])
# Print both shapes, one per line.
print(numerical.shape, categorical.shape, sep='\n')

(95412, 321)
(95412, 10)


In [4]:
# Preview the first five rows of the numerical features.
numerical.head(n=5)

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,8901,0,3712,60.0,0.0,9.0,0,0,39,34,...,9402,10.0,9512,8911,4.0,7.741935,95515,0,4,39.0
1,9401,1,5202,46.0,6.0,9.0,16,0,15,55,...,9512,25.0,9512,9310,18.0,15.666667,148535,0,2,1.0
2,9001,1,0,61.611649,3.0,1.0,2,0,20,29,...,9207,5.0,9512,9001,12.0,7.481481,15078,1,4,60.0
3,8701,0,2801,70.0,1.0,4.0,2,0,23,14,...,9411,10.0,9512,8702,9.0,6.8125,172556,1,4,41.0
4,8601,0,2001,78.0,3.0,2.0,60,1,28,9,...,9601,15.0,9601,7903,14.0,6.864865,7112,1,2,26.0


In [5]:
# Preview the first five rows of the categorical features.
categorical.head(n=5)

Unnamed: 0,STATE,ZIP,CLUSTER,HOMEOWNR,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B
0,IL,61081,36,40,3,L,E,C,T,2
1,CA,91326,14,H,3,L,G,A,S,1
2,NC,27017,43,U,3,L,E,C,R,2
3,CA,95953,44,U,3,L,E,C,R,2
4,FL,33176,16,H,3,L,F,A,S,2


In [6]:
# Remove the ZIP, CLUSTER and DOMAIN_B columns from the categorical features.
categorical = categorical.drop(columns=['ZIP', 'CLUSTER', 'DOMAIN_B'])

In [7]:
# Frequency of each STATE level.
categorical.loc[:, 'STATE'].value_counts()

OTHER    30457
CA       17343
FL        8376
TX        7535
IL        6420
MI        5654
NC        4160
WA        3577
GA        3403
IN        2980
WI        2795
MO        2712
Name: STATE, dtype: int64

In [8]:
# Frequency of each DATASRCE level.
categorical.loc[:, 'DATASRCE'].value_counts()

3    64829
2    23455
1     7128
Name: DATASRCE, dtype: int64

In [9]:
#  Decide if any columns need their dtype changed.

In [10]:
# List the dtype of every column in `numerical`.
numerical.dtypes

ODATEDW       int64
TCODE         int64
DOB           int64
AGE         float64
INCOME      float64
             ...   
AVGGIFT     float64
CONTROLN      int64
HPHONE_D      int64
RFA_2F        int64
CLUSTER2    float64
Length: 321, dtype: object

In [11]:
# List the dtype of every column in `categorical`.
categorical.dtypes

STATE       object
HOMEOWNR    object
DATASRCE     int64
RFA_2R      object
RFA_2A      object
GEOCODE2    object
DOMAIN_A    object
dtype: object

In [12]:
# DATASRCE is stored as integers; cast it to object so that it is handled
# like the other categorical columns.
categorical = categorical.astype({'DATASRCE': 'object'})

In [13]:
# Re-check the dtypes of `categorical` after the DATASRCE cast.
categorical.dtypes

STATE       object
HOMEOWNR    object
DATASRCE    object
RFA_2R      object
RFA_2A      object
GEOCODE2    object
DOMAIN_A    object
dtype: object

In [14]:
# Scale the numerical features
def standard_scaler(df):
    """Return a standardized copy of ``df``.

    A fresh ``StandardScaler`` is fit on ``df`` and applied to the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        New frame with the scaled values, carrying the same column labels
        AND the same row index as ``df``. The input frame is not modified.
    """
    scaled_values = StandardScaler().fit_transform(df)
    # Keep the original index: the scaler returns a bare array, and rebuilding
    # the frame without `index=` would reset the labels to 0..n-1, which can
    # misalign rows in a later index-aligned operation such as pd.concat(axis=1).
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

# Standardize every numerical column; the bare name on the last line displays
# the resulting frame.
# NOTE(review): the scaler is fit on all rows here, before the train/test split
# performed further down, so held-out rows contribute to the scaling statistics.
# Consider fitting the scaler on the training rows only.
numerical_scaled = standard_scaler(numerical)
numerical_scaled

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,-0.699843,-0.056847,0.463551,-1.115292e-01,-1.312604,0.717411,-0.356881,-0.206977,0.745798,0.284659,...,-0.230581,-0.523992,-0.733220,-0.701177,-0.525458,-0.520509,-0.004760,-1.001238,1.948226,0.398135
1,0.755960,-0.055799,1.162350,-1.080356e+00,1.295872,0.717411,1.362283,-0.206977,-1.346527,1.675602,...,0.405737,0.550771,-0.733220,0.544171,1.275844,0.215310,0.954282,-1.001238,0.083847,-1.628365
2,-0.408683,-0.055799,-1.277349,-4.917093e-16,-0.008366,-2.239375,-0.141985,-0.206977,-0.910626,-0.046518,...,-1.358599,-0.882247,-0.733220,-0.420271,0.503857,-0.544692,-1.459730,0.998764,1.948226,1.518043
3,-1.282165,-0.056847,0.036299,5.804901e-01,-0.877858,-1.130581,-0.141985,-0.206977,-0.649086,-1.040049,...,-0.178519,-0.523992,-0.733220,-1.353502,0.117864,-0.606808,1.388782,0.998764,1.948226,0.504793
4,-1.573325,-0.056847,-0.338895,1.134106e+00,-0.008366,-1.869777,6.089983,-0.009563,-0.213185,-1.371225,...,0.920576,-0.165738,1.074423,-3.847319,0.761186,-0.601946,-1.603822,0.998764,0.083847,-0.295141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1.338282,-0.055799,-1.277349,-4.917093e-16,-1.312604,0.717411,-0.356881,2.556819,0.484257,1.145719,...,0.926361,0.550771,1.094733,1.455554,-0.010800,1.081920,1.606058,-1.001238,-0.848342,-1.041747
95408,1.338282,-0.055799,1.068082,-9.419524e-01,1.730618,0.717411,-0.249433,-0.206977,0.048356,0.880777,...,0.932145,0.192517,1.115044,1.458675,-0.010800,0.617665,0.487079,0.998764,-0.848342,-1.575036
95409,1.047121,-0.055799,0.505291,-1.115292e-01,-1.312604,0.717411,-0.356881,-0.206977,-1.084987,1.079483,...,0.342105,-0.523992,1.257218,0.856289,-0.654122,-0.470019,1.697820,0.998764,1.016037,0.131490
95410,-1.573325,-0.056847,0.600966,-2.499331e-01,1.730618,0.717411,-0.356881,-0.206977,-0.213185,0.350894,...,0.961069,0.049215,3.105481,-1.634408,-0.525458,-0.111555,-1.647577,0.998764,1.948226,-1.095076


In [15]:
# One-hot encode every remaining categorical column.
columns_to_encode = categorical.columns.tolist()
categorical_encoded = pd.get_dummies(categorical.loc[:, columns_to_encode])
categorical_encoded

  uniques = Index(uniques)


Unnamed: 0,STATE_CA,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_OTHER,STATE_TX,...,RFA_2A_G,GEOCODE2_A,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0,0,0,0,0,0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,0
95408,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
95409,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
95410,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [16]:
# Join the scaled numerical features, the encoded categorical features and the
# donation amount side by side, then keep only the rows with a positive
# donation (TARGET_D > 0).
y = pd.read_csv('target.csv').drop(columns=['Unnamed: 0', 'TARGET_B'])
data = pd.concat([numerical_scaled, categorical_encoded, y], axis='columns')
df = data.loc[data['TARGET_D'] > 0]

In [17]:
# Display the donor-only frame (rows with TARGET_D > 0).
df

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,GEOCODE2_A,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,TARGET_D
20,-1.282165,-0.054750,0.411493,2.687462e-02,-0.008366,0.347813,0.717596,0.187851,-0.474725,0.682071,...,1,0,0,0,0,0,1,0,0,4.0
30,-0.408683,-0.056847,-1.277349,-4.917093e-16,-1.312604,0.717411,-0.356881,-0.009563,0.571437,1.874308,...,1,0,0,0,0,0,0,1,0,7.0
45,-1.573325,-0.056847,0.181217,3.036823e-01,0.861126,0.717411,0.180358,-0.206977,0.222716,-0.377695,...,0,0,1,0,1,0,0,0,0,5.0
78,-0.408683,-0.056847,0.039113,5.112881e-01,1.295872,0.717411,-0.356881,-0.206977,0.309897,-0.642636,...,1,0,0,0,0,0,1,0,0,13.0
93,0.173639,-0.055799,-0.147077,7.880959e-01,-0.877858,-0.021786,0.717596,-0.206977,-0.823446,1.543131,...,1,0,0,0,0,0,1,0,0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95298,-1.573325,-0.054750,1.210187,-1.149558e+00,-1.312604,0.717411,-0.356881,-0.206977,1.268879,-0.112753,...,1,0,0,0,0,0,0,1,0,20.0
95309,0.755960,-0.056847,0.927384,-7.343466e-01,0.861126,-0.391384,-0.249433,-0.009563,0.135536,0.880777,...,0,1,0,0,0,0,1,0,0,15.0
95398,-1.573325,-0.056847,-0.756768,1.687721e+00,-1.312604,0.717411,-0.356881,-0.009563,0.135536,-0.576401,...,0,1,0,0,0,0,1,0,0,3.0
95403,-0.408683,-0.056847,0.599090,-2.499331e-01,0.426380,0.717411,-0.356881,-0.206977,-0.561905,1.079483,...,0,0,0,1,0,1,0,0,0,10.0


In [18]:
# Separate the predictors from the regression target (donation amount).
X_final = df.drop(columns=['TARGET_D'])
y_final = df.loc[:, 'TARGET_D']

In [19]:
# Display the target series (TARGET_D for the donor rows).
y_final

20        4.0
30        7.0
45        5.0
78       13.0
93       10.0
         ... 
95298    20.0
95309    15.0
95398     3.0
95403    10.0
95410    18.0
Name: TARGET_D, Length: 4843, dtype: float64

In [20]:
# Display the predictor frame (every column of the donor frame except TARGET_D).
X_final

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,RFA_2A_G,GEOCODE2_A,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
20,-1.282165,-0.054750,0.411493,2.687462e-02,-0.008366,0.347813,0.717596,0.187851,-0.474725,0.682071,...,0,1,0,0,0,0,0,1,0,0
30,-0.408683,-0.056847,-1.277349,-4.917093e-16,-1.312604,0.717411,-0.356881,-0.009563,0.571437,1.874308,...,0,1,0,0,0,0,0,0,1,0
45,-1.573325,-0.056847,0.181217,3.036823e-01,0.861126,0.717411,0.180358,-0.206977,0.222716,-0.377695,...,0,0,0,1,0,1,0,0,0,0
78,-0.408683,-0.056847,0.039113,5.112881e-01,1.295872,0.717411,-0.356881,-0.206977,0.309897,-0.642636,...,0,1,0,0,0,0,0,1,0,0
93,0.173639,-0.055799,-0.147077,7.880959e-01,-0.877858,-0.021786,0.717596,-0.206977,-0.823446,1.543131,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95298,-1.573325,-0.054750,1.210187,-1.149558e+00,-1.312604,0.717411,-0.356881,-0.206977,1.268879,-0.112753,...,0,1,0,0,0,0,0,0,1,0
95309,0.755960,-0.056847,0.927384,-7.343466e-01,0.861126,-0.391384,-0.249433,-0.009563,0.135536,0.880777,...,0,0,1,0,0,0,0,1,0,0
95398,-1.573325,-0.056847,-0.756768,1.687721e+00,-1.312604,0.717411,-0.356881,-0.009563,0.135536,-0.576401,...,1,0,1,0,0,0,0,1,0,0
95403,-0.408683,-0.056847,0.599090,-2.499331e-01,0.426380,0.717411,-0.356881,-0.206977,-0.561905,1.079483,...,0,0,0,0,1,0,1,0,0,0


In [35]:
# Hold out 20% of the donor rows as a test set; the fixed random_state keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=100,
)

In [49]:
# Get cross-validated scores for different regression models.
# LinearRegression, RandomForestRegressor, KNeighborsRegressor and
# cross_val_score are already imported in the imports cell at the top of the
# notebook, so they are not re-imported here.
model1 = LinearRegression()
# Seed the forest so the cross-validated score is reproducible between runs.
model2 = RandomForestRegressor(random_state=0)
model3 = KNeighborsRegressor()

model_pipeline = [model1, model2, model3]
model_names = ['Linear Regression', 'Random Forest', 'K-Nearest Neighbors']

# Mean score over 10 folds of the training set, keyed by model name.
# No `scoring` is passed, so each estimator's default scorer is used.
score = {
    name: cross_val_score(model, X_train, y_train, cv=10).mean()
    for name, model in zip(model_names, model_pipeline)
}

print(score)

{'Linear Regression': 0.23800953705252711, 'Random Forest': 0.4752410477379538, 'K-Nearest Neighbors': 0.07038454905277426}


In [41]:
# Fit a baseline Random Forest Regressor and score it on the held-out test set.
model2 = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
y_pred = model2.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean Squared Error (Random Forest Regressor):", mse)
print("R2 Score (Random Forest Regressor):", r2)

Mean Squared Error (Random Forest Regressor): 115.79570619129
R2 Score (Random Forest Regressor): 0.43362746387033835


In [44]:
# Hyperparameter tuning (grid search)

# Wider candidate ranges kept for reference only: they are NOT part of the
# grid searched below.
n_estimators = [200, 500, 1000, 2000, 4000]
min_samples_split = [2, 4, 8, 16, 32]
min_samples_leaf = [1, 2, 3, 4, 5]
max_features = ['sqrt', 'log2']
# Use the None object, not the string 'None': the string is not a valid
# max_samples value for RandomForestRegressor.
max_samples = [None, 0.5, 0.8]

# Grid that is actually searched: 2 x 2 x 2 = 8 combinations.
param_grid = {
    'n_estimators': [50, 100],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
}

# 5-fold cross-validated search over param_grid, starting from the current
# model2 estimator; train scores are recorded alongside validation scores.
grid_search = GridSearchCV(model2, param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}

In [51]:
# Re-fit the Random Forest Regressor with the hyperparameters chosen by the
# grid search and score it on the held-out test set.
# The parameters are read from grid_search.best_params_ rather than copied by
# hand, so this cell stays in sync if the search grid changes.
model2 = RandomForestRegressor(random_state=0, **grid_search.best_params_)
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (Random Forest Regressor):", mse)
print("R2 Score (Random Forest Regressor):", r2)

Mean Squared Error (Random Forest Regressor): 108.60480338978167
R2 Score (Random Forest Regressor): 0.46879914674797607


In [45]:
# Feature importance
# model2 is already fitted on X_train by the preceding cell, so it is not
# refitted here.
# Dedicated names are used so that neither the donor frame `df` nor the
# builtin `sorted` gets overwritten.
feature_importance = pd.DataFrame({
    'columns_name': list(X_train.columns),
    'score_feature_importance': model2.feature_importances_,
})
feature_importance_sorted = feature_importance.sort_values(
    by='score_feature_importance', ascending=False
)
# NOTE(review): CONTROLN appears among the top-ranked features in the recorded
# output; it looks like a record identifier — confirm whether it should be
# excluded from the predictors.
feature_importance_sorted.head(10)

Unnamed: 0,columns_name,score_feature_importance
312,LASTGIFT,0.416051
316,AVGGIFT,0.106174
289,HC14,0.016548
310,MAXRAMNT,0.013718
317,CONTROLN,0.012292
304,NUMPRM12,0.008499
107,ETHC2,0.008451
226,OEDC5,0.00812
323,STATE_GA,0.006788
257,ANC6,0.006303
