# Lab | Final regression model in "Health Care for All" Case

In [1]:
# importing libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

In [2]:
# datasets

categorical = pd.read_csv('categorical.csv')
numerical = pd.read_csv('numerical.csv')
target = pd.read_csv('target.csv')

In [3]:
data = pd.concat([categorical, numerical, target], axis=1)

In [4]:
print(data.shape)
data.head()

(95412, 339)


Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,TARGET_B,TARGET_D
0,IL,36,H,F,3,L,E,C,T,2,...,12.0,10.0,4,7.741935,95515,0,4,39,0,0.0
1,CA,14,H,M,3,L,G,A,S,1,...,25.0,25.0,18,15.666667,148535,0,2,1,0,0.0
2,NC,43,U,M,3,L,E,C,R,2,...,16.0,5.0,12,7.481481,15078,1,4,60,0,0.0
3,CA,44,U,F,3,L,E,C,R,2,...,11.0,10.0,9,6.8125,172556,1,4,41,0,0.0
4,FL,16,H,F,3,L,F,A,S,2,...,15.0,15.0,14,6.864865,7112,1,2,26,0,0.0


In [5]:
# subsetting the data for those who have donated

donors = data[data['TARGET_B'] == 1]

In [6]:
# Train-test split

X = donors.drop(['TARGET_B', 'TARGET_D'], axis=1)
y = donors['TARGET_D']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
numerical_features = X.select_dtypes(include=['int', 'float']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [8]:
# separating numerical and categorical features

X_train_num = X_train[numerical_features]
X_train_cat = X_train[categorical_features]

X_test_num = X_test[numerical_features]
X_test_cat = X_test[categorical_features]

In [9]:
# standardizing numerical features

scaler = StandardScaler()

X_train_num_scaled = scaler.fit_transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

In [10]:
# One-hot encoding categorical features

encoder = OneHotEncoder(drop='first', sparse=False)

X_train_cat_encoded = encoder.fit_transform(X_train_cat)
X_test_cat_encoded = encoder.transform(X_test_cat)



In [11]:
# combining processed features

X_train_processed = np.hstack((X_train_num_scaled, X_train_cat_encoded))
X_test_processed = np.hstack((X_test_num_scaled, X_test_cat_encoded))

In [12]:
# regression model

regression_model = RandomForestRegressor()

regression_model.fit(X_train_processed, y_train)

In [13]:
# making predictions

test_data_num = X.drop(categorical_features, axis=1)

test_data_cat = X[categorical_features]

test_data_num_scaled = scaler.transform(test_data_num)

test_data_cat_encoded = encoder.transform(test_data_cat)

test_data_processed = np.hstack((test_data_num_scaled, test_data_cat_encoded))


donors_predictions = regression_model.predict(test_data_processed)

In [14]:
# evaluating the model

r2 = r2_score(y_test, regression_model.predict(X_test_processed))
mse = mean_squared_error(y_test, regression_model.predict(X_test_processed))
mae = mean_absolute_error(y_test, regression_model.predict(X_test_processed))
rmse = sqrt(mse)

print('R2: ', r2)
print('MSE: ', mse)
print('MAE: ', mae)
print('RMSE: ', rmse)

R2:  0.3909136613426186
MSE:  116.07033932816306
MAE:  4.868259752321981
RMSE:  10.773594540735374


In [18]:
# estimating average donation

average_donation_amount = round(np.mean(donors_predictions), 2)

print('Average donation amount: ', average_donation_amount, '$')

Average donation amount:  15.64 $


In [20]:
# calculating total predicted amount of money from donations

predicted_data = pd.read_csv('predicted_data.csv') 

total_predicted_amount = round((predicted_data['Predicted_Target_B'].sum()) * average_donation_amount, 2)

print('Total predicted donation amount: ', total_predicted_amount, '$')

Total predicted donation amount:  447476.04 $
