In [None]:
import warnings
# Silences every warning category for the rest of the session, which also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

np.set_printoptions(threshold=sys.maxsize)

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [None]:
# Read all data.
# The download folder is named once so the notebook can be pointed at another
# copy of the files by editing a single line; the resulting paths are unchanged.
DATA_DIR = '/Users/erinberardi/Downloads'

categorical = pd.read_csv(f'{DATA_DIR}/categorical.csv')
numerical = pd.read_csv(f'{DATA_DIR}/numerical-5.csv')
target = pd.read_csv(f'{DATA_DIR}/target.csv')

### Separate into numerical and categorical

In [None]:
# Display the frame read from numerical-5.csv.
numerical

In [None]:
# Display the frame read from categorical.csv.
categorical

### Check NaN values

In [None]:
# Number of missing values per categorical column.
categorical.isna().sum()

In [None]:
# Columns removed from the categorical frame before any encoding is done.
cols_to_drop = ['OSOURCE', 'SOLIH', 'VETERANS', 'ZIP', 'Unnamed: 0']
cat = categorical.drop(columns=cols_to_drop)


### The data needs to be all numerical to use the K-Best after checking the Variance

In [None]:
# Display the categorical frame after the column drop.
cat

In [None]:
# Frequency of each GENDER code, with missing values counted as their own row.
cat['GENDER'].value_counts(dropna=False)

In [None]:
# Fold the gender codes J, C and A into "U".
# The result is assigned back rather than calling replace(..., inplace=True) on
# the column selection: the in-place form acts on an intermediate object and is
# not guaranteed to update `cat` (pandas warns about this pattern, and with
# copy-on-write enabled `cat` is left unchanged).
cat["GENDER"] = cat["GENDER"].replace({"J": "U", "C": "U", "A": "U"})

In [None]:
# Feature matrix: the numerical columns followed by the cleaned categorical ones.
X = pd.concat([numerical, cat], axis='columns')
X

In [None]:
# Features and target columns side by side in a single frame.
full_data = pd.concat([X, target], axis='columns')
full_data

In [None]:
# TARGET_B is the label used by the classifier below; TARGET_D is kept in its
# own variable as well.
y, target_d = target['TARGET_B'], target['TARGET_D']
target

# here I train test split to focus on train features encoding and scaling

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Shapes of the four pieces produced by the split.
for part in (X_train, X_test, y_train, y_test):
    display(part.shape)

In [None]:
# Separate each partition by dtype: object columns go to the encoder, numeric
# columns go to the scaler.
X_train_cat, X_train_num = (X_train.select_dtypes(include=kind) for kind in (object, np.number))
X_test_cat, X_test_num = (X_test.select_dtypes(include=kind) for kind in (object, np.number))

### One-Hot-Encoding

In [None]:
# Learn the categories from the training rows only; the first level of every
# feature is dropped.
encoder = OneHotEncoder(drop='first')
encoder.fit(X_train_cat)

cols = encoder.get_feature_names(input_features=X_train_cat.columns)

train_dense = encoder.transform(X_train_cat).toarray()
X_train_cat_encode = pd.DataFrame(train_dense, columns=cols).reset_index(drop=True)
X_train_cat_encode

In [None]:
# Encode the test rows with the encoder that was fitted on the training rows.
cols = encoder.get_feature_names(input_features=X_test_cat.columns)

test_dense = encoder.transform(X_test_cat).toarray()
X_test_cat_encode = pd.DataFrame(test_dense, columns=cols).reset_index(drop=True)
X_test_cat_encode


### Scaling train and test

In [None]:
# Fit the min-max scaler on the training rows only.
transformer = MinMaxScaler()
transformer.fit(X_train_num)
X_train_num_norm = transformer.transform(X_train_num)
print(X_train_num_norm.shape)
# Rebuild a labelled frame, then switch to a fresh 0..n-1 index so it lines up
# with the encoded categorical frame.
X_train_num_scale = pd.DataFrame(
    X_train_num_norm,
    index=X_train_num.index,
    columns=X_train_num.columns,
).reset_index(drop=True)

In [None]:
# Apply the training-fitted scaler to the test rows.
X_test_num_norm = transformer.transform(X_test_num)
print(X_test_num_norm.shape)
X_test_num_scale = pd.DataFrame(
    X_test_num_norm,
    index=X_test_num.index,
    columns=X_test_num.columns,
).reset_index(drop=True)
X_test_num_scale

### Concat numerical and categorical for train and test

In [None]:
# Both halves carry a fresh 0..n-1 index, so the concat pairs them row by row.
X_train = pd.concat([X_train_num_scale, X_train_cat_encode], axis='columns')
# Give the labels the same positional index as the features.
y_train = y_train.reset_index(drop=True)

In [None]:
# Same assembly for the test partition: scaled numerics next to encoded categoricals.
X_test = pd.concat([X_test_num_scale, X_test_cat_encode], axis='columns')
# Give the labels the same positional index as the features.
y_test = y_test.reset_index(drop=True)

# making full df for training set for feature selection

In [None]:
# Training features with their label appended as the last column.
data = pd.concat([X_train, y_train], axis='columns')
data

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Only features whose variance on X_train is above this value are kept.
Var_threshold = 0.02
sel = VarianceThreshold(threshold=Var_threshold).fit(X_train)

temp = pd.DataFrame(sel.transform(X_train))
# Shape before and after the filter.
for frame in (X_train, temp):
    print(frame.shape)

In [None]:
# One boolean per column of X_train: True where the selector keeps the column.
support_mask = sel.get_support()
var_list = list(support_mask)
len(var_list)

In [None]:
# Column count of X_train, for comparison with the length of the mask above.
len(X_train.columns)

In [None]:
# Number of (column name, keep flag) pairs; zip stops at the shorter input.
len(list(zip(X_train.columns,var_list)))

In [None]:
# Names of the X_train columns whose keep flag is False, i.e. the columns the
# variance filter rejects.
drop_list = [name for name, keep in zip(X_train.columns, var_list) if not keep]
len(drop_list)

## Here I can drop these columns but going to check the K-Best option to get even smaller # of features


In [None]:
# Remove the low-variance columns from the training features.
X_train = X_train.drop(columns=drop_list)
X_train

In [None]:
# Remove the same columns from the test features.
X_test = X_test.drop(columns=drop_list)
X_test

# Adding in KBest

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

print(X_train.shape)

# Keep the 25 features with the highest chi-squared score against the label.
kbest_selector = SelectKBest(chi2, k=25)
K_best = kbest_selector.fit_transform(X_train, y_train)

print(K_best.shape)
selected = pd.DataFrame(K_best)
selected.head()

In [None]:
# Fit the selector again to read the per-column chi-squared scores.
model = SelectKBest(chi2, k=25)
model.fit(X_train, y_train)
df = pd.DataFrame({'score': model.scores_})
df['column_name'] = X_train.columns
display(df.shape)
# Fifty best-scoring columns, highest first.
df.sort_values(by='score', ascending=False).head(50)

In [None]:
# Names of the 25 best-scoring columns, highest score first.
ranked = df.sort_values(by='score', ascending=False)
cols = ranked['column_name'].head(25)
col_list = cols.tolist()
col_list

In [None]:
# Training features restricted to the 25 selected columns.
X_train_all = X_train.loc[:, col_list]
X_train_all

In [None]:
# Test features restricted to the same 25 columns.
X_test_all = X_test.loc[:, col_list]

In [None]:
# Display the test features after column selection.
X_test_all

In [None]:
# Selected training features plus the label, used for the correlation check below.
coll_data = pd.concat([X_train_all, y_train], axis='columns')
coll_data

# Looking at multicollinearity

In [None]:
# Pearson correlation between every pair of columns in coll_data.
corr_matrix = coll_data.corr(method='pearson')

# Draw the annotated heatmap on an explicitly created, large axes.
fig, ax = plt.subplots(figsize=(20, 15))
ax = sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

# Plots to look at best multi-collinearity.

In [None]:
# Columns whose distributions are plotted in the next cell.
col_viz = ['HVP1','HVP2','HVP3','HVP4','HVP5','HVP6','HV1','HV2']

In [None]:
# One histogram per column listed in col_viz, each shown as its own figure.
for column in col_viz:
    sns.histplot(x=coll_data[column])
    plt.show()

In [None]:
# Columns removed from the selected feature set further down.
col_to_drop = ['HVP1','HVP2','HVP3','HVP5','HVP6','HV2']
# HVP4 I will keep as it has high correlation with the target column.  Maybe look at correlation with Target D

In [None]:
# Remove the columns listed in col_to_drop from both partitions.
X_train = X_train_all.drop(columns=col_to_drop)
X_test = X_test_all.drop(columns=col_to_drop)
X_test

In [None]:
# Final feature names on the training side.
X_train.columns

In [None]:
# Final feature names on the test side.
X_test.columns

# Let's run a model with the chosen features.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# First baseline: a shallow forest with large leaves.
# random_state is fixed so the printed scores and the confusion matrix below are
# the same on every run, in line with the seeded train/test split.
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf=20,
                             random_state=42)
clf.fit(X_train, y_train)
# score() reports mean accuracy on the given rows.
print('TrainSet = ', clf.score(X_train, y_train))
print('TestSet = ', clf.score(X_test, y_test))

In [None]:
# Class counts of the training labels.
y_train.value_counts()
# unbalanced

In [None]:
# Confusion matrix of the baseline classifier on the held-out test rows.
plot_confusion_matrix(clf, X_test, y_test,cmap=plt.cm.Blues)  
plt.show()

## From the confusion matrix we can see that our model only predicted NO donate.

In [None]:
# Training features and label in one frame so rows can be resampled together.
for_over_sample = pd.concat([X_train, y_train], axis='columns')

In [None]:
# Rows and columns of the frame that will be resampled.
for_over_sample.shape


# Fixing imbalance

In [None]:
from sklearn.utils import resample

# Balancing data - upsampling.
# Split the training frame by label value.
label_column = for_over_sample['TARGET_B']
no_donate = for_over_sample[label_column == 0]
yes_donate = for_over_sample[label_column == 1]

# Draw donor rows with replacement until there are as many as non-donor rows.
yes_donate_oversampled = resample(
    yes_donate,
    replace=True,
    n_samples=len(no_donate),
    random_state=42,
)
for frame in (no_donate, yes_donate_oversampled):
    display(frame.shape)

In [None]:
# Stack the non-donor rows on top of the upsampled donor rows.
oversampled_target = pd.concat([no_donate, yes_donate_oversampled], axis=0)

# Shuffle the rows to avoid yes/no clusters. The shuffle is seeded so that the
# oversampled training set is identical on every run, like the seeded split and
# resample steps that feed it.
oversampled_total = oversampled_target.sample(frac=1, random_state=42)

X_train_oversampled = oversampled_total.drop(['TARGET_B'], axis=1)

y_train_oversampled = oversampled_total['TARGET_B']

In [None]:
# Shapes of the balanced training features and labels.
for part in (X_train_oversampled, y_train_oversampled):
    display(part.shape)

## Re-FIT Classifier with the oversampled train set

In [None]:
# Same forest settings as the baseline, trained on the balanced training set.
# random_state is fixed so the reported metrics are repeatable.
clf2 = RandomForestClassifier(max_depth=5,
                              min_samples_split=20,
                              min_samples_leaf=20,
                              random_state=42)
clf2.fit(X_train_oversampled, y_train_oversampled)
# Score the model that was just fitted. The earlier code scored the baseline
# `clf` here, so the printed numbers did not describe clf2.
print('TrainSet = ', clf2.score(X_train_oversampled, y_train_oversampled))
print('TestSet = ', clf2.score(X_test, y_test))

In [None]:

# Predictions of the oversampled model on the held-out test rows.
pred_RF = clf2.predict(X_test)

# Print each metric with the label text used throughout the notebook.
for label, metric_fn in (
    ('accuracy:', accuracy_score),
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(label, metric_fn(y_test, pred_RF))

### Look how the new oversampled model works on the test set

In [None]:
# Confusion matrix of the oversampled model on the held-out test rows.
plot_confusion_matrix(clf2, X_test, y_test,cmap=plt.cm.Blues)  
plt.show()

# Now we have a model, let's run ALL of the data to gain insights for the business

In [None]:
# Column dtypes of the full, not yet transformed feature matrix.
X.dtypes

In [None]:
# Show the full feature matrix and its label.
display(X, y)

## Scale and encode ALL of the data for model

In [None]:
# Split the full feature matrix by dtype, as was done for the train/test partitions.
Xcat, Xnum = (X.select_dtypes(include=kind) for kind in (object, np.number))
Xcat

In [None]:
# One-hot encode every row; this rebinds `encoder` to one fitted on the full data.
encoder = OneHotEncoder(drop='first')
encoder.fit(Xcat)

cols = encoder.get_feature_names(input_features=Xcat.columns)

all_dense = encoder.transform(Xcat).toarray()
Xcat_encode = pd.DataFrame(all_dense, columns=cols).reset_index(drop=True)
Xcat_encode

In [None]:
# Scale with the MinMaxScaler that was fitted on the training rows. clf2 learned
# its split points on features in that scale; refitting the scaler on the full
# data set would shift the feature values the model is scored on.
# NOTE: `transformer` is rebound in the regression section further down, so this
# cell must run before that section.
Xnum_norm = transformer.transform(Xnum)
print(Xnum_norm.shape)
Xnum_scale = pd.DataFrame(Xnum_norm, index=Xnum.index, columns=Xnum.columns)
# Fresh 0..n-1 index so the frame lines up with the encoded categorical frame.
Xnum_scale.reset_index(drop=True, inplace=True)

In [None]:
# Rebind X to the transformed features: scaled numerics next to encoded categoricals.
X = pd.concat([Xnum_scale, Xcat_encode], axis='columns')

In [None]:
# Display the transformed full feature matrix.
X

# Keep same columns as before (top 25)

In [None]:
# Dropping the columns rejected by the variance filter.
X = X.drop(columns=drop_list)
X.shape

In [None]:
# Columns to keep -- top 25.
X = X.loc[:, col_list]
X.shape

In [None]:
# Remove the columns listed in col_to_drop, matching the features clf2 was trained on.
X = X.drop(columns=col_to_drop)
X.shape

## Re-running Classifier with all the data

### Run the classifier on all of the data, then look at the confusion matrix

In [None]:
##Using the upsampled classifier

# Mean accuracy of clf2 over the whole data set, training rows included.
print('All data = ',clf2.score(X,y))


In [None]:
# Confusion matrix of clf2 over the whole data set.
plot_confusion_matrix(clf2, X,y,cmap=plt.cm.Blues)  
plt.show()

In [None]:
# Predictions of clf2 for every row of the data set.
pred_RF = clf2.predict(X)

# Print each metric with the label text used throughout the notebook.
for label, metric_fn in (
    ('accuracy:', accuracy_score),
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(label, metric_fn(y, pred_RF))

## Finding mean of Target D to understand data Target B results

In [None]:
# Rows of the untransformed data whose TARGET_B equals 1.
is_donor_row = full_data['TARGET_B'] == 1
data_targetD = full_data[is_donor_row]
data_targetD

In [None]:
# Number of rows with TARGET_B == 1.
len(data_targetD['TARGET_D'])

In [None]:
# Frequency of each TARGET_D value among those rows.
data_targetD['TARGET_D'].value_counts()

In [None]:
# Mean TARGET_D among rows with TARGET_B == 1.
# NOTE(review): the revenue cell below hard-codes 15.6, presumably a rounding of this value - confirm.
data_targetD['TARGET_D'].mean()

# Reading the results

In [None]:
# Same whole-data confusion matrix as above, repeated next to its interpretation.
plot_confusion_matrix(clf2, X,y,cmap=plt.cm.Blues)  
plt.show()

In [None]:
predict_y = clf2.predict(X)

# confusion_matrix puts actual classes on rows and predicted classes on columns,
# so the first row holds the actual-0 counts and the second the actual-1 counts.
array = confusion_matrix(y, predict_y)
(Q1, Q2), (Q3, Q4) = array
for count in (Q1, Q2, Q3, Q4):
    print(count)

In [None]:
# Amounts used by every line of the estimate below.
AVG_DONATION = 15.6   # amount credited per donating person
MAIL_COST = 0.68      # amount charged per person predicted to donate

donation_gained = Q4 * AVG_DONATION
unexpected_donation = Q3 * AVG_DONATION
spend_marketing = (Q4 + Q2) * MAIL_COST
no_return_marketing = Q2 * MAIL_COST
overall_revenue = (donation_gained + unexpected_donation) - spend_marketing

revenue_report = (
    ('Donation amount gained:', donation_gained),
    ('Unexpected Donations:', unexpected_donation),
    ('Amount spent (cost) on marketing:', spend_marketing),
    ('Amount lost on marketing:', no_return_marketing),
    ('Overall revenue (donation - cost):', overall_revenue),
)
for label, amount in revenue_report:
    print(label, round(amount, 2))

In [None]:
# Narrative summary built from the confusion-matrix counts (Q2, Q3) and the
# amounts computed in the previous cell.
print('In the above plot we can see that there are    ',Q3,'    people of whom the model predicts they will not donate,')
print('when they actually would donate, AKA falsely predicted non-donators.')
print('We gain a donation amount of    ',unexpected_donation,'    from this group.')
print('There are also   ', Q2 ,'    people of whom the model predicts they will donate, when they actually do not.')
print('We  lose   ', no_return_marketing,'   on mailing costs from this group.')
print('When we look at the average donation amount of someone who actually donates is around 15.62 dollars.')
print('We raise   ', overall_revenue ,'   dollars with this model.')

## Results of the model 


Zooming in on the costs to send the marketing packages: in this scenario we spend a total of 26158 dollars on marketing, of which 24208 dollars does not yield any donation. It is suggested that mailers be sent to our predicted donors as well as to people who have donated in the past.

We can clearly conclude that it's in the best interest of Healthcare 4 All to have a model with the lowest possible number of falsely predicted non-donators. With this model we can see that the donation amount lost is almost equal to the unexpected donations gained. This model would need to be improved to reduce the number of falsely predicted non-donators, i.e. people who would actually donate.

# Moving on to create model to predict HOW MUCH they will donate.

# Predicting amount given among actual donators

#### Using target D dataset created earlier

In [None]:
# Rows with TARGET_B == 1, selected from full_data earlier in the notebook.
data_targetD

#### Breaking into features and target

In [None]:
# Features: everything except the two targets and the 'Unnamed: 0' column.
X_D = data_targetD.drop(columns=['TARGET_D', 'TARGET_B', 'Unnamed: 0'])
# Regression target: the TARGET_D column.
y_D = data_targetD['TARGET_D']
display(X_D, y_D)

#### Train test split

In [None]:
# Hold out 20% of the donor rows; the fixed seed keeps the split repeatable.
split_parts1 = train_test_split(X_D, y_D, test_size=0.2, random_state=42)
X_train1, X_test1, y_train1, y_test1 = split_parts1

In [None]:
# Separate each partition by dtype: object columns go to the encoder, numeric
# columns go to the scaler.
X_train_cat1, X_train_num1 = (X_train1.select_dtypes(include=kind) for kind in (object, np.number))
X_test_cat1, X_test_num1 = (X_test1.select_dtypes(include=kind) for kind in (object, np.number))

## Encoding

In [None]:
# Fit a new encoder on the regression training rows; this rebinds `encoder`.
encoder = OneHotEncoder(drop='first')
encoder.fit(X_train_cat1)

cols1 = encoder.get_feature_names(input_features=X_train_cat1.columns)

train_dense1 = encoder.transform(X_train_cat1).toarray()
X_train_cat_encode1 = pd.DataFrame(train_dense1, columns=cols1).reset_index(drop=True)
X_train_cat_encode1

In [None]:
# Encode the regression test rows with the encoder fitted on the training rows.
cols = encoder.get_feature_names(input_features=X_test_cat1.columns)

test_dense1 = encoder.transform(X_test_cat1).toarray()
X_test_cat_encode1 = pd.DataFrame(test_dense1, columns=cols).reset_index(drop=True)
X_test_cat_encode1

## Scaling

In [None]:
# Fit a new min-max scaler on the regression training rows; this rebinds `transformer`.
transformer = MinMaxScaler()
transformer.fit(X_train_num1)
X_train_num_norm1 = transformer.transform(X_train_num1)
print(X_train_num_norm1.shape)
# Rebuild a labelled frame, then switch to a fresh 0..n-1 index so it lines up
# with the encoded categorical frame.
X_train_num_scale1 = pd.DataFrame(
    X_train_num_norm1,
    index=X_train_num1.index,
    columns=X_train_num1.columns,
).reset_index(drop=True)

In [None]:
# Apply the training-fitted scaler to the regression test rows.
X_test_num_norm1 = transformer.transform(X_test_num1)
print(X_test_num_norm1.shape)
X_test_num_scale1 = pd.DataFrame(
    X_test_num_norm1,
    index=X_test_num1.index,
    columns=X_test_num1.columns,
).reset_index(drop=True)
X_test_num_scale1

In [None]:
# Both halves carry a fresh 0..n-1 index, so the concat pairs them row by row.
X_train1 = pd.concat([X_train_num_scale1, X_train_cat_encode1], axis='columns')
# Give the target the same positional index as the features.
y_train1 = y_train1.reset_index(drop=True)

In [None]:
# Same assembly for the regression test partition.
X_test1 = pd.concat([X_test_num_scale1, X_test_cat_encode1], axis='columns')
# Give the target the same positional index as the features.
y_test1 = y_test1.reset_index(drop=True)

# Feature Selection

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Only features whose variance on X_train1 is above this value are kept.
Var_threshold = 0.02
sel = VarianceThreshold(threshold=Var_threshold).fit(X_train1)

temp = pd.DataFrame(sel.transform(X_train1))
# Shape before and after the filter.
for frame in (X_train1, temp):
    print(frame.shape)
                        

In [None]:
# One boolean per column of X_train1: True where the selector keeps the column.
var_list = list(sel.get_support())
# Number of kept columns, then number of rejected columns.
for flag in (True, False):
    print(var_list.count(flag))

In [None]:
# Pair each regression feature with its keep/drop flag. The mask comes from the
# selector fitted on X_train1, so the names must come from X_train1 as well:
# zipping against the classifier's X_train silently truncates to the shorter
# input and attaches the flags to the wrong column names.
zipped = list(zip(X_train1.columns, var_list))
len(zipped)

In [None]:
# Names of the X_train1 columns rejected by the variance filter.
drop_list = [name for name, keep in zip(X_train1.columns, var_list) if not keep]
print(drop_list)
# Also drop the columns that were removed from the classifier's feature set.
drop_list = drop_list + ['HVP1', 'HVP2', 'HVP3', 'HVP5', 'HVP6', 'HV2']
len(drop_list)

In [None]:
#drop_list

In [None]:
# Remove the listed columns from the regression training features.
X_train1 = X_train1.drop(columns=drop_list)
X_train1

In [None]:
# Remove the same columns from the regression test features.
X_test1 = X_test1.drop(columns=drop_list)
X_test1

# PCA - Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Keep as many components as needed to explain 90% of the variance, fitted on
# the training rows only.
pca = PCA(n_components=0.9).fit(X_train1)

# NOTE(review): in the cells visible here the regression models are fitted on
# X_train1 / X_test1, not on these projections - confirm whether that is intended.
X_train_pca, X_test_pca = (pca.transform(part) for part in (X_train1, X_test1))

corr_pc = pd.DataFrame(X_train_pca).corr()


In [None]:
exp_var_pca = pca.explained_variance_ratio_
cum_sum_eigenvalues = np.cumsum(exp_var_pca)

# Create the visualization plot: bars for each component's share, a step line
# for the running total.
component_idx = range(len(exp_var_pca))
plt.bar(
    component_idx,
    exp_var_pca,
    alpha=0.5,
    align='center',
    label='Individual explained variance',
)
plt.step(
    component_idx,
    cum_sum_eigenvalues,
    where='mid',
    label='Cumulative explained variance',
)
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
# Shapes of the regression partitions after feature selection.
for part in (X_train1, X_test1, y_train1, y_test1):
    print(part.shape)

# Run regression models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor


In [None]:
def models_automation(models, X_train1, y_train1, X_test1, y_test1):
    """Fit each estimator and print its train and test score.

    Parameters
    ----------
    models : iterable
        Objects exposing ``fit(X, y)`` and ``score(X, y)``; each one is fitted
        in place.
    X_train1, y_train1 :
        Data passed to ``fit`` and used for the "Train" score.
    X_test1, y_test1 :
        Data used for the "Test" score.

    Returns
    -------
    None
        One line per estimator is written to standard output.
    """
    for estimator in models:
        estimator.fit(X_train1, y_train1)
        label = estimator.__class__.__name__
        train_score = estimator.score(X_train1, y_train1)
        test_score = estimator.score(X_test1, y_test1)
        print(f"{label}: Train -> {train_score}, Test -> {test_score}")


In [None]:
# Baseline comparison with default hyper-parameters.
# Every estimator that draws random numbers gets a fixed random_state so the
# printed scores are repeatable, in line with the seeded train/test split.
model_list = [LinearRegression(),
              SGDRegressor(random_state=42),
              KNeighborsRegressor(),
              MLPRegressor(random_state=42),
              DecisionTreeRegressor(random_state=42),
              RandomForestRegressor(random_state=42)]
models_automation(model_list, X_train1, y_train1, X_test1, y_test1)

### Optimize parameters for models

In [None]:
# Same comparison with hand-picked hyper-parameters.
# Every estimator that draws random numbers gets a fixed random_state so the
# printed scores are repeatable and comparable with the baseline run above.
model_list = [LinearRegression(),
              SGDRegressor(random_state=42),
              KNeighborsRegressor(n_neighbors=9),
              MLPRegressor(solver='lbfgs', max_iter=1500, random_state=42),
              DecisionTreeRegressor(criterion="poisson", random_state=42),
              RandomForestRegressor(min_samples_split=7,
                                    min_samples_leaf=6,
                                    max_samples=0.4,
                                    max_depth=14,
                                    n_jobs=-1,
                                    random_state=42)]
models_automation(model_list, X_train1, y_train1, X_test1, y_test1)

## RFE option, not used.

In [None]:
# from sklearn.feature_selection import RFE
# from sklearn.tree import DecisionTreeClassifier
# # define the method
# rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=30)
# # fit the model
# rfe.fit(X_train, y_train)
# # transform the data
# X, y = rfe.transform(X_train, y_train)

In [None]:
# RandomForestRegressor(min_samples_split = 7,
#                                     min_samples_leaf = 6,
#                                     max_samples = 0.4,
#                                     max_depth = 14,
#                                     n_jobs = -1))