# Predictive Modeling
---

## Libraries

In [1]:
import sklearn
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

from sklearn import tree

from sklearn.ensemble import RandomForestClassifier

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn import cross_validation 
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import cross_val_predict



## Import Data and Repeat Data Cleanup

### Dataset 1: categorical features one-hot encoded

In [2]:
# Load the raw campaign data; everything below is derived from this frame.
bank_marketing = pd.read_csv("bank_marketing.csv")

# 'yes'/'no' answers are encoded as 1/0.
binary_response_map = {'yes': 1, 'no': 0}

processed_bank_marketing = (
    bank_marketing
    .drop(columns=['default', 'poutcome', 'contact', 'month'])
    # Collapse the two contact counters into a single feature.
    .assign(total_contact=lambda frame: frame['previous'] + frame['campaign'])
    .drop(columns=['previous', 'campaign', 'pdays'])
    # `success` is the encoded target `y`; housing/loan are encoded in place.
    .assign(
        success=lambda frame: frame['y'].map(binary_response_map),
        housing=lambda frame: frame['housing'].map(binary_response_map),
        loan=lambda frame: frame['loan'].map(binary_response_map),
    )
    .drop(columns=['y'])
)

# One-hot encode the remaining categorical columns.
processed_bank_marketing = pd.get_dummies(
    data=processed_bank_marketing,
    columns=['job', 'marital', 'education'],
    prefix=['job', 'marital', 'education'],
)
processed_bank_marketing.head(5)

Unnamed: 0,age,balance,housing,loan,day,duration,total_contact,success,job_admin.,job_blue-collar,...,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown
0,30,1787,0,0,19,79,1,0,0,0,...,0,1,0,0,1,0,1,0,0,0
1,33,4789,1,1,11,220,5,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,35,1350,1,0,16,185,2,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,30,1476,1,1,3,199,4,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,59,0,1,0,5,226,1,0,0,1,...,0,0,0,0,1,0,0,1,0,0


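### Dataset 2: categorical features dropped

This cell assigns to the same `processed_bank_marketing` variable as Dataset 1, so whichever of the two cells is run last supplies the data used in **Train Model**.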

In [22]:
# Reload the raw campaign data so this cell does not depend on earlier cells.
bank_marketing = pd.read_csv("bank_marketing.csv")

# 'yes'/'no' answers are encoded as 1/0.
binary_response_map = {'yes': 1, 'no': 0}

processed_bank_marketing = (
    bank_marketing
    .drop(columns=['default', 'poutcome', 'contact', 'month'])
    # Collapse the two contact counters into a single feature.
    .assign(total_contact=lambda frame: frame['previous'] + frame['campaign'])
    .drop(columns=['previous', 'campaign', 'pdays'])
    # `success` is the encoded target `y`; housing/loan are encoded in place.
    .assign(
        success=lambda frame: frame['y'].map(binary_response_map),
        housing=lambda frame: frame['housing'].map(binary_response_map),
        loan=lambda frame: frame['loan'].map(binary_response_map),
    )
    # Unlike the one-hot variant, the categorical columns are simply removed.
    .drop(columns=['y', 'job', 'marital', 'education'])
)
processed_bank_marketing.head(5)

Unnamed: 0,age,balance,housing,loan,day,duration,total_contact,success
0,30,1787,0,0,19,79,1,0
1,33,4789,1,1,11,220,5,0
2,35,1350,1,0,16,185,2,0
3,30,1476,1,1,3,199,4,0
4,59,0,1,0,5,226,1,0


## Train Model

In [23]:
# Hold out 20% of the rows for testing; `seed` fixes the shuffle.
test_size = 0.2
seed = 5

# Target vector and feature matrix as plain arrays.
Y = processed_bank_marketing['success'].values
X = processed_bank_marketing.drop(columns=['success']).values

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

### Decision Tree

In [97]:
# Shallow (depth-2) decision tree fitted on the training split.
dt2 = tree.DecisionTreeClassifier(max_depth=2, random_state=1).fit(X_train, y_train)

# Score on the data the tree was fitted on, then on the held-out split.
dt2_score_train = dt2.score(X_train, y_train)
dt2_score_test = dt2.score(X_test, y_test)

print("Training score: ", dt2_score_train)
print("Testing score: ", dt2_score_test)

Training score:  0.8940818584070797
Testing score:  0.8795580110497238


### Random Forest

In [98]:
# 50-tree random forest, seeded with the same `seed` as the train/test split.
rf = RandomForestClassifier(random_state=seed, n_estimators=50).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Held-out metrics; precision and recall are macro-averaged over the classes.
print('Accuracy:', accuracy_score(y_test, rf_pred))
print('Precision:', metrics.precision_score(y_test, rf_pred, average='macro'))
print('Recall:', metrics.recall_score(y_test, rf_pred, average='macro'))

Accuracy: 0.8839779005524862
Precision: 0.6709693748418122
Recall: 0.5494835543183649


### Gaussian Naive Bayes

In [8]:
# Gaussian Naive Bayes baseline, evaluated on the held-out split.
clf = GaussianNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# Score once, with the arguments in scikit-learn's (y_true, y_pred) order, and
# reuse the value for the report instead of computing accuracy a second time.
acc = accuracy_score(y_test, pred)
print('Accuracy:', acc)
# Precision and recall are macro-averaged over the classes.
print('Precision:', metrics.precision_score(y_test, pred, average='macro'))
print('Recall:', metrics.recall_score(y_test, pred, average='macro'))

Accuracy: 0.8519337016574585
Precision: 0.6277639751552795
Recall: 0.6255707762557078


### Cross Validation

In [100]:
# Fresh, unfitted estimator; the cross-validation calls below take it as input.
clf = GaussianNB()

In [9]:
# Accuracy on each of the 10 cross-validation folds of the training split.
fold_accuracies = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
print(fold_accuracies)

[0.85635359 0.85359116 0.82320442 0.84530387 0.85911602 0.8480663
 0.83149171 0.86426593 0.86703601 0.84722222]


In [102]:
# Mean 10-fold cross-validated accuracy, scaled from a fraction to a percentage.
fold_scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
accuracy = fold_scores.mean() * 100

In [103]:
# Show the mean cross-validated accuracy (in percent) computed above.
accuracy

84.956512331036

### Helper Functions

In [24]:
def train_and_cross_val(classifier, X_train, y_train, cv=10):
    """Cross-validate ``classifier`` and return mean accuracy, precision and recall.

    All three metrics come from a single cross-validation pass, so the
    estimator is fitted once per fold instead of once per fold for every
    metric.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn classifier, passed straight to ``cross_validate``.
    X_train, y_train : array-like
        Training features and labels, passed straight to ``cross_validate``.
        The ``'precision'`` and ``'recall'`` scorers are scikit-learn's
        binary-classification scorers, so ``y_train`` is expected to be binary.
    cv : int or cross-validation splitter, default 10
        Forwarded unchanged to ``cross_validate``.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall)``: the mean over the folds of each
        metric, multiplied by 100 (i.e. expressed in percent).
    """
    # Function-scope import keeps this cell runnable on its own.
    from sklearn.model_selection import cross_validate

    fold_scores = cross_validate(
        classifier,
        X_train,
        y_train,
        scoring=('accuracy', 'precision', 'recall'),
        cv=cv,
        return_train_score=False,  # only the held-out fold scores are used
    )
    accuracy = fold_scores['test_accuracy'].mean() * 100
    precision = fold_scores['test_precision'].mean() * 100
    recall = fold_scores['test_recall'].mean() * 100
    return accuracy, precision, recall

In [25]:
# Cross-validate a fresh Gaussian Naive Bayes model with the helper.
clf = GaussianNB()
accuracy, precision, recall = train_and_cross_val(
    classifier=clf, X_train=X_train, y_train=y_train
)

In [26]:
# Mean cross-validated precision (in percent) of the Gaussian Naive Bayes model.
precision

51.036831208455915

In [27]:
# Mean cross-validated accuracy (in percent) of the Gaussian Naive Bayes model.
accuracy

88.57843807444364

In [28]:
# Mean cross-validated recall (in percent) of the Gaussian Naive Bayes model.
recall

34.3321718931475

In [31]:
# Cross-validate a fresh, seeded 50-tree random forest with the helper.
rf = RandomForestClassifier(random_state=seed, n_estimators=50)
accuracy, precision, recall = train_and_cross_val(
    classifier=rf, X_train=X_train, y_train=y_train
)

In [33]:
# Mean cross-validated accuracy (in percent) of the random forest.
accuracy

88.24579726188594

In [34]:
# Mean cross-validated precision (in percent) of the random forest.
precision

48.900415338549216

In [35]:
# Mean cross-validated recall (in percent) of the random forest.
recall

20.261324041811847