# Predictive Modeling
---

## Libraries

In [56]:
import sklearn
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

from sklearn import tree

from sklearn.ensemble import RandomForestClassifier

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn import cross_validation 
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import cross_val_predict

## Import Data and Repeat Data Cleanup

In [24]:
bank_marketing = pd.read_csv("bank_marketing.csv")
processed_bank_marketing = bank_marketing.drop(columns=['default', 'poutcome', 'contact', 'month'])
processed_bank_marketing['total_contact'] = processed_bank_marketing['previous'] + processed_bank_marketing['campaign']
processed_bank_marketing = processed_bank_marketing.drop(columns=['previous', 'campaign', 'pdays'])
binary_response_map = {'yes':1, 'no':0}
processed_bank_marketing['success'] = processed_bank_marketing['y'].map(binary_response_map)
processed_bank_marketing['housing'] = processed_bank_marketing['housing'].map(binary_response_map)
processed_bank_marketing['loan'] = processed_bank_marketing['loan'].map(binary_response_map)
processed_bank_marketing = processed_bank_marketing.drop(columns=['y'])
processed_bank_marketing=pd.get_dummies(data=processed_bank_marketing, columns=['job','marital','education'], 
              prefix=['job','marital','education'])

## Train Model

### Decision Tree

In [50]:
test_size = 0.2
seed = 5
data = processed_bank_marketing.values
X = data[:,:-1] # Features
X
Y = data[:, -1] # Dependent Variable
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)
dt2 = tree.DecisionTreeClassifier(random_state=1, max_depth=2)
dt2.fit(X_train, y_train)
dt2_score_train = dt2.score(X_train, y_train)
print("Training score: ",dt2_score_train)
dt2_score_test = dt2.score(X_test, y_test)
print("Testing score: ",dt2_score_test)

Training score:  0.9582411504424779
Testing score:  0.9602209944751381


### Random Forest

In [48]:
rf = RandomForestClassifier(n_estimators = 50, random_state = seed)
rf.fit(X_train, y_train)
rf_pred=rf.predict(X_test)
print('Accuracy:',metrics.accuracy_score(y_test,rf_pred))
print('Precision:', metrics.precision_score(y_test,rf_pred,average='macro'))
print('Recall:', metrics.recall_score(y_test,rf_pred,average='macro'))

Accuracy: 0.9977900552486187
Precision: 0.9988518943742825
Recall: 0.9722222222222222


### Gaussian Naive Bayes

In [51]:
clf = GaussianNB()
clf.fit(X_train,y_train)
pred=clf.predict(X_test)
acc=accuracy_score(pred, y_test)
print('Accuracy:',metrics.accuracy_score(y_test,pred))
print('Precision:', metrics.precision_score(y_test,pred,average='macro'))
print('Recall:', metrics.recall_score(y_test,pred,average='macro'))

Accuracy: 0.9911602209944751
Precision: 0.9954389965792474
Recall: 0.8888888888888888


### Cross Validation

In [76]:
clf = GaussianNB()

In [83]:
print(cross_val_score(clf, X_train, y_train, scoring='accuracy', cv = 10))

[0.98898072 0.99447514 0.98895028 1.         0.99171271 0.99445983
 0.99445983 0.99722992 0.99168975 0.99168975]


In [78]:
accuracy = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv = 10).mean() * 100

In [79]:
accuracy

99.33647923672983