In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load the dataset; the file is comma-separated, which is the pandas default.
# NOTE(review): the preview shows an 'Unnamed: 0' column holding 0..N-1, which
# looks like a row index that was written into the CSV — confirm.
base = pd.read_csv('../data/bank-full.csv_v2')
# Preview the loaded frame.
base

Unnamed: 0,age,education,default,balance,housing,loan,day,month,duration,campaign,...,unemployed,divorced,married,single,cellular,telephone,failure,other,success,y
0,58,3,0,2143,1,0,5,5,261,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,no
1,44,2,0,29,1,0,5,5,151,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,no
2,33,2,0,2,1,1,5,5,76,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,no
3,47,0,0,1506,1,0,5,5,92,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,no
4,33,0,0,1,0,0,5,5,198,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,3,0,825,0,0,17,11,977,3,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,yes
45207,71,1,0,1729,0,0,17,11,456,2,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,yes
45208,72,2,0,5715,0,0,17,11,1127,5,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,yes
45209,57,2,0,668,0,0,17,11,508,4,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,no


In [3]:
# Split the frame into features (X) and target (y).
# The first column, 'Unnamed: 0', is the row index that was saved into the CSV;
# it is not a real predictor, so it is excluded from the features instead of
# being fed to the model. Columns are selected by label rather than position so
# the split keeps working if the column layout changes.
# errors='ignore' keeps this cell valid if the index column is ever dropped
# at load time.
X = base.drop(columns=['Unnamed: 0', 'y'], errors='ignore')
# Target labels ('yes' / 'no').
y = base['y']

In [4]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [5]:
# Base estimator for the hyper-parameter search.
# n_jobs=-1 trains the trees on all available cores.
# random_state is fixed so that the cross-validated scores (and therefore the
# selected best_params_) are reproducible from one run to the next.
rfc = RandomForestClassifier(n_jobs=-1, random_state=0)

In [6]:
# Candidate values for the randomised search over random-forest hyper-parameters.

# 20 tree counts evenly spaced over [500, 1000], truncated to int.
n_estimators = list(map(int, np.linspace(500, 1000, num=20)))
# 5 depth limits evenly spaced over [10, 100], truncated to int, plus None.
max_depth = [*map(int, np.linspace(10, 100, num=5)), None]
# NOTE(review): with min_samples_leaf >= 2 a node already needs at least 4
# samples to be split, so min_samples_split values of 2-4 probably never bind
# — confirm and consider widening or dropping this dimension.
min_samples_split = list(range(2, 5))
min_samples_leaf = list(range(2, 5))

# Search space keyed by RandomForestClassifier parameter name.
parm_space = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)


In [7]:
# Randomised hyper-parameter search: 10 candidates are sampled from parm_space
# and each one is scored with 5-fold cross-validation on the training split.
randomSearch = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=parm_space,
    n_iter=10,        # number of parameter settings sampled
    cv=5,             # folds per candidate
    random_state=0,   # fixes which candidates get sampled
    verbose=3,        # log every individual fold fit
)
randomSearch.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END max_depth=77, min_samples_leaf=4, min_samples_split=3, n_estimators=605;, score=0.906 total time=   2.8s
[CV 2/5] END max_depth=77, min_samples_leaf=4, min_samples_split=3, n_estimators=605;, score=0.902 total time=   1.8s
[CV 3/5] END max_depth=77, min_samples_leaf=4, min_samples_split=3, n_estimators=605;, score=0.903 total time=   1.9s
[CV 4/5] END max_depth=77, min_samples_leaf=4, min_samples_split=3, n_estimators=605;, score=0.907 total time=   1.9s
[CV 5/5] END max_depth=77, min_samples_leaf=4, min_samples_split=3, n_estimators=605;, score=0.905 total time=   1.8s
[CV 1/5] END max_depth=77, min_samples_leaf=2, min_samples_split=2, n_estimators=1000;, score=0.905 total time=   3.1s
[CV 2/5] END max_depth=77, min_samples_leaf=2, min_samples_split=2, n_estimators=1000;, score=0.903 total time=   3.0s
[CV 3/5] END max_depth=77, min_samples_leaf=2, min_samples_split=2, n_estimators=1000;, score=0.906 total time=

In [8]:
# Show the hyper-parameter combination selected by the randomised search.
randomSearch.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_depth': 77}

In [9]:
# Final model, configured with the hyper-parameters reported by
# randomSearch.best_params_ (values copied from that cell's output; update them
# here if the search is re-run and selects a different combination).
# random_state is fixed so the cross-validation scores, test metrics and any
# persisted model are reproducible across runs.
model = RandomForestClassifier(n_estimators=1000,
                               min_samples_split=2,
                               min_samples_leaf=2,
                               max_depth=77,
                               n_jobs=-1,
                               random_state=0)

In [10]:
# 10-fold cross-validation of the tuned model over the full dataset.
# Rows are shuffled before splitting; random_state makes the folds repeatable.
Kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)
scores = cross_val_score(estimator=model, X=X, y=y, cv=Kfold)

In [11]:
# Display the per-fold scores from the 10-fold cross-validation.
scores

array([0.90601504, 0.89648308, 0.90621544, 0.90024331, 0.90356116,
       0.90378235, 0.90953329, 0.90422473, 0.90776377, 0.90842734])

In [12]:
# Fit the final model on the training split only; the test split stays held out.
model.fit(X_train, y_train)

In [13]:
# Predict class labels for the held-out test split.
predict = model.predict(X_test)

In [14]:
# Accuracy on the test split: share of rows whose predicted label matches the true one.
evaluation = accuracy_score(y_true=y_test, y_pred=predict)
evaluation

0.9009178370009953

In [15]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion = confusion_matrix(y_true=y_test, y_pred=predict)
confusion

array([[7753,  227],
       [ 669,  394]], dtype=int64)

In [16]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=y_test, y_pred=predict))

              precision    recall  f1-score   support

          no       0.92      0.97      0.95      7980
         yes       0.63      0.37      0.47      1063

    accuracy                           0.90      9043
   macro avg       0.78      0.67      0.71      9043
weighted avg       0.89      0.90      0.89      9043



In [18]:
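# To persist the trained model, uncomment the lines below. The context manager
# closes the file handle once the dump has finished.
# with open('../model/model.pkl', 'wb') as model_file:
#     pickle.dump(model, model_file)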