In [1]:
### pipelines

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [3]:
import pandas as pd
import numpy as np
import os 
# Capture and print the current working directory; the CSV loaded later is
# addressed by a relative file name, so this shows where it is looked up.
path = os.getcwd()
print(path)

/Users/dang/Desktop/Learning/Mygit/datascience_supplychain_practice/Jupyternotebook_source/Section_24_Machine_learning


In [4]:
# Load the banking dataset from a CSV file in the working directory.
banking = pd.read_csv(filepath_or_buffer='bank-full.csv')

In [5]:
# Lookup used to turn the textual 'yes'/'no' label into a 1/0 integer.
dict_target = dict(yes=1, no=0)

In [6]:
# Encode the 'y' column through dict_target and store it as a new 'target' column.
# Values missing from the lookup become NaN (Series.map semantics).
target_encoded = banking['y'].map(dict_target)
banking['target'] = target_encoded

In [7]:
# The raw label column is redundant once 'target' exists, so remove it.
banking = banking.drop(columns='y')

In [8]:
# Label vector: the encoded 'target' column as a NumPy array.
y  = banking['target'].values

In [9]:
# Feature matrix: everything except the label, with non-numeric columns one-hot
# encoded by pd.get_dummies.
# dtype=float keeps the dummy columns numeric. Recent pandas versions emit bool
# dummies by default, and mixing bool with numeric columns makes `.values` fall
# back to an object-dtype array instead of a plain float matrix.
X_ = banking.drop('target', axis=1)
X = pd.get_dummies(X_, dtype=float).values

In [10]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

In [11]:
# Logistic-regression pipeline: replace NaNs with the column mean, standardize
# the features, then fit the classifier.
p_lr = Pipeline(steps=[
    ('Imputing', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('scaling', StandardScaler()),
    ('logistic', LogisticRegression()),
])

In [14]:
# Random-forest pipeline: replace NaNs with the column mean, standardize the
# features, then fit the forest.
p_rf = Pipeline(steps=[
    ('Imputing', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('scaling', StandardScaler()),
    ('rf', RandomForestClassifier()),
])

In [15]:
# SVM pipeline: replace NaNs with the column mean, standardize the features,
# then fit a support-vector classifier.
# probability=True is needed because the evaluation loop in this notebook calls
# predict_proba to compute ROC AUC, and SVC only provides probability estimates
# when this flag is set. It makes fitting slower, since the probabilities are
# calibrated with an internal cross-validation.
p_svc = Pipeline([('Imputing', SimpleImputer(missing_values=np.nan, strategy='mean')),
('scaling', StandardScaler()),
('SVC', SVC(probability=True))
])

In [17]:
# k-nearest-neighbours pipeline: replace NaNs with the column mean, standardize
# the features (distances are scale-sensitive), then fit the classifier.
p_KNN = Pipeline(steps=[
    ('Imputing', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('scaling', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

In [18]:
# Display the KNN pipeline's repr to check its steps.
p_KNN

Pipeline(steps=[('Imputing', SimpleImputer()), ('scaling', StandardScaler()),
                ('knn', KNeighborsClassifier())])

In [19]:
# Integer candidates 1..10, shared by several of the search grids.
param_range = list(range(1, 11))
# 15 log-spaced values from 1e-5 to 1e5 for the logistic-regression C parameter.
lr_range = np.logspace(-5, 5, num=15)

In [20]:
# Search space for the logistic-regression pipeline. Keys follow the
# '<step>__<param>' form so they reach the 'logistic' step; liblinear is used
# because it supports both the l1 and l2 penalties.
grid_logistic = [
    {
        'logistic__penalty': ['l1', 'l2'],
        'logistic__C': lr_range,
        'logistic__solver': ['liblinear'],
    },
]

In [21]:
# Search space for the random-forest pipeline. Every key must use the
# '<step>__<param>' form (double underscore) so the search can route it to the
# 'rf' step; the former single-underscore 'rf_criterion' is not a valid pipeline
# parameter and makes the search fail.
grid_rf = [{'rf__criterion': ['gini', 'entropy'],
'rf__min_samples_leaf': param_range
}]

In [22]:
# Search space for the SVM pipeline: kernel type and regularization strength C.
grid_svc = [
    {
        'SVC__kernel': ['linear', 'rbf'],
        'SVC__C': param_range,
    },
]

In [23]:
# Search space for the KNN pipeline: number of neighbours to vote.
grid_knn = [
    {'knn__n_neighbors': param_range},
]

In [24]:
# Pipelines to tune; the order must match the order of the grids list.
pipes = [p_lr, p_KNN]

In [25]:
# Parameter grids, listed in the same order as the pipelines in pipes.
grids = [grid_logistic, grid_knn]

In [26]:
# Accumulators filled by the tuning loop, one entry per pipeline:
# best parameters, best cross-validation score, and test-set ROC AUC.
fitted_prams, fitted_score, fitted_roc = [], [], []
# -1 asks scikit-learn to use every available CPU core.
n_jobs = -1

In [27]:
# Tune every pipeline against its matching grid with 3-fold cross-validation
# (model selection on accuracy), then record the winning parameters, the best
# mean CV accuracy, and the ROC AUC on the held-out test split.
# The pipelines and grids are paired by position, so fail loudly if they drift apart.
if len(pipes) != len(grids):
    raise ValueError("pipes and grids must have the same length")

# zip() replaces the hard-coded range(0, 2), so adding a pipeline/grid pair needs
# no further change here. n_jobs (defined earlier as -1) is now forwarded to
# GridSearchCV; previously it was never used and the fits ran one at a time.
for pipe, grid in zip(pipes, grids):
    model = GridSearchCV(pipe, grid, cv=3, scoring='accuracy', verbose=10, n_jobs=n_jobs)
    model.fit(X_train, y_train)
    # Column 1 of predict_proba holds the probability of the positive class (label 1).
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    roc = roc_auc_score(y_test, y_pred_prob)
    fitted_prams.append(model.best_params_)
    fitted_score.append(model.best_score_)
    fitted_roc.append(roc)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV 1/3; 1/30] START logistic__C=1e-05, logistic__penalty=l1, logistic__solver=liblinear
[CV 1/3; 1/30] END logistic__C=1e-05, logistic__penalty=l1, logistic__solver=liblinear;, score=0.883 total time=   0.2s
[CV 2/3; 1/30] START logistic__C=1e-05, logistic__penalty=l1, logistic__solver=liblinear
[CV 2/3; 1/30] END logistic__C=1e-05, logistic__penalty=l1, logistic__solver=liblinear;, score=0.883 total time=   0.1s
[CV 3/3; 1/30] START logistic__C=1e-05, logistic__penalty=l1, logistic__solver=liblinear
[CV 3/3; 1/30] END logistic__C=1e-05, logistic__penalty=l1, logistic__solver=liblinear;, score=0.883 total time=   0.1s
[CV 1/3; 2/30] START logistic__C=1e-05, logistic__penalty=l2, logistic__solver=liblinear
[CV 1/3; 2/30] END logistic__C=1e-05, logistic__penalty=l2, logistic__solver=liblinear;, score=0.886 total time=   0.2s
[CV 2/3; 2/30] START logistic__C=1e-05, logistic__penalty=l2, logistic__solver=liblinear
[CV 2/3; 2/30]

In [28]:
# Best hyperparameters found for each tuned pipeline, in pipes order.
fitted_prams

[{'logistic__C': 0.0013894954943731374,
  'logistic__penalty': 'l2',
  'logistic__solver': 'liblinear'},
 {'knn__n_neighbors': 7}]

In [29]:
# Best cross-validation accuracy for each tuned pipeline, in pipes order.
fitted_score

[0.9012386640123866, 0.894907100199071]

In [30]:
# Test-set ROC AUC for each tuned pipeline, in pipes order.
fitted_roc

[0.911073737139044, 0.8238139324289705]

In [31]:
from random import randint

In [32]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

In [33]:
# Candidate values sampled by the randomized searches over tree-based models:
# depth limit (3 or unlimited), minimum leaf size (1..8), and split criterion.
param_dist = dict(
    max_depth=[3, None],
    min_samples_leaf=range(1, 9),
    criterion=["gini", "entropy"],
)

In [35]:
# Baseline decision tree with default hyperparameters.
# NOTE(review): no random_state is passed, so results may differ between runs.
tree = DecisionTreeClassifier()

In [46]:
# Random forest with default hyperparameters, used as the estimator for a randomized search.
# NOTE(review): no random_state is passed, so results may differ between runs.
rf = RandomForestClassifier()

In [37]:
# Fit the untuned tree on the training split.
tree.fit(X_train, y_train)
# Training-set accuracy; the value is discarded because this is not the cell's last expression.
tree.score(X_train,y_train)
# Keep column 1 of the test-set probability matrix for the ROC AUC computation.
predict_tree = tree.predict_proba(X_test)[:,1]

In [38]:
# Test-set ROC AUC of the untuned decision tree.
roc_auc_score(y_test, predict_tree)

0.6981533246352463

In [41]:
# Randomized hyperparameter search over the decision tree with 5-fold CV.
# NOTE(review): no random_state is passed, so the sampled candidates vary between runs.
tree_cv = RandomizedSearchCV(
    estimator=tree,
    param_distributions=param_dist,
    cv=5,
)

In [47]:
# Randomized hyperparameter search over the random forest with 5-fold CV.
# NOTE(review): no random_state is passed, so the sampled candidates vary between runs.
rf_cv = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    cv=5,
)

In [48]:
# Run both randomized searches on the training split; as the last expression,
# the fitted rf_cv object is what the cell displays.
tree_cv.fit(X_train, y_train)
rf_cv.fit(X_train,y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, None],
                                        'min_samples_leaf': range(1, 9)})

In [44]:
# Best mean cross-validation score found by the decision-tree search.
tree_cv.best_score_

0.9000497749367866

In [49]:
# Best mean cross-validation score found by the random-forest search.
rf_cv.best_score_

0.9062153575976785

In [50]:
# Test-set probabilities (column 1) from the tuned decision-tree search object.
cv_tree_predict_prob = tree_cv.predict_proba(X_test)[:,1]

In [51]:
# Test-set ROC AUC of the tuned decision tree.
roc_auc_score(y_test,cv_tree_predict_prob )


0.7595652743769846

In [52]:
# Test-set probabilities (column 1) from the tuned random-forest search object.
cv_rf_predict_prob=rf_cv.predict_proba(X_test)[:,1]

In [53]:
# Test-set ROC AUC of the tuned random forest.
roc_auc_score(y_test,cv_rf_predict_prob )


0.9289618892721607