In [1]:
import pandas as pd
from sklearn import metrics
from sklearn import ensemble  
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier

In [2]:
dataset = pd.read_csv(r'D:\Fork\data-analysis\Data\Bank Marketing Data Set\bank_preprocessed.csv', sep=',')

In [3]:
dataset.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y_no,y_yes
0,30,1787,19,79,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
1,33,4789,11,220,1,339,4,0,0,0,...,1,0,0,0,1,0,0,0,1,0
2,35,1350,16,185,1,330,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,30,1476,3,199,4,-1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
4,59,0,5,226,1,-1,0,0,1,0,...,1,0,0,0,0,0,0,1,1,0


In [4]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 52].values

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

 Метод опорных векторов (Support Vector Machines)

In [7]:
from sklearn import svm
svm = svm.SVC()
svm.fit(X_train, y_train)

SVC()

In [8]:
y_pred = svm.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

[[798   1]
 [  7  99]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       799
           1       0.99      0.93      0.96       106

    accuracy                           0.99       905
   macro avg       0.99      0.97      0.98       905
weighted avg       0.99      0.99      0.99       905



 Бэггинг

In [9]:
%%time
bg = ensemble.BaggingClassifier(svm, n_jobs = -1)
bg.fit(X_train, y_train)
y_pred = bg.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

[[798   1]
 [  9  97]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       799
           1       0.99      0.92      0.95       106

    accuracy                           0.99       905
   macro avg       0.99      0.96      0.97       905
weighted avg       0.99      0.99      0.99       905

Wall time: 1.88 s


In [10]:
bagging_parameters_grid = { 'n_estimators' : [1, 50, 100],
                           'max_features' : range(1,10,1)
    
}
bagging_grid = GridSearchCV(bg, bagging_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
bagging_grid.fit(X_train, y_train)
bagging_grid.best_params_, bagging_grid.best_score_

y_pred = bagging_grid.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[[792   7]
 [ 93  13]]
              precision    recall  f1-score   support

           0       0.89      0.99      0.94       799
           1       0.65      0.12      0.21       106

    accuracy                           0.89       905
   macro avg       0.77      0.56      0.57       905
weighted avg       0.87      0.89      0.85       905



Бустинг

In [11]:
%%time
bs = ensemble.GradientBoostingClassifier()
bs.fit(X_train, y_train)
y_pred = bs.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

[[799   0]
 [  0 106]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       799
           1       1.00      1.00      1.00       106

    accuracy                           1.00       905
   macro avg       1.00      1.00      1.00       905
weighted avg       1.00      1.00      1.00       905

Wall time: 422 ms


In [12]:
boosting_parameters_grid = { 'learning_rate' : np.arange(0.1, 0.8, 0.1),
                           'n_estimators' : [10, 50, 100],
                            'max_depth' : range(1,10,1)

}

In [13]:
boosting_grid = GridSearchCV(bs, boosting_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
boosting_grid.fit(X_train, y_train)
boosting_grid.best_params_, boosting_grid.best_score_

y_pred = boosting_grid.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Fitting 5 folds for each of 189 candidates, totalling 945 fits
[[799   0]
 [  0 106]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       799
           1       1.00      1.00      1.00       106

    accuracy                           1.00       905
   macro avg       1.00      1.00      1.00       905
weighted avg       1.00      1.00      1.00       905



 Стекинг

In [14]:
estimators = [
('svm', SVC()),
('knn', KNeighborsClassifier(algorithm = 'brute',metric = 'manhattan', n_neighbors = 9))
]

In [15]:
%%time
sk = ensemble.StackingClassifier(estimators=estimators)
sk.fit(X_train, y_train)
y_pred = sk.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

[[798   1]
 [  4 102]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       799
           1       0.99      0.96      0.98       106

    accuracy                           0.99       905
   macro avg       0.99      0.98      0.99       905
weighted avg       0.99      0.99      0.99       905

Wall time: 1.53 s


In [16]:
stacking_parameters_grid = { 'stack_method' : ['auto', 'pred_proba', 'solution_function', 'predict'],
                            'cv' : [1, 2, 3, 4, 5, 6, 7, 8, 9]
    
}

In [17]:
stacking_grid = GridSearchCV(sk, stacking_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
stacking_grid.fit(X_train, y_train)
stacking_grid.best_params_, stacking_grid.best_score_
y_pred = stacking_grid.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
        nan 0.99529852 0.99557438        nan        nan 0.99529852
 0.995851          nan        nan 0.99529852 0.995851          nan
        nan 0.99529852 0.99557438        nan        nan 0.99529852
 0.995851          nan        nan 0.99529852 0.99557476        nan
        nan 0.99529852 0.99557476        nan        nan 0.99529852]
[[798   1]
 [  3 103]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       799
           1       0.99      0.97      0.98       106

    accuracy                           1.00       905
   macro avg       0.99      0.99      0.99       905
weighted avg       1.00      1.00      1.00       905

