In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree, naive_bayes, svm

In [3]:
df = pd.read_csv('data/bank_additional_preprocessed.csv', sep=';')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 70 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            41188 non-null  int64  
 1   campaign                       41188 non-null  int64  
 2   pdays                          41188 non-null  int64  
 3   emp.var.rate                   41188 non-null  float64
 4   cons.price.idx                 41188 non-null  float64
 5   cons.conf.idx                  41188 non-null  float64
 6   euribor3m                      41188 non-null  float64
 7   nr.employed                    41188 non-null  float64
 8   y                              41188 non-null  int64  
 9   job_admin.                     41188 non-null  int64  
 10  job_blue-collar                41188 non-null  int64  
 11  job_entrepreneur               41188 non-null  int64  
 12  job_housemaid                  41188 non-null 

### Разделим данные на обучение и тест, нормализуем их

In [5]:
X = df.drop('y', axis=1)
y = df.y

Для удобства обработки разделим признаки на категориальные и вещественные

In [6]:
numeric_cols = np.array(['age', 'campaign', 'pdays', 'emp.var.rate', 'cons.price.idx', 
           'cons.conf.idx', 'euribor3m', 'nr.employed'])
X_numeric = X.drop(numeric_cols, axis=1)

In [7]:
categorical_cols = list(set(X.columns.values.tolist()) - set(numeric_cols))
X_categorical = X[categorical_cols]
for col in categorical_cols:
    X_categorical[col] = X_categorical[col].astype('string')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_categorical[col] = X_categorical[col].astype('string')


In [8]:
print(X_categorical.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 61 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   education_illiterate           41188 non-null  string
 1   previous_1                     41188 non-null  string
 2   month_nov                      41188 non-null  string
 3   job_blue-collar                41188 non-null  string
 4   marital_married                41188 non-null  string
 5   education_professional.course  41188 non-null  string
 6   month_oct                      41188 non-null  string
 7   education_basic.9y             41188 non-null  string
 8   default_yes                    41188 non-null  string
 9   job_technician                 41188 non-null  string
 10  loan_unknown                   41188 non-null  string
 11  poutcome_failure               41188 non-null  string
 12  job_retired                    41188 non-null  string
 13  e

Разделим данные на обучение и тест

In [9]:
X_train, X_test, X_train_cat, X_test_cat, X_train_num, X_test_num, y_train, y_test = train_test_split(X, X_categorical, X_numeric, y, test_size=0.25)

Нормализируем значения

In [10]:
scaler = StandardScaler()
scaler.fit(X_train_num)

X_train_num_sc = scaler.transform(X_train_num)
X_test_num_sc = scaler.transform(X_test_num)

Соединяем воедино все значения

In [11]:
X_train_transform = np.hstack((X_train_num_sc, X_train_cat))
X_test_transform = np.hstack((X_test_num_sc, X_test_cat))

# Обучение алгоритмов

### KNN

In [11]:
%%time

classifier_KNN = KNeighborsClassifier()
classifier_KNN.fit(X_train_transform, y_train)
y_pred_KNN = classifier_KNN.predict(X_test_transform)
score_KNN = classification_report(y_test, y_pred_KNN)

print(score_KNN)

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      9143
           1       0.49      0.24      0.32      1154

    accuracy                           0.89     10297
   macro avg       0.70      0.60      0.63     10297
weighted avg       0.86      0.89      0.87     10297

Wall time: 1min 10s


### Decision Tree

In [12]:
%%time

classifier_DTC = tree.DecisionTreeClassifier()
classifier_DTC.fit(X_train_transform, y_train)
y_pred_DTC = classifier_DTC.predict(X_test_transform)
score_DTC = classification_report(y_test, y_pred_DTC)

print(score_DTC)

              precision    recall  f1-score   support

           0       0.91      0.94      0.93      9143
           1       0.37      0.26      0.31      1154

    accuracy                           0.87     10297
   macro avg       0.64      0.60      0.62     10297
weighted avg       0.85      0.87      0.86     10297

Wall time: 1.35 s


### Naive Bayes

In [13]:
%%time

classifier_NB = naive_bayes.GaussianNB()
classifier_NB.fit(X_train_transform, y_train)
y_pred_NB = classifier_NB.predict(X_test_transform)
score_NB = classification_report(y_test, y_pred_NB)

print(score_NB)

              precision    recall  f1-score   support

           0       0.93      0.85      0.89      9143
           1       0.30      0.52      0.38      1154

    accuracy                           0.81     10297
   macro avg       0.62      0.68      0.63     10297
weighted avg       0.86      0.81      0.83     10297

Wall time: 571 ms


### Support Vector Machine

In [14]:
%%time

classifier_CVM = svm.SVC(kernel='linear')
classifier_CVM.fit(X_train_transform, y_train)
y_pred_CVM = classifier_CVM.predict(X_test_transform)
score_CVM = classification_report(y_test, y_pred_CVM)

print(score_CVM)

              precision    recall  f1-score   support

           0       0.91      0.99      0.94      9143
           1       0.64      0.20      0.30      1154

    accuracy                           0.90     10297
   macro avg       0.77      0.59      0.62     10297
weighted avg       0.88      0.90      0.87     10297

Wall time: 4min 42s


### Logistic Regression

In [15]:
%%time

classifier_LR = LogisticRegression(solver='liblinear', class_weight='balanced')
classifier_LR.fit(X_train_transform, y_train)
y_pred_LR = classifier_LR.predict(X_test_transform)
score_LR = classification_report(y_test, y_pred_LR)

print(score_LR)

              precision    recall  f1-score   support

           0       0.94      0.84      0.89      9143
           1       0.31      0.59      0.41      1154

    accuracy                           0.81     10297
   macro avg       0.63      0.71      0.65     10297
weighted avg       0.87      0.81      0.83     10297

Wall time: 1.85 s


# Подбор гиперпараметров

### KNN

In [16]:
%%time

parameters = {'n_neighbors': [1, 2, 3, 4, 5]}

classifier_KNN = KNeighborsClassifier()

clf_KNN = GridSearchCV(classifier_KNN, parameters)
clf_KNN.fit(X_train_transform, y_train)

Wall time: 12min 52s


GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5]})

In [17]:
best_estimator_KNN = clf_KNN.best_estimator_

print(best_estimator_KNN)

KNeighborsClassifier(n_neighbors=4)


### Decision Tree

In [18]:
%%time

parameters = {
    'max_depth': [1, 3, 5, 10, 100],
    'min_samples_split': [2, 3, 5, 8]
}

classifier_DTC = tree.DecisionTreeClassifier()

clf_DTC = GridSearchCV(classifier_DTC, parameters)
clf_DTC.fit(X_train_transform, y_train)

Wall time: 3min 51s


GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 3, 5, 10, 100],
                         'min_samples_split': [2, 3, 5, 8]})

In [19]:
best_estimator_DTC = clf_DTC.best_estimator_

print(best_estimator_DTC)

DecisionTreeClassifier(max_depth=3)


### Naive Bayes

In [20]:
%%time

parameters = {
    'var_smoothing': [1e-10, 1e-9, 1e-8, 1e-5, 1e-2, 1e-1, 1, 10, 100]
}

classifier_NB = naive_bayes.GaussianNB()

clf_NB = GridSearchCV(classifier_NB, parameters)
clf_NB.fit(X_train_transform, y_train)

Wall time: 1min 48s


GridSearchCV(estimator=GaussianNB(),
             param_grid={'var_smoothing': [1e-10, 1e-09, 1e-08, 1e-05, 0.01,
                                           0.1, 1, 10, 100]})

In [21]:
best_estimator_NB = clf_NB.best_estimator_

print(best_estimator_NB)

GaussianNB(var_smoothing=10)


### Support Vector Machine

In [22]:
%%time

parameters = {
    'kernel': ['linear', 'rbf']
}

classifier_CVM = svm.SVC()

clf_CVM = GridSearchCV(classifier_CVM, parameters, cv=3)
clf_CVM.fit(X_train_transform, y_train)

Wall time: 29min 20s


GridSearchCV(cv=3, estimator=SVC(), param_grid={'kernel': ['linear', 'rbf']})

In [23]:
best_estimator_CVM = clf_CVM.best_estimator_

print(best_estimator_CVM)

SVC(kernel='linear')


### Logistic Regression

In [24]:
%%time

parameters = {
    'class_weight': ['balanced'],
    'solver': ['liblinear', 'lbfgs', 'newton-cg'],
    'penalty': ['l1', 'l2'],
}

classifier_LR = LogisticRegression()

clf_LR = GridSearchCV(classifier_LR, parameters, cv=3)
clf_LR.fit(X_train_transform, y_train)


Traceback (most recent call last):
  File "D:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "D:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "D:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.

Wall time: 42.5 s


GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'class_weight': ['balanced'], 'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'lbfgs', 'newton-cg']})

In [25]:
best_estimator_LR = clf_LR.best_estimator_

print(best_estimator_LR)

LogisticRegression(class_weight='balanced', penalty='l1', solver='liblinear')
