<a href="https://colab.research.google.com/github/IvanNeverov/DepositsProject/blob/main/Modelling%20TDA%20Response.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Импортируем библиотеки

In [87]:
!pip install catboost -q

In [88]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier, Pool

import seaborn as sb

## Загружаем данные, делаем первичный анализ

Значения переменных yes/no заранее заменил на 1и0 

In [89]:
data = pd.read_excel('/dataset.xlsx')

In [None]:
data.head(6)

In [None]:
data.shape

In [None]:
data.dtypes

In [None]:
data.isna().sum()

In [None]:
data.describe()

In [None]:
data.describe(include='object')

In [None]:
corr = data.corr() # строим матрицу корреляций

corr.style.background_gradient(cmap='coolwarm')

## Строим логистическую регрессию без категориальных переменных

Этап моделирования 

In [97]:
X = data.drop('deposit', axis=1) # матрица объект-признак
y = data['deposit'] # целевая переменная

Сносим категориальные переменные

In [98]:
X_num = X.drop(['job','marital','education','contact','month','poutcome'], axis=1)

Разбиваем на выборки

In [99]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

Xtrain_num, Xtest_num, ytrain_num, ytest_num = train_test_split(X_num, y, test_size=0.25, random_state=42)

Нормируем данные

In [105]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(Xtrain_num)

Xtrain = pd.DataFrame(ss.transform(Xtrain_num), columns=X_num.columns)
Xtest = pd.DataFrame(ss.transform(Xtest_num), columns=X_num.columns)

Xtrain.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous
0,1.581951,-0.122118,2.083507,1.059464,2.545739,0.284318,-0.785527,1.662174,-0.474357,-0.365571
1,0.8233,8.188828,-0.500178,1.059464,-0.392813,-0.545702,-0.258692,-0.183877,1.979568,2.348973
2,-1.115476,8.188828,-0.812078,1.059464,-0.392813,-1.138573,0.982105,-0.183877,-0.474357,-0.365571
3,-0.946887,-0.122118,0.925652,1.059464,-0.392813,0.64004,0.561789,0.554543,-0.474357,-0.365571
4,-0.862592,-0.122118,0.321128,-0.943873,-0.392813,-0.071405,-0.342179,-0.553088,1.225928,0.991701


Оцениваем модель

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression() # объявление модели
model.fit(Xtrain_num, ytrain_num) # обучение модели

In [None]:
pred_test = model.predict(Xtest_num) # предсказание классов
pred_test_proba = model.predict_proba (Xtest_num)
pred_test

Оцениваем результаты

In [None]:
accuracy_score(ytest_num, pred_test)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sb


conf_mx = confusion_matrix(ytest_num, pred_test)
sb.heatmap(conf_mx,annot=True,fmt='4.0f')
plt.title('Confusion Matrix', size = 12)
plt.xlabel ('Predicted values')
plt.ylabel ('Actual values')
conf_mx

In [None]:
ytest_num.value_counts()

In [None]:
from sklearn.metrics import recall_score

recall_score(ytest_num, pred_test)

In [None]:
from sklearn.metrics import precision_score

precision_score(ytest_num, pred_test)

In [None]:
from sklearn.metrics import f1_score

f1_score(ytest_num, pred_test)

In [None]:
coef_table = pd.DataFrame({'features' : list(X_num.columns), 'weights' : list(model.coef_[0])})
coef_table.sort_values(by='weights')

## Обучаем KNN

In [116]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

In [None]:
knn.fit(Xtrain_num, ytrain_num)
pred_test_knn = knn.predict(Xtest_num)
pred_test2 = knn.predict_proba(Xtest_num)[:,1]

classes2 = pred_test2 > 0.5

conf_mx2 = confusion_matrix(ytest_num, classes2)
sb.heatmap(conf_mx2,annot=True,fmt='4.0f')
plt.title('Confusion Matrix', size = 12)
plt.xlabel ('Predicted values')
plt.ylabel ('Actual values')
conf_mx2

Оцениваем KNN

In [None]:
from sklearn.metrics import recall_score

recall_score(ytest_num, classes2)

In [None]:
from sklearn.metrics import precision_score

precision_score(ytest_num, classes2)

In [None]:
from sklearn.metrics import f1_score

f1_score(ytest_num, classes2)

##Dummy Variables

Оборачиваем категориальные переменные в Dummy при помощи OHE

In [121]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

Xtrain, Xtest, Ytrain, Ytest = train_test_split(data.drop('deposit', axis=1), data.deposit, test_size=0.25, random_state=42)
X = data.drop(['deposit','job','marital','contact','poutcome','education','month'], axis =1)
categorical = ['job','marital','education','contact','month','poutcome']

column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown="ignore"), categorical),
    ('scaling', StandardScaler(), X.columns)
])

X_transformed = column_transformer.fit_transform(Xtrain)
X_test_transformed = column_transformer.transform(Xtest)

##Linear Regression with Dummies

Обучаем Логистическую регрессию с категориальными переменными

In [122]:
model2 = LogisticRegression()
model2.fit(X_transformed,Ytrain)
ohe_LgRg = model2.predict(X_test_transformed)
ohe_LgRg_proba = model2.predict_proba(X_test_transformed)

Оцениваем качество

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

print('Accuracy',round(accuracy_score(Ytest,ohe_LgRg),2))
print('Precision',round(precision_score(Ytest,ohe_LgRg),2))
print('Recall',round(recall_score(Ytest,ohe_LgRg),2))
print('ROC AUC',round(roc_auc_score(Ytest,ohe_LgRg_proba [:,1]),2))

## Boost

Обучаем градиентный бустинг

In [124]:
cat = CatBoostClassifier(iterations=1000,
                         depth = 3,
                         learning_rate = 0.01,
                         loss_function='Logloss',
                         eval_metric = 'AUC',
                         verbose=False)
cat.fit(X_transformed, Ytrain)
ohe_boost = cat.predict(X_test_transformed)   
ohe_boost_proba = cat.predict_proba(X_test_transformed)   

Оцениваем результаты

In [None]:
print('Accuracy',round(accuracy_score(Ytest,ohe_boost),2))
print('Precision',round(precision_score(Ytest,ohe_boost),2))
print('Recall',round(recall_score(Ytest,ohe_boost),2))
print('ROC AUC',round(roc_auc_score(Ytest,ohe_boost_proba [:,1]),2))

## Results and Summary

Сводим результаты моделей в одну таблицу

In [126]:
resulting_metrics = pd.DataFrame({
    'Accuracy':[accuracy_score(Ytest,pred_test), accuracy_score(Ytest,ohe_LgRg),accuracy_score(Ytest,pred_test_knn),accuracy_score(Ytest,ohe_boost)],
    'Precision': [precision_score(Ytest,pred_test), precision_score(Ytest,ohe_LgRg), precision_score(Ytest,pred_test_knn),precision_score(Ytest,ohe_boost)],
    'Recall': [recall_score(Ytest,pred_test), recall_score(Ytest,ohe_LgRg), recall_score(Ytest,pred_test_knn),recall_score(Ytest,ohe_boost)],
    'ROC AUC': [roc_auc_score(Ytest,pred_test_proba[:,1]),roc_auc_score(Ytest,ohe_LgRg_proba [:,1]),roc_auc_score(Ytest,pred_test2),roc_auc_score(Ytest,ohe_boost_proba [:,1])]    
}, index= ['LogReg','OHE_LogReg','KNN','OHE_Boost'])
resulting_metrics

Unnamed: 0,Accuracy,Precision,Recall,ROC AUC
LogReg,0.754927,0.76259,0.711409,0.842588
OHE_LogReg,0.811537,0.822134,0.775541,0.902381
KNN,0.747761,0.743688,0.724832,0.800223
OHE_Boost,0.836976,0.82383,0.840418,0.914893
