# Импорт библиотек и настройка путей

In [1]:
import sys
from dotenv import load_dotenv


sys.path.append('..')
load_dotenv()

True

In [2]:
import numpy as np

from modules.data.get_data import get_data
from modules.data.transform_data import transform_data

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, RocCurveDisplay
from sklearn.model_selection import GridSearchCV

# Загрузка данных

In [3]:
data = get_data(is_train=True)

Начало экспорта набора данных, с параметром is_train=True
Тренировочный набор данных успешно экспортирован
       ID Customer_ID     Month           Name   Age          SSN Occupation  \
0  0x1602   CUS_0xd40   January  Aaron Maashoh    23  821-00-0265  Scientist   
1  0x1603   CUS_0xd40  February  Aaron Maashoh    23  821-00-0265  Scientist   
2  0x1604   CUS_0xd40     March  Aaron Maashoh  -500  821-00-0265  Scientist   
3  0x1605   CUS_0xd40     April  Aaron Maashoh    23  821-00-0265  Scientist   
4  0x1606   CUS_0xd40       May  Aaron Maashoh    23  821-00-0265  Scientist   

  Annual_Income  Monthly_Inhand_Salary  Num_Bank_Accounts  ...  Credit_Mix  \
0      19114.12            1824.843333                  3  ...           _   
1      19114.12                    NaN                  3  ...        Good   
2      19114.12                    NaN                  3  ...        Good   
3      19114.12                    NaN                  3  ...        Good   
4      19114.12       

In [4]:
X, y = transform_data(
    dataset=data,
    is_train=True
)

Преобразование данных прошло успешно
    Age  Annual_Income  Monthly_Inhand_Salary  Num_Bank_Accounts  \
0  23.0       19114.12                   1824                  3   
1  23.0       19114.12                   3093                  3   
2  33.0       19114.12                   3093                  3   
3  23.0       19114.12                   3093                  3   
4  23.0       19114.12                   1824                  3   

   Num_Credit_Card  Interest_Rate  Num_of_Loan  Delay_from_due_date  \
0                4              3          4.0                  3.0   
1                4              3          4.0                  0.0   
2                4              3          4.0                  3.0   
3                4              3          4.0                  5.0   
4                4              3          4.0                  6.0   

   Num_of_Delayed_Payment  Changed_Credit_Limit  ...  Outstanding_Debt  \
0                     7.0                 11.27  ... 

# Разделение данных

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    shuffle=True,
    test_size=0.33,
    random_state=21
)

# Обучение базовых моделей

### Логистическая регрессия

In [33]:
logreg = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsRestClassifier(LogisticRegression()))
])

In [34]:
logreg.fit(X_train, y_train)

In [35]:
logreg.score(X_valid, y_valid)

0.6244

### Метод опорных векторов

In [36]:
SVC_model = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsRestClassifier(LinearSVC()))
])

In [37]:
SVC_model.fit(X_train, y_train)



In [38]:
SVC_model.score(X_valid, y_valid)

0.62475

### Дерево

In [40]:
single_tree = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsRestClassifier(DecisionTreeClassifier(random_state=42)))
])

In [41]:
single_tree.fit(X_train, y_train)

In [42]:
single_tree.score(X_valid, y_valid)

0.65005

### Лес OvsR

In [7]:
OVR = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsRestClassifier(RandomForestClassifier(random_state=42)))
])

In [8]:
OVR.fit(X_train, y_train)

In [9]:
OVR.score(X_valid, y_valid)

0.7802424242424243

In [10]:
precision_score(OVR.predict(X_valid), y_valid.values, average='weighted')

0.7800882991712293

In [11]:
recall_score(OVR.predict(X_valid), y_valid.values, average='weighted')

0.7802424242424243

### Precision и recall практически идентичны, так что можно использовать f1-score в качестве нашей метрики

In [31]:
f1_score(OVR.predict(X_valid), y_valid.values, average='weighted')

0.787905519716533

### Лес OvsO

In [36]:
OVO = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsOneClassifier(RandomForestClassifier()))
])

In [37]:
OVO.fit(X_train, y_train)

In [38]:
precision_score(OVO.predict(X_valid), y_valid.values, average='weighted')

0.7755882855576605

In [39]:
recall_score(OVO.predict(X_valid), y_valid.values, average='weighted')

0.7751515151515151

In [40]:
f1_score(OVO.predict(X_valid), y_valid.values, average='weighted')

0.7753415388808937

# Проверим качество оценок моделей случайного леса на кросс-валидации

In [31]:
print(f'Оценка OVR на CV: {np.mean(cross_val_score(OVR, X, y, cv=3))}')

Оценка OVR на CV: 0.6952300584384616


In [32]:
print(f'Оценка OVO на CV: {np.mean(cross_val_score(OVO, X, y, cv=3))}')

Оценка OVO на CV: 0.6920900654377636


### Подводя итоги, мы выбираем модель случайного леса OVR для нашей задачи. Дальше мы будем работать именно с ней

# Обучение с параметрами

In [47]:
OVR_model = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsRestClassifier(RandomForestClassifier(class_weight='balanced', max_depth=60, n_estimators=1500, random_state=42)))
])

In [48]:
OVR_model.fit(X=X_train, y=y_train)

In [49]:
OVR_model.score(X_valid, y_valid)

0.7802727272727272

In [50]:
precision_score(
    y_true=y_valid,
    y_pred=OVR_model.predict(X_valid),
    average='weighted'
)

0.779750265926971

In [51]:
recall_score(
    y_true=y_valid,
    y_pred=OVR_model.predict(X_valid),
    average='weighted'
)

0.7802727272727272

In [52]:
f1_score(
    y_true=y_valid,
    y_pred=OVR_model.predict(X_valid),
    average='weighted'
)

0.7799273776749307