# Импорт библиотек и настройка путей

In [1]:
import sys


sys.path.append('..')

In [39]:
import os
from sqlalchemy import create_engine

import pandas as pd
import numpy as np
import scipy.stats as st 
from modules.data_preprocessing import DataProcessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

# Загрузка данных

In [3]:
engine = create_engine(os.getenv('POSTGRESQL_KEY'))
connection = engine.connect()

In [4]:
train = pd.read_sql(
    '''
    SELECT *
    FROM train_data
    ''', 
    con=connection
)

test = pd.read_sql(
    '''
    SELECT *
    FROM test_data
    ''', 
    con=connection
)

In [5]:
connection.close()

# Разделение данных

In [6]:
X, y = DataProcessing(train, is_train=True).transform()
processed_test = DataProcessing(test, is_train=False).transform()

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1)

# Обучение базовых моделей

### Логистическая регрессия

In [33]:
logreg = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsRestClassifier(LogisticRegression()))
])

In [34]:
logreg.fit(X_train, y_train)

In [35]:
logreg.score(X_valid, y_valid)

0.6244

### Метод опорных векторов

In [36]:
SVC_model = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsRestClassifier(LinearSVC()))
])

In [37]:
SVC_model.fit(X_train, y_train)



In [38]:
SVC_model.score(X_valid, y_valid)

0.62475

### Дерево

In [40]:
single_tree = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsRestClassifier(DecisionTreeClassifier(random_state=42)))
])

In [41]:
single_tree.fit(X_train, y_train)

In [42]:
single_tree.score(X_valid, y_valid)

0.65005

### Лес OvsR

In [18]:
OVR = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsRestClassifier(RandomForestClassifier(random_state=42)))
])

In [19]:
OVR.fit(X_train, y_train)

In [20]:
OVR.score(X_valid, y_valid)

0.7879

In [21]:
precision_score(OVR.predict(X_valid), y_valid.values, average='weighted')

0.7879425598842134

In [22]:
recall_score(OVR.predict(X_valid), y_valid.values, average='weighted')

0.7879

### Precision и recall практически идентичны, так что можно использовать f1-score в качестве нашей метрики

In [23]:
f1_score(OVR.predict(X_valid), y_valid.values, average='weighted')

0.787905519716533

### Лес OvsO

In [24]:
OVO = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsOneClassifier(RandomForestClassifier()))
])

In [25]:
OVO.fit(X_train, y_train)

In [26]:
precision_score(OVO.predict(X_valid), y_valid.values, average='weighted')

0.7834261451403534

In [27]:
recall_score(OVO.predict(X_valid), y_valid.values, average='weighted')

0.7828

In [28]:
f1_score(OVO.predict(X_valid), y_valid.values, average='weighted')

0.7830575325338245

# Проверим качество оценок моделей случайного леса на кросс-валидации

In [31]:
print(f'Оценка OVR на CV: {np.mean(cross_val_score(OVR, X, y, cv=3))}')

Оценка OVR на CV: 0.6952300584384616


In [32]:
print(f'Оценка OVO на CV: {np.mean(cross_val_score(OVO, X, y, cv=3))}')

Оценка OVO на CV: 0.6920900654377636
