# Импорт библиотек и настройка путей

In [1]:
import sys


sys.path.append('..')

In [2]:
import os
from sqlalchemy import create_engine

import pandas as pd
import numpy as np
import scipy.stats as st 
from modules.data_preprocessing import DataProcessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

# Загрузка данных

In [3]:
engine = create_engine(os.getenv('POSTGRESQL_KEY'))
connection = engine.connect()

In [4]:
train = pd.read_sql(
    '''
    SELECT *
    FROM train_data
    ''', 
    con=connection
)

test = pd.read_sql(
    '''
    SELECT *
    FROM test_data
    ''', 
    con=connection
)

In [5]:
connection.close()

# Разделение данных

In [6]:
X, y = DataProcessing(train, is_train=True).transform()
processed_test = DataProcessing(test, is_train=False).transform()

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1)

# Обучение базовых моделей

In [8]:
logreg = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsRestClassifier(LogisticRegression()))
])

In [9]:
logreg.fit(X_train, y_train)

In [10]:
logreg.score(X_valid, y_valid)

0.62705

In [11]:
SVC_model = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsRestClassifier(LinearSVC()))
])

In [12]:
SVC_model.fit(X_train, y_train)



In [13]:
SVC_model.score(X_valid, y_valid)

0.6253

In [14]:
OVR = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsRestClassifier(RandomForestClassifier()))
])

In [15]:
OVR.fit(X_train, y_train)

In [16]:
OVR.score(X_valid, y_valid)

0.78495

In [17]:
precision_score(OVR.predict(X_valid), y_valid.values, average='weighted')

0.7851606566040653

In [18]:
recall_score(OVR.predict(X_valid), y_valid.values, average='weighted')

0.78495

### Precision и recall практически идентичны, так что можно использовать f1-score в качестве нашей метрики

In [19]:
f1_score(OVR.predict(X_valid), y_valid.values, average='weighted')

0.7850424140401648

In [40]:
OVO = Pipeline([
    ('Scaler', StandardScaler()),
    ('Classifier', OneVsOneClassifier(RandomForestClassifier()))
])

In [41]:
OVO.fit(X_train, y_train)

In [42]:
precision_score(OVO.predict(X_valid), y_valid.values, average='weighted')

0.7188757267220366

In [43]:
recall_score(OVO.predict(X_valid), y_valid.values, average='weighted')

0.72065

In [44]:
f1_score(OVO.predict(X_valid), y_valid.values, average='weighted')

0.7191996977866

# Проверим качество оценок модели на кросс-валидации

In [45]:
print(f'Оценка OVR на CV: {np.mean(cross_val_score(OVR, X, y, cv=5))}')

Оценка OVR на CV: 0.67622


In [46]:
print(f'Оценка OVO на CV: {np.mean(cross_val_score(OVO, X, y, cv=5))}')

Оценка OVO на CV: 0.6739
