In [50]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

Загрузим датасеты для тренировки модели. Удалим колонку с id пользователя. При тренировке и тестировании модели она не нужна.

In [51]:
# Read the training features and discard the user identifier column,
# which is not used as a model input.
X = (
    pd.read_csv("datas/X_train.zip")
    .drop(columns=['user_id'])
)
X.shape

(19234, 9)

In [52]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,discovered,passed,started_attempt,viewed,correct,wrong,correct_ratio,started_ration,steps_tried
0,1,0,0,1,0.0,0.0,0.0,0.0,0.0
1,9,9,2,9,2.0,0.0,1.0,0.222222,2.0
2,15,15,4,20,4.0,4.0,0.5,0.2,4.0
3,1,1,0,1,2.0,2.0,0.5,0.0,2.0
4,1,1,0,1,0.0,0.0,0.0,0.0,0.0


In [53]:
# Read the training targets; the shape is displayed so it can be compared with X.
y = pd.read_csv("datas/y_train.zip")
y.shape

(19234, 1)

In [54]:
# Preview the first rows of the target table.
y.head()

Unnamed: 0,passed_course
0,0
1,0
2,0
3,0
4,0


Отделим тренировочную и тестовую выборки для определения итоговых параметров модели.

In [55]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Напишем функцию для поиска наилучших параметров модели.

In [56]:
def _build_search_space(estimator_name):
    """Return the ``(estimator, param_grid)`` pair for a supported model name."""
    if estimator_name == 'rfc':
        estimator = RandomForestClassifier(random_state=0)
        # 20 * 19 * 3 * 9 * 10 = 102,600 candidates, each fitted once per CV fold.
        param_grid = {'n_estimators': range(10, 201, 10),
                      'max_depth': range(2, 21),
                      'criterion': ['gini', 'entropy', 'log_loss'],
                      'min_samples_split': range(2, 11),
                      'min_samples_leaf': range(1, 11)}
    elif estimator_name == 'xgb':
        # Seeded like the random forest so repeated runs are comparable.
        estimator = XGBClassifier(random_state=0)
        param_grid = {'max_depth': range(2, 21),
                      'learning_rate': [0.1, 0.01, 0.001],
                      'n_estimators': range(10, 201, 10)}
    else:
        raise ValueError(
            f"Unsupported estimator_name {estimator_name!r}; expected 'rfc' or 'xgb'"
        )

    return estimator, param_grid


def grid_search(X, y, estimator_name='rfc', n_jobs=1):
    """
    Search for the best hyper-parameters of a classifier.

    The data is split 80/20 with a fixed seed. A 5-fold cross-validated grid
    search scored by ROC AUC runs on the larger part, and the best estimator
    is then evaluated on the smaller part (accuracy and ROC AUC are printed).

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` and the estimator.
    y : pandas object holding the binary target; it must expose ``.values``.
    estimator_name : {'rfc', 'xgb'}, default 'rfc'
        Which configuration to tune: ``RandomForestClassifier`` or
        ``XGBClassifier``. The default reproduces the original behaviour.
    n_jobs : int, default 1
        Forwarded to ``GridSearchCV`` to control parallelism.

    Returns
    -------
    (model, name) : the best estimator found by the search, fitted on part of
        the supplied data only, and the short name of its configuration.

    Raises
    ------
    ValueError
        If ``estimator_name`` is not one of the supported names.
    """
    # Resolve the configuration first so an unsupported name fails before any work is done.
    estimator, param_grid = _build_search_space(estimator_name)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Named `search` so the local does not shadow this function's own name.
    search = GridSearchCV(estimator=estimator,
                          param_grid=param_grid,
                          cv=5,
                          n_jobs=n_jobs,
                          verbose=10,
                          scoring='roc_auc')

    search.fit(X_train, y_train.values.ravel())

    print(f'Наилучшие параметры модели {search.best_params_}')

    model = search.best_estimator_

    # Flatten the one-column target once and reuse it for both metrics.
    y_true = y_test.values.ravel()
    score = model.score(X_test, y_true)
    pred_proba = model.predict_proba(X_test)
    # Column 1 holds the probability of the positive class.
    roc_score = roc_auc_score(y_true, pred_proba[:, 1])

    print(f'Точность на тестовой выборке {score:.3f}')
    print(f'ROC_AUC на тестовой выборке {roc_score:.4f}')

    return model, estimator_name


Напишем функцию для финального обучения модели.

In [57]:
def final_train_and_test(model, name, X_train, X_test, y_train, y_test):
    """
    Fit the model on the full training split, evaluate it and save it.

    The model is refitted on ``X_train``/``y_train``. Its accuracy and ROC AUC
    on ``X_test``/``y_test`` are printed, and the fitted model is pickled to
    ``models/model_{name}_roc_auc_{roc_auc}_.pkl``. The ``models`` directory is
    created when it does not exist.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``score`` and ``predict_proba``.
    name : str
        Short model label used in the output file name.
    X_train, X_test : feature tables for fitting and evaluation.
    y_train, y_test : targets; one-column DataFrames, Series or arrays.

    Returns
    -------
    None
    """
    # np.ravel flattens a one-column DataFrame, a Series or an array alike,
    # so fitting and scoring receive targets of the same 1-D shape.
    y_train_flat = np.ravel(y_train)
    y_test_flat = np.ravel(y_test)

    model.fit(X_train, y_train_flat)

    score = model.score(X_test, y_test_flat)
    pred_proba = model.predict_proba(X_test)
    # Column 1 holds the probability of the positive class.
    roc_score = roc_auc_score(y_test_flat, pred_proba[:, 1])

    print(f'Финальная точность на тестовой выборке {score:.3f}')
    print(f'Финальный ROC_AUC на тестовой выборке {roc_score:.4f}')

    # Build the file name once so the written file and the message cannot diverge.
    file_name = f'model_{name}_roc_auc_{roc_score:.4f}_.pkl'

    # Create the output directory up front; otherwise a missing folder would
    # raise FileNotFoundError only after the whole refit has finished.
    models_dir = Path('models')
    models_dir.mkdir(parents=True, exist_ok=True)

    with open(models_dir / file_name, 'wb') as f:
        pickle.dump(model, f)

    print(f'Модель сохранена в файл {file_name}')

In [58]:
# Long-running: runs the whole grid search and logs every fold (verbose=10).
# NOTE(review): the saved output below lists learning_rate candidates, which the
# RandomForest grid used by this call does not contain — re-run to refresh it.
model, name = grid_search(X_train, y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV 1/5; 1/75] START learning_rate=0.1, max_depth=2, n_estimators=10............
[CV 1/5; 1/75] END learning_rate=0.1, max_depth=2, n_estimators=10;, score=0.881 total time=   0.0s
[CV 2/5; 1/75] START learning_rate=0.1, max_depth=2, n_estimators=10............
[CV 2/5; 1/75] END learning_rate=0.1, max_depth=2, n_estimators=10;, score=0.883 total time=   0.0s
[CV 3/5; 1/75] START learning_rate=0.1, max_depth=2, n_estimators=10............
[CV 3/5; 1/75] END learning_rate=0.1, max_depth=2, n_estimators=10;, score=0.879 total time=   0.0s
[CV 4/5; 1/75] START learning_rate=0.1, max_depth=2, n_estimators=10............
[CV 4/5; 1/75] END learning_rate=0.1, max_depth=2, n_estimators=10;, score=0.894 total time=   0.0s
[CV 5/5; 1/75] START learning_rate=0.1, max_depth=2, n_estimators=10............
[CV 5/5; 1/75] END learning_rate=0.1, max_depth=2, n_estimators=10;, score=0.879 total time=   0.0s
[CV 1/5; 2/75] START learning_rat

In [59]:
# Refit the tuned model on the training split, report hold-out metrics and pickle it.
# NOTE(review): the saved output below names a model_xgb file, while the preceding
# grid_search call uses the RandomForest configuration ('rfc') — re-run to refresh it.
final_train_and_test(model, name, X_train, X_test, y_train, y_test)

Финальная точность на тестовой выборке 0.911
Финальный ROC_AUC на тестовой выборке 0.8899
Модель сохранена в файл model_xgb_roc_auc_0.8899_.pkl
