### Выбор модели для классификации событий изменения уровня топлива

In [1]:
#imports
import pandas as pd
import numpy as np

from sklearn import model_selection, metrics
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
%run "constants-and-functions.ipynb"
%run "problems1_2-preparation.ipynb"

In [3]:
df_X

Unnamed: 0,dtime_start,dtime_end,delta_seconds,tachometer_start,tachometer_end,delta_fuellevel_abs,delta_fuellevel_sign,lps_abs
0,2020-01-09 10:05:26,2020-01-09 10:06:27,61.0,1248,1056,0.2,1.0,0.003279
1,2020-01-09 10:06:27,2020-01-09 10:07:27,60.0,1056,960,0.4,1.0,0.006667
2,2020-01-09 10:07:27,2020-01-09 10:08:27,60.0,960,864,0.1,1.0,0.001667
3,2020-01-09 10:08:27,2020-01-09 10:09:27,60.0,864,864,0.3,-1.0,0.005000
4,2020-01-09 10:09:27,2020-01-09 10:10:27,60.0,864,864,0.4,-1.0,0.006667
...,...,...,...,...,...,...,...,...
18364,2020-06-27 00:47:05,2020-06-27 00:48:05,60.0,832,832,0.4,1.0,0.006667
18365,2020-06-27 00:48:05,2020-06-27 00:49:05,60.0,832,832,0.2,1.0,0.003333
18366,2020-06-27 00:49:05,2020-06-27 01:15:14,1569.0,832,1408,0.2,-1.0,0.000127
18367,2020-06-27 01:15:14,2020-06-27 01:16:14,60.0,1408,928,0.2,1.0,0.003333


In [4]:
df_Y

Unnamed: 0,refuel
0,False
1,False
2,False
3,False
4,False
...,...
18364,False
18365,False
18366,False
18367,False


In [5]:
X = df_X[['lps_abs', 'delta_seconds', 'tachometer_start', 'tachometer_end']].to_numpy()
Y = df_Y.astype(int)['refuel'].to_numpy()

X

array([[3.27868852e-03, 6.10000000e+01, 1.24800000e+03, 1.05600000e+03],
       [6.66666667e-03, 6.00000000e+01, 1.05600000e+03, 9.60000000e+02],
       [1.66666667e-03, 6.00000000e+01, 9.60000000e+02, 8.64000000e+02],
       ...,
       [1.27469726e-04, 1.56900000e+03, 8.32000000e+02, 1.40800000e+03],
       [3.33333333e-03, 6.00000000e+01, 1.40800000e+03, 9.28000000e+02],
       [3.33333333e-03, 6.00000000e+01, 9.28000000e+02, 8.32000000e+02]])

In [6]:
Y

array([0, 0, 0, ..., 0, 0, 0])

Метрики

In [7]:
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

Разбиение на 5 блоков для кросс-валидации

In [8]:
kf = model_selection.KFold(n_splits=5, shuffle=True)

SVM

In [9]:
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', SVC())
])

Параметры SVM

In [10]:
grid = dict(clf__C=np.power(10.0, np.arange(-3, 2)),
            clf__kernel = ['linear', 'rbf'])

Обучение и выбор гиперпараметров

In [11]:
gs = model_selection.GridSearchCV(estimator=pipe,
                                  param_grid=grid,
                                  scoring=scoring_metrics,
                                  refit='recall',
                                  n_jobs=4,
                                  verbose=5)
gs.fit(X, Y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   16.4s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  1.5min finished


GridSearchCV(estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('clf', SVC())]),
             n_jobs=4,
             param_grid={'clf__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01]),
                         'clf__kernel': ['linear', 'rbf']},
             refit='recall', scoring=['accuracy', 'precision', 'recall', 'f1'],
             verbose=5)

In [12]:
results = pd.DataFrame(gs.cv_results_)

results = results[[x for x in results.columns if (x =='params' or 'mean' in x)]]

results

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1
0,4.246398,0.313021,"{'clf__C': 0.001, 'clf__kernel': 'linear'}",0.641845,0.663773,0.528547,0.581327
1,6.235203,0.850802,"{'clf__C': 0.001, 'clf__kernel': 'rbf'}",0.644023,0.690358,0.477265,0.555814
2,3.851319,0.290205,"{'clf__C': 0.01, 'clf__kernel': 'linear'}",0.640702,0.657805,0.538462,0.585376
3,6.143337,0.690287,"{'clf__C': 0.01, 'clf__kernel': 'rbf'}",0.644023,0.688833,0.480228,0.557599
4,3.872332,0.288404,"{'clf__C': 0.1, 'clf__kernel': 'linear'}",0.641083,0.657888,0.540513,0.586578
5,6.166754,0.737922,"{'clf__C': 0.1, 'clf__kernel': 'rbf'}",0.643969,0.685422,0.487635,0.561621
6,5.476666,0.267588,"{'clf__C': 1.0, 'clf__kernel': 'linear'}",0.640648,0.657192,0.540171,0.5861
7,6.671911,0.685284,"{'clf__C': 1.0, 'clf__kernel': 'rbf'}",0.645112,0.690802,0.480342,0.558243
8,12.627517,0.288404,"{'clf__C': 10.0, 'clf__kernel': 'linear'}",0.640811,0.657366,0.540399,0.58631
9,9.450078,0.656864,"{'clf__C': 10.0, 'clf__kernel': 'rbf'}",0.646637,0.695357,0.477493,0.557425


In [13]:
gs.best_estimator_

Pipeline(steps=[('scale', StandardScaler()),
                ('clf', SVC(C=0.1, kernel='linear'))])

SVM показывает плохие результаты, мб стоит будет попробовать c PCA

Градиентный бустинг над решающими деревьями

In [14]:
clfGBoost = GradientBoostingClassifier()

Параметры бустинга

In [15]:
grid = {'n_estimators':[50, 100, 150],
        'learning_rate':[10, 1, 0.1],
        'max_depth':[3, 5, 9]}

Обучение и выбор гиперпараметров


In [16]:
gs = model_selection.GridSearchCV(estimator=clfGBoost,
                                  param_grid=grid,
                                  scoring=scoring_metrics,
                                  refit='recall',
                                  n_jobs=4,
                                  verbose=5)
gs.fit(X, Y)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   20.2s
[Parallel(n_jobs=4)]: Done 135 out of 135 | elapsed:   49.9s finished


GridSearchCV(estimator=GradientBoostingClassifier(), n_jobs=4,
             param_grid={'learning_rate': [10, 1, 0.1], 'max_depth': [3, 5, 9],
                         'n_estimators': [50, 100, 150]},
             refit='recall', scoring=['accuracy', 'precision', 'recall', 'f1'],
             verbose=5)

In [17]:

results = pd.DataFrame(gs.cv_results_)

results = results[[x for x in results.columns if (x =='params' or 'mean' in x)]]

results

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1
0,0.349645,0.007205,"{'learning_rate': 10, 'max_depth': 3, 'n_estim...",0.455347,0.471922,0.563191,0.505067
1,0.791957,0.010008,"{'learning_rate': 10, 'max_depth': 3, 'n_estim...",0.455347,0.471922,0.563191,0.505067
2,1.040734,0.011008,"{'learning_rate': 10, 'max_depth': 3, 'n_estim...",0.455347,0.471922,0.563191,0.505067
3,0.530974,0.008806,"{'learning_rate': 10, 'max_depth': 5, 'n_estim...",0.537107,0.550729,0.57812,0.520008
4,1.064752,0.013609,"{'learning_rate': 10, 'max_depth': 5, 'n_estim...",0.537107,0.550729,0.57812,0.520008
5,1.583718,0.016015,"{'learning_rate': 10, 'max_depth': 5, 'n_estim...",0.51212,0.532797,0.668376,0.554148
6,1.410195,0.015611,"{'learning_rate': 10, 'max_depth': 9, 'n_estim...",0.50961,0.49417,0.609003,0.53576
7,2.683894,0.022619,"{'learning_rate': 10, 'max_depth': 9, 'n_estim...",0.519191,0.511103,0.599886,0.536874
8,4.165141,0.030625,"{'learning_rate': 10, 'max_depth': 9, 'n_estim...",0.519956,0.509223,0.591225,0.533278
9,0.325026,0.007405,"{'learning_rate': 1, 'max_depth': 3, 'n_estima...",0.634823,0.667213,0.481595,0.552371


In [18]:
gs.best_params_


{'learning_rate': 10, 'max_depth': 5, 'n_estimators': 150}