In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from warnings import filterwarnings
filterwarnings('ignore')

## Load data

In [2]:
df = pd.read_csv('Data_Preprocessed.csv')
df.shape

(283043, 33)

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['kilometer']), df['kilometer'],
                                                    test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((226434, 32), (56609, 32))

In [4]:
# В качестве baseline возьмем наивный классификатор, его качество будет около 76%
df['kilometer'].value_counts(normalize=True)

1    0.762121
0    0.237879
Name: kilometer, dtype: float64

In [5]:
metrics = pd.DataFrame({'Model':['KNN', 'LogReg', 'NN'],
                        'Accuracy':[0.86, 0.85, 0.86],
                        'Precision':[0.88, 0.87, 0.89],
                        'Recall':[0.95, 0.95, 0.95],
                        'F1':[0.91, 0.91, 0.91]})
display(metrics)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,KNN,0.86,0.88,0.95,0.91
1,LogReg,0.85,0.87,0.95,0.91
2,NN,0.86,0.89,0.95,0.91


## Random Forest

#### some things to pass hw

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
oob_errors = {"sqrt":[], "log2":[], None:[]}

for max_feature in ["sqrt", "log2", None]:
    for n_estimators in tqdm(np.arange(15, 176, 15)):
        model = RandomForestClassifier(oob_score=True, warm_start=True, random_state=42,
                                       max_features=max_feature, n_estimators=n_estimators)
        model.fit(X_train, y_train)
        oob_errors[max_feature].append(1 - round(model.oob_score_, 3))

100%|██████████| 11/11 [05:20<00:00, 29.18s/it]
100%|██████████| 11/11 [05:29<00:00, 29.98s/it]
100%|██████████| 11/11 [20:07<00:00, 109.74s/it]


In [30]:
def objective(trial):
    # params
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 10, 30)
    max_features = trial.suggest_categorical('max_features', ["sqrt", "log2", None])
    classifier = RandomForestClassifier(random_state=42, criterion=criterion, n_estimators=n_estimators, n_jobs=-1,
                                        max_depth=max_depth, max_features=max_features, class_weight='balanced')
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    return score

In [31]:
%%time
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=30*60, show_progress_bar=True)

[32m[I 2021-02-28 20:18:00,145][0m A new study created in memory with name: no-name-4996675d-8d81-4b0c-bd19-cf3b5f9cb143[0m


HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))

[32m[I 2021-02-28 20:18:56,508][0m Trial 0 finished with value: 0.9168938250467459 and parameters: {'criterion': 'gini', 'n_estimators': 149, 'max_depth': 24, 'max_features': None}. Best is trial 0 with value: 0.9168938250467459.[0m
[32m[I 2021-02-28 20:19:13,180][0m Trial 1 finished with value: 0.9179385642360129 and parameters: {'criterion': 'gini', 'n_estimators': 169, 'max_depth': 24, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.9179385642360129.[0m
[32m[I 2021-02-28 20:19:19,904][0m Trial 2 finished with value: 0.8907269835995537 and parameters: {'criterion': 'entropy', 'n_estimators': 102, 'max_depth': 11, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.9179385642360129.[0m
[32m[I 2021-02-28 20:19:34,743][0m Trial 3 finished with value: 0.9040788412303159 and parameters: {'criterion': 'gini', 'n_estimators': 181, 'max_depth': 16, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.9179385642360129.[0m
[32m[I 2021-02-28 20:19:41,507][0m Trial 4 fi

[32m[I 2021-02-28 20:27:23,688][0m Trial 34 finished with value: 0.9174147624229233 and parameters: {'criterion': 'entropy', 'n_estimators': 104, 'max_depth': 24, 'max_features': 'sqrt'}. Best is trial 31 with value: 0.9186846341765453.[0m
[32m[I 2021-02-28 20:27:41,376][0m Trial 35 finished with value: 0.9185511711853542 and parameters: {'criterion': 'gini', 'n_estimators': 165, 'max_depth': 27, 'max_features': 'sqrt'}. Best is trial 31 with value: 0.9186846341765453.[0m
[32m[I 2021-02-28 20:27:52,452][0m Trial 36 finished with value: 0.9184900486609802 and parameters: {'criterion': 'entropy', 'n_estimators': 100, 'max_depth': 29, 'max_features': 'sqrt'}. Best is trial 31 with value: 0.9186846341765453.[0m
[32m[I 2021-02-28 20:28:02,725][0m Trial 37 finished with value: 0.9182633072219019 and parameters: {'criterion': 'entropy', 'n_estimators': 89, 'max_depth': 25, 'max_features': 'sqrt'}. Best is trial 31 with value: 0.9186846341765453.[0m
[32m[I 2021-02-28 20:28:14,485]

[32m[I 2021-02-28 20:37:21,633][0m Trial 69 finished with value: 0.9192450286484665 and parameters: {'criterion': 'gini', 'n_estimators': 180, 'max_depth': 28, 'max_features': 'sqrt'}. Best is trial 68 with value: 0.919333093654503.[0m
[32m[I 2021-02-28 20:38:36,615][0m Trial 70 finished with value: 0.9173167277386867 and parameters: {'criterion': 'gini', 'n_estimators': 179, 'max_depth': 26, 'max_features': None}. Best is trial 68 with value: 0.919333093654503.[0m
[32m[I 2021-02-28 20:38:56,300][0m Trial 71 finished with value: 0.9190446231968722 and parameters: {'criterion': 'gini', 'n_estimators': 175, 'max_depth': 28, 'max_features': 'sqrt'}. Best is trial 68 with value: 0.919333093654503.[0m
[32m[I 2021-02-28 20:39:18,021][0m Trial 72 finished with value: 0.9192109815551213 and parameters: {'criterion': 'gini', 'n_estimators': 197, 'max_depth': 28, 'max_features': 'sqrt'}. Best is trial 68 with value: 0.919333093654503.[0m
[32m[I 2021-02-28 20:39:42,429][0m Trial 73 

KeyboardInterrupt: 

In [None]:
cross_val_score(classifier, X_train, y_train, n_jobs=-1, cv=5, scoring='f1_macro')

## Gradient Boosting

In [None]:
import xg