# Сравнение моделей

## Загрузка набора данных и занижение тестового до 452 примеров по каждому классу

In [3]:
# Загрузка модели и наборов данных
import numpy as np
import pandas as pd

#тренировочный набор
train_df = pd.read_csv('Preprocessed_train_data_selection.csv', low_memory=True)
#тестовый набор
test_df = pd.read_csv('Preprocessed_test_data_selection.csv', low_memory=True)

y_train, x_train = train_df["Label"], train_df.drop(columns=["Label"]).copy()

y_test, x_test = test_df["Label"], test_df.drop(columns=["Label"]).copy()


In [4]:
# применение undersampling к тестовой выборке
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
#занижаю классы тестового набора до количество примеров самого наименьшего (452)

rus = RandomUnderSampler(random_state=42)

#автоматическое преобразование всех классов к количеству самого маленького
x_test_balanced, y_test_balanced = rus.fit_resample(x_test, y_test)


print(Counter(y_test_balanced))

Counter({0: 452, 1: 452, 2: 452, 3: 452, 4: 452, 5: 452, 6: 452, 7: 452, 8: 452, 9: 452, 10: 452})


## Сравнение моделей

### Инициализация моделей

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pickle
import time

#Загрузка созданных моделей с дефотными и подобранными гиперпараметрами
with open('model_rfc_default.pkl', 'rb') as file:
    rfc_def_model = pickle.load(file)

with open('model_rfc_optimized.pkl', 'rb') as file:
    rfc_opt_model = pickle.load(file)


all_models = {
    "logistic Regression": LogisticRegression(max_iter=1000, n_jobs=-1),
    "Decision tree": DecisionTreeClassifier(),
    "Random forest - default Params": rfc_def_model,
    "Random Forest - Optimized Params": rfc_opt_model,
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1),
    "LightGBM": LGBMClassifier(n_jobs=-1),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(n_jobs=-1),
    "Naive Bayes": GaussianNB()
}

results = []

for name, model in all_models.items():
    print(f'training {name} ...')
    
    start_time = time.time()
    model.fit(x_train, y_train)
    end_time = time.time()

    duration_time = end_time - start_time
    
    y_pred = model.predict(x_test_balanced)

    acc = accuracy_score(y_test_balanced, y_pred)
    f1 = f1_score(y_test_balanced, y_pred, average='macro')
    
    results.append([name, acc, f1, duration_time])
    print(f'{name} / accuracy: {acc:.4f} / f1-score: {f1:.4f}')
    print(classification_report(y_test_balanced, y_pred))
    

training logistic Regression ...
logistic Regression / accuracy: 0.4562 / f1-score: 0.3669
              precision    recall  f1-score   support

           0       0.22      0.63      0.33       452
           1       0.00      0.00      0.00       452
           2       0.45      0.89      0.60       452
           3       0.76      0.75      0.76       452
           4       0.85      0.87      0.86       452
           5       0.59      0.71      0.65       452
           6       0.73      0.17      0.27       452
           7       0.00      0.00      0.00       452
           8       0.40      1.00      0.57       452
           9       0.04      0.00      0.00       452
          10       0.00      0.00      0.00       452

    accuracy                           0.46      4972
   macro avg       0.37      0.46      0.37      4972
weighted avg       0.37      0.46      0.37      4972

training Decision tree ...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Decision tree / accuracy: 0.9366 / f1-score: 0.9382
              precision    recall  f1-score   support

           0       0.62      0.98      0.76       452
           1       0.99      0.63      0.77       452
           2       1.00      1.00      1.00       452
           3       1.00      0.99      1.00       452
           4       0.98      1.00      0.99       452
           5       1.00      0.99      0.99       452
           6       1.00      1.00      1.00       452
           7       1.00      1.00      1.00       452
           8       1.00      1.00      1.00       452
           9       0.93      0.83      0.88       452
          10       0.99      0.88      0.93       452

    accuracy                           0.94      4972
   macro avg       0.95      0.94      0.94      4972
weighted avg       0.95      0.94      0.94      4972

training Random forest - default Params ...
Random forest - default Params / accuracy: 0.9785 / f1-score: 0.9786
              precisio

Parameters: { "use_label_encoder" } are not used.



XGBoost / accuracy: 0.9397 / f1-score: 0.9407
              precision    recall  f1-score   support

           0       0.64      0.97      0.77       452
           1       0.99      0.64      0.78       452
           2       1.00      1.00      1.00       452
           3       1.00      1.00      1.00       452
           4       0.98      1.00      0.99       452
           5       1.00      0.99      1.00       452
           6       1.00      1.00      1.00       452
           7       1.00      1.00      1.00       452
           8       1.00      1.00      1.00       452
           9       0.93      0.84      0.88       452
          10       0.97      0.89      0.93       452

    accuracy                           0.94      4972
   macro avg       0.96      0.94      0.94      4972
weighted avg       0.96      0.94      0.94      4972

training LightGBM ...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011332 seconds.
You can set `fo

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[WinError 2] Не удается найти указанный файл
  File "C:\Users\MeizekiN\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\MeizekiN\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\MeizekiN\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\MeizekiN\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


KNN / accuracy: 0.8481 / f1-score: 0.8562
              precision    recall  f1-score   support

           0       0.43      0.96      0.59       452
           1       0.99      0.62      0.77       452
           2       0.77      0.98      0.87       452
           3       1.00      0.99      0.99       452
           4       0.96      0.77      0.85       452
           5       0.99      0.97      0.98       452
           6       0.99      0.96      0.97       452
           7       0.99      0.99      0.99       452
           8       0.98      0.96      0.97       452
           9       1.00      0.52      0.68       452
          10       0.99      0.61      0.76       452

    accuracy                           0.85      4972
   macro avg       0.92      0.85      0.86      4972
weighted avg       0.92      0.85      0.86      4972

training Naive Bayes ...
Naive Bayes / accuracy: 0.3580 / f1-score: 0.3329
              precision    recall  f1-score   support

           0   

## Сравнение моделей

In [13]:
result_models_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Macro_F1", "Time"])
print(result_models_df.sort_values(by="Macro_F1", ascending=False))

                              Model  Accuracy  Macro_F1         Time
2    Random forest - default Params  0.978479  0.978550     7.574879
5                           XGBoost  0.939662  0.940663     8.824879
4                 Gradient Boosting  0.939863  0.940515  1096.577016
3  Random Forest - Optimized Params  0.938656  0.938797    49.943717
1                     Decision tree  0.936645  0.938156     3.356018
8                               KNN  0.848150  0.856230     0.066000
6                          LightGBM  0.478882  0.490797     4.376038
7                               SVM  0.514481  0.488726  7336.722956
0               logistic Regression  0.456154  0.366942   125.983646
9                       Naive Bayes  0.358005  0.332884     0.163003


## Сохранение результата сравнения в файл

In [19]:
result_models_df.to_csv("Models_compare.csv")
print("Сравнение моделей успешно сохранено")

Сравнение моделей успешно сохранено
