In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt import space_eval
import optuna

In [2]:
data = pd.read_csv('_train_sem09 (1).csv')

In [3]:
data.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Разделение на признаки и целевую переменную
X = data.iloc[:, 1:]
y = data['Activity']

# Разделение на обучающий и тестовый наборы данных
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Логистическая регрессия
logistic_regression = LogisticRegression(max_iter=1000)

# Подбор гиперпараметров с помощью GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid_search = GridSearchCV(logistic_regression, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)
logistic_regression_best_params_grid = grid_search.best_params_

# Подбор гиперпараметров с помощью RandomizedSearchCV
param_dist = {'C': [0.001, 0.01, 0.1, 1, 10]}
randomized_search = RandomizedSearchCV(logistic_regression, param_distributions=param_dist, n_iter=10, cv=5)
randomized_search.fit(X_train_scaled, y_train)
logistic_regression_best_params_randomized = randomized_search.best_params_

# Случайный лес
random_forest = RandomForestClassifier()

# Подбор гиперпараметров с помощью Hyperopt
space = {'n_estimators': hp.choice('n_estimators', range(10, 200)),
         'max_depth': hp.choice('max_depth', range(1, 20)),
         'min_samples_split': hp.choice('min_samples_split', range(2, 10))}
trials = Trials()
def hyperopt_objective(params):
    random_forest.set_params(**params)
    score = cross_val_score(random_forest, X_train_scaled, y_train, cv=5, scoring='f1_macro').mean()
    return {'loss': -score, 'status': STATUS_OK}

best_hyperopt = fmin(hyperopt_objective, space, algo=tpe.suggest, max_evals=50, trials=trials)
random_forest_best_params_hyperopt = space_eval(space, best_hyperopt)

# Подбор гиперпараметров с помощью Optuna
def optuna_objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    
    random_forest.set_params(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)
    score = cross_val_score(random_forest, X_train_scaled, y_train, cv=5, scoring='f1_macro').mean()
    return score

study = optuna.create_study(direction='maximize')
study.optimize(optuna_objective, n_trials=50)
random_forest_best_params_optuna = study.best_params

# Обучение и оценка моделей с подобранными гиперпараметрами
logistic_regression_grid = LogisticRegression(**logistic_regression_best_params_grid, max_iter=1000)
logistic_regression_grid.fit(X_train_scaled, y_train)
y_pred_lr_grid = logistic_regression_grid.predict(X_test_scaled)
f1_lr_grid = f1_score(y_test, y_pred_lr_grid)

logistic_regression_randomized = LogisticRegression(**logistic_regression_best_params_randomized, max_iter=1000)
logistic_regression_randomized.fit(X_train_scaled, y_train)
y_pred_lr_randomized = logistic_regression_randomized.predict(X_test_scaled)
f1_lr_randomized = f1_score(y_test, y_pred_lr_randomized)

random_forest_hyperopt = RandomForestClassifier(**random_forest_best_params_hyperopt)
random_forest_hyperopt.fit(X_train_scaled, y_train)
y_pred_rf_hyperopt = random_forest_hyperopt.predict(X_test_scaled)
f1_rf_hyperopt = f1_score(y_test, y_pred_rf_hyperopt)

random_forest_optuna = RandomForestClassifier(**random_forest_best_params_optuna)
random_forest_optuna.fit(X_train_scaled, y_train)
y_pred_rf_optuna = random_forest_optuna.predict(X_test_scaled)
f1_rf_optuna = f1_score(y_test, y_pred_rf_optuna)

print("F1-score для логистической регрессии с GridSearchCV: ", f1_lr_grid)
print("F1-score для логистической регрессии с RandomizedSearchCV: ", f1_lr_randomized)
print("F1-score для случайного леса с Hyperopt: ", f1_rf_hyperopt)
print("F1-score для случайного леса с Optuna: ", f1_rf_optuna)



100%|██████████| 50/50 [06:35<00:00,  7.91s/trial, best loss: -0.7985861254681378]

[I 2023-06-26 14:21:21,536] A new study created in memory with name: no-name-7b73cfe2-df8c-4cc6-9d30-c97fe16abd5c





[I 2023-06-26 14:21:24,455] Trial 0 finished with value: 0.6985267568384472 and parameters: {'n_estimators': 101, 'max_depth': 2, 'min_samples_split': 8}. Best is trial 0 with value: 0.6985267568384472.
[I 2023-06-26 14:21:27,093] Trial 1 finished with value: 0.7083344928658394 and parameters: {'n_estimators': 69, 'max_depth': 3, 'min_samples_split': 5}. Best is trial 1 with value: 0.7083344928658394.
[I 2023-06-26 14:21:30,023] Trial 2 finished with value: 0.7125502321653444 and parameters: {'n_estimators': 75, 'max_depth': 3, 'min_samples_split': 5}. Best is trial 2 with value: 0.7125502321653444.
[I 2023-06-26 14:21:32,233] Trial 3 finished with value: 0.7141074022553751 and parameters: {'n_estimators': 55, 'max_depth': 3, 'min_samples_split': 4}. Best is trial 3 with value: 0.7141074022553751.
[I 2023-06-26 14:21:35,586] Trial 4 finished with value: 0.7897120010073114 and parameters: {'n_estimators': 39, 'max_depth': 12, 'min_samples_split': 5}. Best is trial 4 with value: 0.789712

F1-score для логистической регрессии с GridSearchCV:  0.8041237113402062
F1-score для логистической регрессии с RandomizedSearchCV:  0.8041237113402062
F1-score для случайного леса с Hyperopt:  0.8364055299539169
F1-score для случайного леса с Optuna:  0.8371559633027522
