In [30]:
import pandas as pd 
import numpy as np

from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split 

In [4]:
df = pd.read_csv('_train_sem09__1_.zip')
print(df.shape)
X = df.drop(columns=['Activity'])
y = df['Activity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 1 )

(3751, 1777)


In [7]:
# Logistic regression
lr = linear_model.LogisticRegression(random_state= 1, solver = 'liblinear')
lr.fit(X_train, y_train)
print('Train f1 score {:.2f}'.format(metrics.f1_score(lr.predict(X_train), y_train)))
print('Test f1 score {:.2f}'.format(metrics.f1_score(lr.predict(X_test), y_test)))

Train f1 score 0.90
Test f1 score 0.78


In [8]:
# Logistic regression
rf = ensemble.RandomForestClassifier(random_state = 1)
rf.fit(X_train, y_train)
print('Train f1 score {:.2f}'.format(metrics.f1_score(rf.predict(X_train), y_train)))
print('Test f1 score {:.2f}'.format(metrics.f1_score(rf.predict(X_test), y_test)))

Train f1 score 1.00
Test f1 score 0.82


# Grid Search

##  Logistic Regression 

In [11]:
grid_params = {
    'penalty': ['l2', 'none'] ,       #тип регурялизации
    'solver': ['lbfgs', 'saga'],          #алгоритм оптимизации
    'C': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1]   
     }

grid_search = GridSearchCV(
    estimator=linear_model.LogisticRegression(
        random_state=42, #генератор случайных чисел
        max_iter=1000 #количество итераций на сходимость
    ), 
    param_grid=grid_params, 
    cv=5, 
    n_jobs = -1
)
grid_search.fit(X_train, y_train)
print('Train f1 score {:.2f}'.format(metrics.f1_score(grid_search.predict(X_train), y_train)))
print('Test f1 score {:.2f}'.format(metrics.f1_score(grid_search.predict(X_test), y_test)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search.best_params_))

accuracy на тестовом наборе: 0.78
f1_score на тестовом наборе: 0.80
Наилучшие значения гиперпараметров: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}


## Random Forest

In [15]:
grid_params = {
    'max_depth': list(np.linspace(1, 30, 10, dtype=int)),
    'min_samples_split': list(np.linspace(0.01, 1, 10, dtype=float))
    }


grid_search = GridSearchCV(
    estimator=ensemble.RandomForestClassifier(
        random_state = 1), 
    param_grid=grid_params, 
    cv=5, 
    n_jobs = -1
)
grid_search.fit(X_train, y_train)
print('Train f1 score {:.2f}'.format(metrics.f1_score(grid_search.predict(X_train), y_train)))
print('Test f1 score {:.2f}'.format(metrics.f1_score(grid_search.predict(X_test), y_test)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search.best_params_))

Train f1 score 0.92
Test f1 score 0.80
Наилучшие значения гиперпараметров: {'max_depth': 17, 'min_samples_split': 0.01}


# Randomized Search

## Logistic Regression

In [16]:
param_grid = {'penalty': ['l2', 'none'] ,
              'solver': ['lbfgs', 'sag'],
               'C': list(np.linspace(0.01, 1, 20, dtype=float))},
            
random_search = RandomizedSearchCV(
    estimator=linear_model.LogisticRegression(random_state=42, max_iter=1000), 
    param_distributions=param_grid, 
    cv=5, 
    n_iter = 50, 
    n_jobs = -1
)  
random_search.fit(X_train, y_train) 
print('Train f1 score {:.2f}'.format(metrics.f1_score(random_search.predict(X_train), y_train)))
print('Test f1 score {:.2f}'.format(metrics.f1_score(random_search.predict(X_test), y_test)))
print("Наилучшие значения гиперпараметров: {}".format(random_search.best_params_))



Train f1 score 0.86
Test f1 score 0.80
Наилучшие значения гиперпараметров: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.12}


## Random Forest

In [None]:
param_grid = {'min_samples_leaf': list(np.linspace(0.01, 1, 50, dtype=float)),
              'max_depth': list(np.linspace(1, 30, 50, dtype=int)),
              'criterion':['entropy','gini']
              }
            
random_search = RandomizedSearchCV(
    estimator=ensemble.RandomForestClassifier(
        random_state = 1), 
    param_distributions=param_grid, 
    cv=5,
    n_iter = 50, 
    n_jobs = -1
)  

random_search.fit(X_train, y_train) 
print("accuracy на тестовом наборе: {:.2f}".format(random_search.score(X_test, y_test)))
y_test_pred = random_search.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search.best_params_))

# Hyperopt

In [60]:
from sklearn.model_selection import cross_val_score
import hyperopt
import imp
imp.reload(hyperopt)
from hyperopt import hp, fmin, tpe, Trials

## Logistic Regression

## Random Forest

In [64]:
# зададим пространство поиска гиперпараметров
space={'n_estimators': hp.quniform('n_estimators', 100, 200, 1),
       'max_depth' : hp.quniform('max_depth', 5, 40, 1)
      }

# зафксируем random_state
random_state = 1
def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    # функция получает комбинацию гиперпараметров в "params"
    params = {'n_estimators': int(params['n_estimators']), 
              'max_depth': int(params['max_depth'])
              }
  
    # используем эту комбинацию для построения модели
    model = ensemble.RandomForestClassifier(**params, random_state=random_state)

    # обучаем модель
    model.fit(X, y)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring="f1", n_jobs=-1).mean()
    
    # метрику необходимо минимизировать, поэтому ставим знак минус
    return -score

# начинаем подбор гиперпараметров
trials = Trials() # используется для логирования результатов

best=fmin(hyperopt_rf, # наша функция 
          space=space, # пространство гиперпараметров
          algo=tpe.suggest, # алгоритм оптимизации, установлен по умолчанию, задавать необязательно
          max_evals=20, # максимальное количество итераций
          trials=trials, # логирование результатов
          rstate=np.random.RandomState(random_state)# фиксируем для повторяемости результата
         )
print("Наилучшие значения гиперпараметров {}".format(best))

# рассчитаем точность для тестовой выборки
model = ensemble.RandomForestClassifier(
    random_state=random_state, 
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth'])
)
model.fit(X_train, y_train)

print('Train f1 score {:.2f}'.format(metrics.f1_score(model.predict(X_train), y_train)))
print('Test f1 score {:.2f}'.format(metrics.f1_score(model.predict(X_test), y_test)))



100%|██████████| 40/40 [07:06<00:00, 10.67s/trial, best loss: -0.8275894426137052]
Наилучшие значения гиперпараметров {'max_depth': 17.0, 'n_estimators': 160.0}
Train f1 score 1.00
Test f1 score 0.83


# Optuna

In [17]:
import optuna

## Logistic Regression

In [58]:
def objective(trial):
      C = trial.suggest_float('C', 0.0001, 10000)
      clf = linear_model.LogisticRegression(C= C,
                                            penalty= 'l2',
                                            solver= 'sag',
                                            random_state=1)
      return model_selection.cross_val_score(clf, X_train, y_train, 
           n_jobs=-1, cv=5, scoring = 'f1').mean()
      
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)


# рассчитаем точность для тестовой выборки
model = linear_model.LogisticRegression(**study.best_params,random_state=1)
model.fit(X_train, y_train)
print('Train f1 score {:.2f}'.format(metrics.f1_score(model.predict(X_train), y_train)))
print('Test f1 score {:.2f}'.format(metrics.f1_score(model.predict(X_test), y_test)))
# выводим результаты на обучающей выборке
print("Наилучшие значения гиперпараметров {}".format(study.best_params))

[32m[I 2022-05-16 19:01:47,667][0m A new study created in memory with name: no-name-f65b838e-c83c-4593-839f-eb875fded475[0m
[32m[I 2022-05-16 19:01:54,178][0m Trial 0 finished with value: 0.7727286003913194 and parameters: {'C': 5989.1825656620895}. Best is trial 0 with value: 0.7727286003913194.[0m
[32m[I 2022-05-16 19:02:03,569][0m Trial 1 finished with value: 0.7727286003913194 and parameters: {'C': 8430.476469645715}. Best is trial 0 with value: 0.7727286003913194.[0m
[32m[I 2022-05-16 19:02:13,504][0m Trial 2 finished with value: 0.7727286003913194 and parameters: {'C': 3649.1895053594553}. Best is trial 0 with value: 0.7727286003913194.[0m
[32m[I 2022-05-16 19:02:23,332][0m Trial 3 finished with value: 0.7727286003913194 and parameters: {'C': 3774.4769629131306}. Best is trial 0 with value: 0.7727286003913194.[0m
[32m[I 2022-05-16 19:02:33,379][0m Trial 4 finished with value: 0.7727286003913194 and parameters: {'C': 1861.6214088715903}. Best is trial 0 with value

Train f1 score 0.91
Test f1 score 0.74
Наилучшие значения гиперпараметров {'C': 5989.1825656620895}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Random Forest

In [49]:
def objective(trial):
      n_estimators = trial.suggest_int('n_estimators', 100, 300)
      max_depth = trial.suggest_int('max_depth', 2, 40, 1)
      clf = ensemble.RandomForestClassifier(n_estimators=n_estimators, 
                                            max_depth=max_depth,
                                            random_state=1)
      return model_selection.cross_val_score(clf, X_train, y_train, 
           n_jobs=-1, cv=5, scoring = 'f1').mean()
      
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)


[32m[I 2022-05-16 18:47:24,424][0m A new study created in memory with name: no-name-c2339c55-7ea6-4481-8057-575d439b19e6[0m
[32m[I 2022-05-16 18:47:31,975][0m Trial 0 finished with value: 0.8166441481000823 and parameters: {'n_estimators': 261, 'max_depth': 13}. Best is trial 0 with value: 0.8166441481000823.[0m
[32m[I 2022-05-16 18:47:40,662][0m Trial 1 finished with value: 0.8219645280931633 and parameters: {'n_estimators': 269, 'max_depth': 30}. Best is trial 1 with value: 0.8219645280931633.[0m
[32m[I 2022-05-16 18:47:46,725][0m Trial 2 finished with value: 0.819027746933522 and parameters: {'n_estimators': 184, 'max_depth': 14}. Best is trial 1 with value: 0.8219645280931633.[0m
[32m[I 2022-05-16 18:47:54,906][0m Trial 3 finished with value: 0.8217119082049751 and parameters: {'n_estimators': 261, 'max_depth': 23}. Best is trial 1 with value: 0.8219645280931633.[0m
[32m[I 2022-05-16 18:47:58,621][0m Trial 4 finished with value: 0.817334997278968 and parameters: {'

In [50]:
# рассчитаем точность для тестовой выборки
model = ensemble.RandomForestClassifier(**study.best_params,random_state=1)
model.fit(X_train, y_train)
print('Train f1 score {:.2f}'.format(metrics.f1_score(model.predict(X_train), y_train)))
print('Test f1 score {:.2f}'.format(metrics.f1_score(model.predict(X_test), y_test)))
# выводим результаты на обучающей выборке
print("Наилучшие значения гиперпараметров {}".format(study.best_params))

Train f1 score 1.00
Test f1 score 0.82
Наилучшие значения гиперпараметров {'n_estimators': 269, 'max_depth': 30}


Используя метод подбора гиперпараметров hyperopt и случайный лес, нам удалось улучшить значение целевой метрики до 0.83.