In [1]:
import pandas as pd 
import numpy as np 
%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv('diabetes.csv')
df.head(3)

new_df=df.copy()
new_df['Age']=np.log(np.log(df['Age'])+2)
new_df['DiabetesPedigreeFunction']=np.log(df['DiabetesPedigreeFunction'])
new_df['Insulin']=np.log(df['Insulin']+20)
new_df['SkinThickness']=np.log(df['SkinThickness']+20)
new_df['Pregnancies']=np.log(df['Pregnancies']+1)


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
strat_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X=new_df.copy()
y=X.pop('Outcome')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Подбор параметров RandomForest

### Самостоятельный подбор параметров (из 5.2)

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score, precision_score,recall_score, make_scorer
from sklearn.model_selection import cross_val_score

models=[RandomForestClassifier(max_depth=20,max_features=7,n_estimators=140,random_state=42)]
scorers=[accuracy_score,precision_score,recall_score,f1_score]



for model in models:
    print(f"Для модели {model}:")
    for scorer in scorers:
        score=np.mean(cross_val_score(model,X,y,scoring=make_scorer(scorer),cv=strat_cv))
        print(f'{scorer.__name__} = {score}')

    print('---------------------------')

Для модели RandomForestClassifier(max_depth=20, max_features=7, n_estimators=140,
                       random_state=42):
accuracy_score = 0.7564892623716153
precision_score = 0.6666830983601171
recall_score = 0.6083158630328441
f1_score = 0.635573391973612
---------------------------


### Подбор параметров RandomSearch

In [25]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from time import time


param_list={
    'max_depth' : randint(2,30),
    'max_features' : randint(2,20),
    'n_estimators' : randint(20,300)

}
start_time=time()
model=RandomForestClassifier(random_state=42)
searched_model = RandomizedSearchCV(model, param_list, random_state=42, scoring='recall',n_iter=80)
searched_model.fit(X,y)

model= RandomForestClassifier(**searched_model.best_params_,random_state=42)
print(f"Для модели c RandomizedSearchCV:")
for scorer in scorers:
    score=np.mean(cross_val_score(model,X,y,scoring=make_scorer(scorer),cv=strat_cv))
    print(f'{scorer.__name__} = {score}')

end_time=time()
total_time=end_time-start_time

print('---------------------------')
print(f'Для оптимизированной модели время обучения:\n{total_time}')


Для модели c RandomizedSearchCV:
accuracy_score = 0.7460487225193108
precision_score = 0.6522797558739807
recall_score = 0.593361285814116
f1_score = 0.6204379639259123
---------------------------
Для оптимизированной модели время обучения:
139.09832191467285


Как видим, мы получили худший результат если перебираем параметры по отдельности, чем перебирая параметры совместно

### Подбор параметров Hyperopt

In [48]:
from hyperopt import fmin, tpe, hp, Trials
from hyperopt import rand

start_time=time()

space = {
    'max_depth': hp.choice('max_depth', range(2, 30)),
    'n_estimators': hp.choice('n_estimators', range(20, 300)),
    'max_features' : hp.choice('max_features', range(2,20))
}


def find_recall(params):
    searched_model = RandomForestClassifier(**params,random_state=42)
    recall = np.mean(cross_val_score(searched_model,X,y,scoring='recall',cv=strat_cv))
    return -recall  

trials=Trials()
best_params = fmin(fn=find_recall,
            space=space,
            algo=tpe.suggest,  
            max_evals=80,
            trials=trials)


model=RandomForestClassifier(**best_params,random_state=42)

print(f"Для модели c TPE Hyperopt:")
for scorer in scorers:
    score=np.mean(cross_val_score(model,X,y,scoring=make_scorer(scorer),cv=strat_cv))
    print(f'{scorer.__name__} = {score}')

end_time=time()
total_time=end_time-start_time

print('---------------------------')
print(f'Для Hyperopt время обучения:\n{total_time}')

100%|██████████| 80/80 [01:56<00:00,  1.45s/trial, best loss: -0.6380852550663871]
Для модели c TPE Hyperopt:
accuracy_score = 0.7603938545115015
precision_score = 0.6781515988037727
recall_score = 0.6009084556254367
f1_score = 0.6360021436723111
---------------------------
Для Hyperopt время обучения:
123.51328444480896


Вывод: подбор параметров по одному дает хороший результат, но тратит большое количество времени. Используя Randomsearchcv результат уудшился, зато мы получили его быстрее. И наконец hyperopt при том же работает примерно с той же скоростью что и Randomsearchcv, но сохраняет хороший результат как при переборе по одному

## XGBClassifier

### Перебор по параметру вручную

In [29]:
from xgboost import XGBClassifier
model=XGBClassifier(max_depth=20,max_features=7,n_estimators=35,random_state=42)

print(f"С подбором гиперпараметров по отдельности:")
for scorer in scorers:
    score=np.mean(cross_val_score(model,X,y,scoring=make_scorer(scorer),cv=strat_cv))
    print(f'{scorer.__name__} = {score}')

print('---------------------------')

С подбором гиперпараметров по отдельности:
accuracy_score = 0.743425855190561
precision_score = 0.6364606953892669
recall_score = 0.6153738644304683
f1_score = 0.6253315910183403
---------------------------


### С помощью RandomSearch

In [30]:
param_list={
    'max_depth' : randint(2,40),
    'max_features' : randint(1,20),
    'n_estimators' : randint(10,300)

}
start_time=time()
model=XGBClassifier(random_state=42)
searched_model = RandomizedSearchCV(model, param_list, random_state=42, scoring='recall',n_iter=80)
searched_model.fit(X,y)

model= XGBClassifier(**searched_model.best_params_,random_state=42)
print(f"Для модели c RandomizedSearchCV:")
for scorer in scorers:
    score=np.mean(cross_val_score(model,X,y,scoring=make_scorer(scorer),cv=strat_cv))
    print(f'{scorer.__name__} = {score}')

end_time=time()
total_time=end_time-start_time

print('---------------------------')
print(f'Для оптимизированной модели время обучения:\n{total_time}')


Для модели c RandomizedSearchCV:
accuracy_score = 0.7577879636703166
precision_score = 0.6613972905401477
recall_score = 0.6268343815513627
f1_score = 0.6433745635784605
---------------------------
Для оптимизированной модели время обучения:
39.435508251190186


In [49]:

model=XGBClassifier(random_state=42)
start_time=time()


space = {
    'max_depth': hp.choice('max_depth', list(range(2, 30))),
    'n_estimators': hp.choice('n_estimators', list(range(20, 300))),
}


def find_recall(params):
    current_params = params.copy()
    searched_model = XGBClassifier(**current_params,random_state=42)
    recall = np.mean(cross_val_score(searched_model,X,y,scoring='recall',cv=strat_cv))
    return -recall 

trials=Trials()
best_params = fmin(fn=find_recall,
            space=space,
            algo=tpe.suggest,  
            max_evals=80,
            trials=trials)


model=XGBClassifier(**best_params,random_state=42)

print(f"Для модели c TPE Hyperopt:")
for scorer in scorers:
    score=np.mean(cross_val_score(model,X,y,scoring=make_scorer(scorer),cv=strat_cv))
    print(f'{scorer.__name__} = {score}')

end_time=time()
total_time=end_time-start_time

print('---------------------------')
print(f'Для Hyperopt время обучения:\n{total_time}')

100%|██████████| 80/80 [00:33<00:00,  2.39trial/s, best loss: -0.6377358490566037]
Для модели c TPE Hyperopt:
accuracy_score = 0.7460572107630932
precision_score = 0.640738866396761
recall_score = 0.6229909154437456
f1_score = 0.6312529050375886
---------------------------
Для Hyperopt время обучения:
34.26271390914917


В случае с XGBClassifier наилучший результат показал Randomsearchcv