# Интеграция. Итоговый проект.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Создание модели

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix

import time
import pickle

In [4]:
def corr_matrix(data, features):
    corr_matrix = np.round(data.loc[:, features].corr(), 2)
    corr_matrix[np.abs(corr_matrix) < 0.1] = 0
    
    plt.figure(figsize = (9,7))
    sns.heatmap(corr_matrix, annot=True, linewidths=.5, cmap='GnBu')
    plt.title('Correlation matrix')
    plt.show()

## EDA

Используются данные из соревнования kaggle https://www.kaggle.com/code/boopathymsse/income-prediction

Смысл задачи: по косвенным признакам спрогнозировать возмжоность дохода более 50 000.

In [7]:
DATA_PATH = "/content/drive/MyDrive/geekbrain/MLBuisness/lesson9/data/train.csv"

In [8]:
df = pd.read_csv(DATA_PATH)
df.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income_>50K
0,67,Private,366425,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,60,United-States,1
1,17,Private,244602,12th,8,Never-married,Other-service,Own-child,White,Male,0,0,15,United-States,0
2,31,Private,174201,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1


In [9]:
df.dropna(inplace=True)

In [10]:
X = df[['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']]
y = df['income_>50K']

In [11]:
num_features = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']

oe_features = ['education']
education_prio = ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th', \
                  'HS-grad', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', \
                  'Some-college', 'Bachelors', 'Masters', 'Doctorate']

ohe_features = ['workclass', 'marital-status', 'occupation', 'relationship', \
                'race', 'gender', 'native-country']

columns_with_nan = ['workclass', 'occupation', 'native-country']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

## Описание трансформатора(модификатора) данных с учетом особенностей столбцев

In [13]:
rs = RobustScaler()
oe = OrdinalEncoder(categories=[education_prio])
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

ct = make_column_transformer((rs, num_features),
                             (oe, oe_features),
                             (ohe, ohe_features),
                             remainder='passthrough')

## Создание экземпляра трансформатора

In [14]:
df_trans = ct.fit_transform(df)
df_trans = pd.DataFrame(df_trans)
df_trans



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,89
0,1.578947,1.557604,99999.0,0.0,4.0,15.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,16.0,1.0
1,-1.052632,0.549052,0.0,0.0,-5.0,7.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8.0,0.0
2,-0.315789,-0.033786,0.0,0.0,0.0,13.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0,1.0
3,1.105263,-0.563648,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0
4,-0.631579,-0.240368,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40722,0.789474,-0.904876,0.0,0.0,2.0,13.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0,1.0
40723,-0.947368,-0.510969,0.0,0.0,0.0,8.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9.0,0.0
40724,-0.368421,0.162803,0.0,0.0,3.6,12.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0,0.0
40725,0.473684,-0.665610,0.0,0.0,-1.0,13.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0,0.0


## Во время преобразований имена столбцов меняются, поэтому нам нужно выяснить, как были изменены имена

In [15]:
ct.transformers_

[('robustscaler',
  RobustScaler(),
  ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']),
 ('ordinalencoder',
  OrdinalEncoder(categories=[['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th',
                              '10th', '11th', '12th', 'HS-grad', 'Prof-school',
                              'Assoc-acdm', 'Assoc-voc', 'Some-college',
                              'Bachelors', 'Masters', 'Doctorate']]),
  ['education']),
 ('onehotencoder',
  OneHotEncoder(handle_unknown='ignore', sparse=False, sparse_output=False),
  ['workclass',
   'marital-status',
   'occupation',
   'relationship',
   'race',
   'gender',
   'native-country']),
 ('remainder', 'passthrough', [4, 14])]

In [16]:
ct.transformers_[0][1].get_feature_names_out()

array(['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'],
      dtype=object)

In [17]:
ct.transformers_[1][1]

In [18]:
ct.transformers_[2][1].get_feature_names_out()

array(['workclass_Federal-gov', 'workclass_Local-gov',
       'workclass_Private', 'workclass_Self-emp-inc',
       'workclass_Self-emp-not-inc', 'workclass_State-gov',
       'workclass_Without-pay', 'marital-status_Divorced',
       'marital-status_Married-AF-spouse',
       'marital-status_Married-civ-spouse',
       'marital-status_Married-spouse-absent',
       'marital-status_Never-married', 'marital-status_Separated',
       'marital-status_Widowed', 'occupation_Adm-clerical',
       'occupation_Armed-Forces', 'occupation_Craft-repair',
       'occupation_Exec-managerial', 'occupation_Farming-fishing',
       'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct',
       'occupation_Other-service', 'occupation_Priv-house-serv',
       'occupation_Prof-specialty', 'occupation_Protective-serv',
       'occupation_Sales', 'occupation_Tech-support',
       'occupation_Transport-moving', 'relationship_Husband',
       'relationship_Not-in-family', 'relationship_Other-relative

## Получилось около 90 столбцов. Полную коррекляционную матрицу увидеть и проанализировать нереально. Поэтому на экран выведу корреляцию с целевым значением только некоторых показателей.

In [19]:
corr_matrix = np.round(df_trans.loc[:, df_trans.columns].corr(), 2)
corr_matrix[np.abs(corr_matrix) < 0.1] = 0
corr_matrix = pd.DataFrame(corr_matrix)

# corr_matrix[89].loc[corr_matrix[89] != 0]

In [20]:
cin=pd.DataFrame(corr_matrix[89].loc[corr_matrix[89] != 0])
cin = cin.rename(columns={89: 'corr coeff'})

In [21]:
def get_names(ct, trans):
    # >> Original get_feature_names() method
    if trans == 'drop' or (
            hasattr(column, '__len__') and not len(column)):
        return []
    if trans == 'passthrough':
        if hasattr(ct, '_df_columns'):
            if ((not isinstance(column, slice))
                    and all(isinstance(col, str) for col in column)):
                return column
            else:
                return ct._df_columns[column]
        else:
            indices = np.arange(ct._n_features)
            return ['x%d' % i for i in indices[column]]
    if not hasattr(trans, 'get_feature_names'):
        if column is None:
            return []
        else:
            return [#name + "__" + 
                    f for f in column]

    return [#name + "__" + 
            f for f in trans.get_feature_names_out()]

feature_names = []

l_transformers = list(ct._iter(fitted=True))
cnt = 0
for name, trans, column, _ in l_transformers:
  _names = get_names(ct, trans)

  feature_names.extend(_names)
  cnt += 1 

fn = pd.DataFrame(feature_names)

In [22]:
cin['column name'] = fn[0]

In [23]:
cin

Unnamed: 0,corr coeff,column name
0,0.24,age
2,0.22,capital-gain
3,0.15,capital-loss
4,0.23,hours-per-week
5,0.26,education
8,-0.12,occupation
9,0.14,relationship
13,-0.13,x4
15,0.45,
17,-0.32,


## Очистка данных после корреляционного анализа

In [24]:
columns_to_drop = ['fnlwgt', 'educational_num', 'workclass', 'race', 'native_country', 'gender']
columns_ = [_.replace('-', '_') for _ in list(df.columns)]
columns = dict(zip(list(df.columns), columns_))
df.rename(columns=columns, inplace=True)
df_new = df.drop(columns=columns_to_drop)
df_new

Unnamed: 0,age,education,marital_status,occupation,relationship,capital_gain,capital_loss,hours_per_week,income_>50K
0,67,Doctorate,Divorced,Exec-managerial,Not-in-family,99999,0,60,1
1,17,12th,Never-married,Other-service,Own-child,0,0,15,0
2,31,Bachelors,Married-civ-spouse,Exec-managerial,Husband,0,0,40,1
3,58,7th-8th,Married-civ-spouse,Transport-moving,Husband,0,0,40,0
4,25,Some-college,Never-married,Other-service,Not-in-family,0,0,40,0
...,...,...,...,...,...,...,...,...,...
43952,52,Bachelors,Married-civ-spouse,Exec-managerial,Husband,0,0,50,1
43953,19,HS-grad,Never-married,Other-service,Own-child,0,0,40,0
43954,30,Some-college,Divorced,Sales,Not-in-family,0,0,58,0
43955,46,Bachelors,Never-married,Sales,Not-in-family,0,0,35,0


In [25]:
X_new = df_new.loc[:, df_new.columns[:-1]]
y_new = df_new['income_>50K']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, random_state=42, test_size=0.2)

## Попробуем применить GradientBoosting к откорректированным данным

In [27]:
ct = make_column_transformer((rs, ['age', 'capital_gain', 'capital_loss', 'hours_per_week']),
                             (oe, ['education']),
                             (ohe, ['marital_status', 'occupation', 'relationship']),
                             remainder='passthrough')

gbc = GradientBoostingClassifier(random_state=42)

pipe = make_pipeline(ct, gbc)
pipe.fit(X_train, y_train);



In [28]:
cv_score = cross_val_score(pipe, X_test, y_test, cv=3, scoring='f1_weighted')
cv_score.mean()



0.8472658491187609

## Подберем оптимальные гиперпараметры

In [29]:
params={'gradientboostingclassifier__learning_rate':[0.1, 0.05, 0.01],
        'gradientboostingclassifier__n_estimators':[200],
        'gradientboostingclassifier__min_samples_leaf':[1, 5, 10],
        'gradientboostingclassifier__max_depth':[3, 5, 7]
        }

grid = GridSearchCV(pipe,
                    param_grid=params,
                    cv=2,
                    refit=False,
                    verbose=27
                   )

t1 = time.time()
search = grid.fit(X_new, y_new)
t2 = time.time()

results = search.best_params_

print(f'Model tuning took {t2-t1} sec\n\nResults:')
for parameter in results:
    print(f'{parameter}: {results[parameter]}')

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV 1/2; 1/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 1/2; 1/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.864 total time=   5.9s
[CV 2/2; 1/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 2/2; 1/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.868 total time=   5.6s
[CV 1/2; 2/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 1/2; 2/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.864 total time=   5.1s
[CV 2/2; 2/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 2/2; 2/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.867 total time=   6.4s
[CV 1/2; 3/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 1/2; 3/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.864 total time=   5.0s
[CV 2/2; 3/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 2/2; 3/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.867 total time=   5.5s
[CV 1/2; 4/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 1/2; 4/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.865 total time=   8.7s
[CV 2/2; 4/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 2/2; 4/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.867 total time=   9.0s
[CV 1/2; 5/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 1/2; 5/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.866 total time=   7.8s
[CV 2/2; 5/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 2/2; 5/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.867 total time=   9.2s
[CV 1/2; 6/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 1/2; 6/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.865 total time=   7.9s
[CV 2/2; 6/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 2/2; 6/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.868 total time=   8.8s
[CV 1/2; 7/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 1/2; 7/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.861 total time=  12.1s
[CV 2/2; 7/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 2/2; 7/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.863 total time=  11.9s
[CV 1/2; 8/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 1/2; 8/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.863 total time=  10.9s
[CV 2/2; 8/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 2/2; 8/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.862 total time=  11.3s
[CV 1/2; 9/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 1/2; 9/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.864 total time=  11.8s
[CV 2/2; 9/27] START gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 2/2; 9/27] END gradientboostingclassifier__learning_rate=0.1, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.865 total time=  11.6s
[CV 1/2; 10/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 1/2; 10/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.860 total time=   4.9s
[CV 2/2; 10/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 2/2; 10/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.865 total time=   6.2s
[CV 1/2; 11/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 1/2; 11/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.860 total time=   5.4s
[CV 2/2; 11/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 2/2; 11/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.865 total time=   5.1s
[CV 1/2; 12/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 1/2; 12/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.861 total time=   6.4s
[CV 2/2; 12/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 2/2; 12/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.865 total time=   5.1s
[CV 1/2; 13/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 1/2; 13/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.865 total time=   9.2s
[CV 2/2; 13/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 2/2; 13/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.869 total time=   7.6s
[CV 1/2; 14/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 1/2; 14/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.866 total time=   9.1s
[CV 2/2; 14/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 2/2; 14/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.868 total time=   8.1s
[CV 1/2; 15/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 1/2; 15/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.866 total time=   8.5s
[CV 2/2; 15/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 2/2; 15/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.868 total time=   9.0s
[CV 1/2; 16/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 1/2; 16/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.864 total time=  11.0s
[CV 2/2; 16/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 2/2; 16/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.866 total time=  11.6s
[CV 1/2; 17/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 1/2; 17/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.865 total time=  11.7s
[CV 2/2; 17/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 2/2; 17/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.867 total time=  11.7s
[CV 1/2; 18/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 1/2; 18/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.866 total time=  11.6s
[CV 2/2; 18/27] START gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 2/2; 18/27] END gradientboostingclassifier__learning_rate=0.05, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.867 total time=  10.4s
[CV 1/2; 19/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 1/2; 19/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.844 total time=   6.1s
[CV 2/2; 19/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 2/2; 19/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.845 total time=   5.0s
[CV 1/2; 20/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 1/2; 20/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.844 total time=   6.1s
[CV 2/2; 20/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 2/2; 20/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.845 total time=   5.2s
[CV 1/2; 21/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 1/2; 21/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.844 total time=   5.0s
[CV 2/2; 21/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 2/2; 21/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=3, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.845 total time=   6.4s
[CV 1/2; 22/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 1/2; 22/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.854 total time=   7.4s
[CV 2/2; 22/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 2/2; 22/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.860 total time=   8.8s
[CV 1/2; 23/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 1/2; 23/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.855 total time=   7.8s
[CV 2/2; 23/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 2/2; 23/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.860 total time=   8.6s
[CV 1/2; 24/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 1/2; 24/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.854 total time=   8.4s
[CV 2/2; 24/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 2/2; 24/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=5, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.861 total time=   7.8s
[CV 1/2; 25/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 1/2; 25/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.860 total time=  11.3s
[CV 2/2; 25/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200




[CV 2/2; 25/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=1, gradientboostingclassifier__n_estimators=200;, score=0.865 total time=  11.4s
[CV 1/2; 26/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 1/2; 26/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.859 total time=   9.8s
[CV 2/2; 26/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200




[CV 2/2; 26/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=5, gradientboostingclassifier__n_estimators=200;, score=0.864 total time=  11.2s
[CV 1/2; 27/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 1/2; 27/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.859 total time=  11.1s
[CV 2/2; 27/27] START gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200




[CV 2/2; 27/27] END gradientboostingclassifier__learning_rate=0.01, gradientboostingclassifier__max_depth=7, gradientboostingclassifier__min_samples_leaf=10, gradientboostingclassifier__n_estimators=200;, score=0.864 total time=  11.1s
Model tuning took 456.02128171920776 sec

Results:
gradientboostingclassifier__learning_rate: 0.05
gradientboostingclassifier__max_depth: 5
gradientboostingclassifier__min_samples_leaf: 1
gradientboostingclassifier__n_estimators: 200


## Обучим модель для лучших значений гиперпараметров

In [30]:
gbc = GradientBoostingClassifier(random_state=42, learning_rate=0.05, n_estimators=200, min_samples_leaf=1, max_depth=5)

pipe = make_pipeline(ct, gbc)
pipe.fit(X_train, y_train)

cv_score = cross_val_score(pipe, X_test, y_test, cv=3, scoring='f1_weighted')
cv_score.mean()



0.8471090916003515

## Сохранение модели в pickle формате

In [32]:
filename = 'model.pkl'
pickle.dump(pipe, open(filename, 'wb'))