Для выполнения домашнего задания необходимо взять boston house-prices datase (sklearn.datasets.load_boston) 
и сделать тоже самое для задачи регрессии
(попробовать разные алгоритмы, поподбирать параметры, вывести итоговое качество).

In [25]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

In [26]:

import warnings
warnings.filterwarnings("ignore")

In [27]:
# set constants
RANDOM_STATE = 777
N_FOLDS = 5

In [42]:
df = pd.read_csv(load_boston()['filename'], skiprows=1)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [29]:
X, y = df['data'], df['target']

In [30]:
X.shape, y.shape

((506, 13), (506,))

In [31]:
# train and apply standartscaler to X
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [32]:
param_rf = dict(max_depth=[3,5,7,10,12,15,20], n_estimators=[5,10,15,20,25,50,100])

In [33]:
rand = RandomizedSearchCV(RandomForestRegressor(), param_rf, cv=10, n_iter=10, scoring="neg_mean_squared_error", random_state=RANDOM_STATE)
rand.fit(X, y)

RandomizedSearchCV(cv=10, estimator=RandomForestRegressor(),
                   param_distributions={'max_depth': [3, 5, 7, 10, 12, 15, 20],
                                        'n_estimators': [5, 10, 15, 20, 25, 50,
                                                         100]},
                   random_state=777, scoring='neg_mean_squared_error')

In [34]:
print(rand.cv_results_['mean_test_score'])

[-24.19282831 -22.03404465 -23.39478862 -21.19405882 -23.84578484
 -23.55170664 -22.96577535 -23.71971098 -23.70960343 -21.80342328]


In [35]:
print(rand.cv_results_['params'])

[{'n_estimators': 5, 'max_depth': 12}, {'n_estimators': 15, 'max_depth': 5}, {'n_estimators': 50, 'max_depth': 5}, {'n_estimators': 50, 'max_depth': 7}, {'n_estimators': 20, 'max_depth': 12}, {'n_estimators': 15, 'max_depth': 7}, {'n_estimators': 15, 'max_depth': 12}, {'n_estimators': 25, 'max_depth': 5}, {'n_estimators': 5, 'max_depth': 10}, {'n_estimators': 20, 'max_depth': 20}]


In [36]:
# examine the best model
print(rand.best_score_)
print(rand.best_params_)
print(rand.best_estimator_)

-21.1940588248608
{'n_estimators': 50, 'max_depth': 7}
RandomForestRegressor(max_depth=7, n_estimators=50)


In [None]:
Теперь мы можем использовать лучшие параметры из случайного поиска для детального поиска по сетке.

In [37]:
def get_score(X, y, model, grid_params, n_folds = N_FOLDS):
    ''' Function to train input model with grid_params '''
    # define grid with cross-validation
    gridsearch = GridSearchCV(model, grid_params, scoring='neg_mean_squared_error', cv=n_folds, n_jobs=-1)
    # fit grid
    gridsearch.fit(X, y)
    # get score of best model
    l_score = gridsearch.best_score_
    print(gridsearch.best_estimator_,'\nScore=',l_score)
    print('*'*50)
    return str(gridsearch.best_estimator_), l_score

In [38]:
test_score = get_score(X, y, LinearRegression(n_jobs=-1), {'normalize': [True,False]})

LinearRegression(n_jobs=-1) 
Score= -37.131807467698884
**************************************************


In [39]:
models = [(LinearRegression(n_jobs=-1), {'normalize': [True,False]}),
          (RandomForestRegressor(n_jobs=-1,random_state=RANDOM_STATE), {'max_depth': [11,12,13,15], 'n_estimators': [15,20,25,50], 'max_features': [0.2,0.5,0.7,0.8]}),
          (CatBoostRegressor(loss_function='RMSE',random_state=RANDOM_STATE, silent=True), {'depth': [5,6,7,8,10], 'learning_rate': [0.01, 0.05, 0.1], 'iterations': [20, 50, 100]})
         ]

In [40]:
models_score = {}
for mdl, params in models:
    m, s = get_score(X, y, mdl, params)
    models_score[m] = s

LinearRegression(n_jobs=-1) 
Score= -37.131807467698884
**************************************************
RandomForestRegressor(max_depth=13, max_features=0.5, n_estimators=50,
                      n_jobs=-1, random_state=777) 
Score= -19.481741166537574
**************************************************
<catboost.core.CatBoostRegressor object at 0x000001C76D0CF3D0> 
Score= -20.59303588619283
**************************************************


лучшая модель 

In [41]:
models_score

{'LinearRegression(n_jobs=-1)': -37.131807467698884,
 'RandomForestRegressor(max_depth=13, max_features=0.5, n_estimators=50,\n                      n_jobs=-1, random_state=777)': -19.481741166537574,
 '<catboost.core.CatBoostRegressor object at 0x000001C76D0CF3D0>': -20.59303588619283}