Возьмем boston house-prices datase (sklearn.datasets.load_boston) и сделаем тоже самое для задачи регрессии (попробуем разные алгоритмы, поподбирем параметры, выведем итоговое качество)

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import GridSearchCV

In [2]:
df = load_boston()

In [3]:
X = pd.DataFrame(df.data, columns=df.feature_names)
y = pd.DataFrame(df.target)
y.columns = ['TARGET']
data_all = pd.concat([X, y], axis=1)

In [5]:
data_all.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,TARGET
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7


In [56]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2) 

In [57]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_valid = sc.transform(X_valid)

In [58]:
X_train.shape[0] == y_train.shape[0]

True

### First model

In [46]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

In [49]:
max_depth = list(range(3, 31, 3))
min_samples_leaf = list(range(5, 31, 5))
params_grid = {'max_depth':max_depth, 'min_samples_leaf':min_samples_leaf}
rf_grid = GridSearchCV(rf, params_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

In [50]:
rf_grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [3, 6, 9, 12, 15, 18, 21, 24, 27, 30], 'min_samples_leaf': [5, 10, 15, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [52]:
print (rf_grid.best_score_)
print (rf_grid.best_params_)
rf_grit_best = rf_grid.best_estimator_

-16.846233318680397
{'max_depth': 12, 'min_samples_leaf': 5}


In [62]:
from sklearn.metrics import mean_squared_error
y_pred = rf_grit_best.predict(X_valid)
mse_rf = mean_squared_error(y_valid, y_pred)
mse_rf

9.37817023941288

### Second model

In [33]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()

In [34]:
max_depth = [3, 4, 5, 7, 9, 12, 15] 
min_samples_leaf = [1, 2, 3, 5, 7, 10, 15]
params_grid = {'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf}
tree_grid = GridSearchCV(tree_reg, params_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

In [35]:
tree_grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [3, 4, 5, 7, 9, 12, 15], 'min_samples_leaf': [1, 2, 3, 5, 7, 10, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [36]:
print (tree_grid.best_score_)
print (tree_grid.best_params_)
tree_grit_best = tree_grid.best_estimator_

-18.193858464177815
{'max_depth': 7, 'min_samples_leaf': 1}


In [63]:
y_pred = tree_grit_best.predict(X_valid)
mse_tree = mean_squared_error(y_valid, y_pred)
mse_tree

21.124556621771973

### Third model

In [37]:
from sklearn.neighbors import KNeighborsRegressor
knn_reg = KNeighborsRegressor(n_jobs=-1)

In [38]:
n_neighbours = [2, 3, 4, 5, 6, 7, 8, 9, 10]
weight = ['uniform', 'distance']
params_grid = {'n_neighbors': n_neighbours, 'weights': weight}
knn_grid = GridSearchCV(knn_reg, params_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

In [39]:
knn_grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [40]:
print (knn_grid.best_score_)
print (knn_grid.best_params_)
knn_grit_best = knn_grid.best_estimator_

-18.038387986616375
{'n_neighbors': 3, 'weights': 'distance'}


In [65]:
y_pred = knn_grit_best.predict(X_valid)
mse_knn = mean_squared_error(y_valid, y_pred)
mse_knn

3.8691500783614283

### Fourth model

In [41]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso()

In [42]:
alpha = [1, 0.1, 0.05, 0.01, 0.001]
params_grid = {'alpha': alpha}
lasso_grid = GridSearchCV(lasso_reg, params_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

In [43]:
lasso_grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'alpha': [1, 0.1, 0.05, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [44]:
print (lasso_grid.best_score_)
print (lasso_grid.best_params_)
lasso_grit_best = lasso_grid.best_estimator_

-25.368838657501346
{'alpha': 0.01}


In [66]:
y_pred = lasso_grit_best.predict(X_valid)
mse_lasso = mean_squared_error(y_valid, y_pred)
mse_lasso

21.035949454922903

### Fifth model

In [72]:
from sklearn.svm import SVR
svm = SVR()

In [105]:
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
C = [0.1, 1, 2, 3, 4, 5, 8, 10, 20, 40, 60, 100]
params_grid = {'kernel': kernel, 'C': C}
svm_grid = GridSearchCV(svm, params_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

In [106]:
svm_grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [0.01, 0.1, 1, 2, 3, 4, 5, 8, 10, 20, 40, 60, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [107]:
print (svm_grid.best_score_)
print (svm_grid.best_params_)
svm_grit_best = svm_grid.best_estimator_

-17.406838836684855
{'C': 40, 'kernel': 'rbf'}


In [108]:
y_pred = svm_grit_best.predict(X_valid)
mse_svm = mean_squared_error(y_valid, y_pred)
mse_svm

7.762522655451216

### Comparison results

In [109]:
print ('mse_rf ', mse_rf)
print ('mse_tree ', mse_tree)
print ('mse_knn ', mse_knn)
print ('mse_lasso ', mse_lasso)
print ('mse_svm ', mse_svm)

mse_rf  9.37817023941288
mse_tree  21.124556621771973
mse_knn  3.8691500783614283
mse_lasso  21.035949454922903
mse_svm  7.762522655451216
