In [32]:
from sklearn.datasets import load_boston

In [33]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

In [34]:
from sklearn.ensemble import BaggingRegressor#Беггинг
from sklearn.ensemble import GradientBoostingRegressor#Boosting
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor

In [35]:
boston = load_boston()

In [36]:
# create X (features) and y (response)
X = boston.data
y = boston.target

In [22]:
from sklearn.model_selection import train_test_split

**Сделаем 20 разных train_test_split и посмотрим**

In [37]:
import numpy as np

In [38]:
#accuracy_score - Оценка качества модели для классификатора
#score - Оценка качества модели для регрессора
accuracy_count=0
accuracy_mean=0

for i in range(20):
    rand_state = np.random.randint(100) # Генерация случайного числа для random state до 100
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rand_state)
    model = KNeighborsRegressor()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = model.score(X_test, y_test) #Оценка качества модели
    print('№{} Accuracy: {}'.format(i, round(accuracy, 5)))
    accuracy_count+=accuracy
accuracy_mean=accuracy_count/20

print('Mean: '+' '+str(accuracy_mean))

№0 Accuracy: 0.56057
№1 Accuracy: 0.56913
№2 Accuracy: 0.51333
№3 Accuracy: 0.63483
№4 Accuracy: 0.68612
№5 Accuracy: 0.55535
№6 Accuracy: 0.48415
№7 Accuracy: 0.44404
№8 Accuracy: 0.54738
№9 Accuracy: 0.50557
№10 Accuracy: 0.40233
№11 Accuracy: 0.5284
№12 Accuracy: 0.4619
№13 Accuracy: 0.46164
№14 Accuracy: 0.58942
№15 Accuracy: 0.58704
№16 Accuracy: 0.46164
№17 Accuracy: 0.48415
№18 Accuracy: 0.5284
№19 Accuracy: 0.59723
Mean:  0.5301316538650295


**из-за большой волатильности делаем вывод о том, что модель очень зависит от обучающей выборки, что говорит о наличии большого стандартного отклонения (дисперсии), и подтверждает сложность выбранной модели, что может привести к переобучению (ложному предсказыванию)**

In [39]:
from sklearn.model_selection import cross_val_score

model = KNeighborsRegressor()
results = cross_val_score(model, X, y, cv=10, scoring="neg_mean_squared_error")

print(np.abs(results)**1/2)
print('Average result: {}'.format(round(np.mean(np.abs(results)**1/2), 3)))

[ 40.99031373  29.50787451 152.93625882  82.6134549   48.45563137
  62.27092157  20.750084    69.711508    16.098392    13.099756  ]
Average result: 53.643


**Подберем оптимальное количество соседей на основе cross_val_score**

In [40]:
best_param = (3, 0.0)

for n_neigh in range(3, 30):
    model = KNeighborsRegressor(n_neighbors=n_neigh)
    results = cross_val_score(model, X, y, cv=10)
    avg_average = np.mean(results)
    print('Neighbors count: {}\tAverage result: {}'.format(n_neigh, round(avg_average, 3)))
    
    if avg_average > 0.0:
        best_param = (n_neigh, avg_average)

print('\nBest n_neighbors is {}'.format(best_param[0]))


Neighbors count: 3	Average result: -4.669
Neighbors count: 4	Average result: -4.739
Neighbors count: 5	Average result: -4.949
Neighbors count: 6	Average result: -4.093
Neighbors count: 7	Average result: -3.699
Neighbors count: 8	Average result: -3.435
Neighbors count: 9	Average result: -3.263
Neighbors count: 10	Average result: -2.99
Neighbors count: 11	Average result: -2.816
Neighbors count: 12	Average result: -2.602
Neighbors count: 13	Average result: -2.557
Neighbors count: 14	Average result: -2.357
Neighbors count: 15	Average result: -2.273
Neighbors count: 16	Average result: -2.15
Neighbors count: 17	Average result: -2.067
Neighbors count: 18	Average result: -1.972
Neighbors count: 19	Average result: -1.902
Neighbors count: 20	Average result: -1.824
Neighbors count: 21	Average result: -1.786
Neighbors count: 22	Average result: -1.747
Neighbors count: 23	Average result: -1.706
Neighbors count: 24	Average result: -1.656
Neighbors count: 25	Average result: -1.613
Neighbors count: 26	

**RandomizedSearch**

In [41]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [42]:
models = []
names = ['LinearRegression','KNeighborsRegressor','BaggingRegressor','GradientBoostingRegressor','RandomForestRegressor','AdaBoostRegressor',]

models.append(LinearRegression(n_jobs=-1))
models.append(KNeighborsRegressor(n_jobs=-1))
models.append(BaggingRegressor())
models.append(GradientBoostingRegressor())
models.append(RandomForestRegressor())
models.append(AdaBoostRegressor())

In [43]:
params = {
    models[0]:{'normalize': ['True', 'False'], 'fit_intercept': ['True', 'False']},
    models[1]:{'n_neighbors': list(range(1, 31)),'weights': ['uniform', 'distance']},
    models[2]:{'n_estimators': list(range(1, 31))},
    models[3]:{'loss': ['ls','lad', 'huber'], 'learning_rate': [0, 0.1, 0.03, 0.5], 'max_depth': list(range(1, 30))},
    models[4]:{'n_estimators':list(range(10, 30)), 'max_depth': list(range(1, 31))},
    models[5]:{'learning_rate': list(np.arange(0.0,1.0,0.1)), 'loss':['linear', 'square', 'exponential']},
}


In [44]:
import warnings
warnings.filterwarnings('ignore')

for name, model in zip(names, models):
    rnd_search = RandomizedSearchCV(estimator=model, param_distributions=params[model], n_jobs=-1, cv=5)
    rnd_search.fit(X,y)
    
    print('_____________________________________________________')
    print('Классификатор: '+ str(rnd_search.best_estimator_))
    print('Лучшие параметры: '+ str(rnd_search.best_params_))
    print('Лучшая оценка: '+ str(rnd_search.best_score_))
    print('_____________________________________________________')

_____________________________________________________
Классификатор: LinearRegression(fit_intercept='True', n_jobs=-1, normalize='True')
Лучшие параметры: {'normalize': 'True', 'fit_intercept': 'True'}
Лучшая оценка: 0.35327592439588323
_____________________________________________________
_____________________________________________________
Классификатор: KNeighborsRegressor(n_jobs=-1, n_neighbors=11)
Лучшие параметры: {'weights': 'uniform', 'n_neighbors': 11}
Лучшая оценка: -0.30864691542964773
_____________________________________________________
_____________________________________________________
Классификатор: BaggingRegressor(n_estimators=12)
Лучшие параметры: {'n_estimators': 12}
Лучшая оценка: 0.6184500849645376
_____________________________________________________
_____________________________________________________
Классификатор: GradientBoostingRegressor(learning_rate=0.5, loss='lad', max_depth=1)
Лучшие параметры: {'max_depth': 1, 'loss': 'lad', 'learning_rate': 0.5}
Лу

In [45]:
params[models[3]]

{'loss': ['ls', 'lad', 'huber'],
 'learning_rate': [0, 0.1, 0.03, 0.5],
 'max_depth': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29]}

**GridSearchCV**

In [47]:
grid = GridSearchCV(GradientBoostingRegressor(), param_grid=params[models[3]], cv=10, n_jobs=-1)
grid.fit(X, y)

GridSearchCV(cv=10, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'learning_rate': [0, 0.1, 0.03, 0.5],
                         'loss': ['ls', 'lad', 'huber'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29]})

In [48]:
for k in grid.cv_results_:
    print(k, ":", grid.cv_results_[k][0])

mean_fit_time : 0.0006798982620239257
std_fit_time : 0.0001280066242736464
mean_score_time : 0.0
std_score_time : 0.0
param_learning_rate : 0
param_loss : ls
param_max_depth : 1
params : {'learning_rate': 0, 'loss': 'ls', 'max_depth': 1}
split0_test_score : nan
split1_test_score : nan
split2_test_score : nan
split3_test_score : nan
split4_test_score : nan
split5_test_score : nan
split6_test_score : nan
split7_test_score : nan
split8_test_score : nan
split9_test_score : nan
mean_test_score : nan
std_test_score : nan
rank_test_score : 348


In [49]:
grid = GridSearchCV(RandomForestRegressor(), param_grid=params[models[4]], cv=10, n_jobs=-1)
grid.fit(X, y)

GridSearchCV(cv=10, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30],
                         'n_estimators': [10, 11, 12, 13, 14, 15, 16, 17, 18,
                                          19, 20, 21, 22, 23, 24, 25, 26, 27,
                                          28, 29]})

In [50]:
for k in grid.cv_results_:
    print(k, ":", grid.cv_results_[k][0])

mean_fit_time : 0.03141987323760986
std_fit_time : 0.005278869027659118
mean_score_time : 0.0031005144119262695
std_score_time : 0.001933485766535094
param_max_depth : 1
param_n_estimators : 10
params : {'max_depth': 1, 'n_estimators': 10}
split0_test_score : 0.551279931354013
split1_test_score : 0.6316236844450525
split2_test_score : -0.48133407642618864
split3_test_score : 0.29453401708273275
split4_test_score : 0.3208956282522022
split5_test_score : -0.26879952564908227
split6_test_score : 0.03315607851066904
split7_test_score : -0.14086552280692577
split8_test_score : -3.5974015688477037
split9_test_score : -0.06311601444619575
mean_test_score : -0.2720027368531427
std_test_score : 1.1585718459536072
rank_test_score : 598


**OOB оценка и cross-validation**

In [61]:
regressor = RandomForestRegressor( n_estimators=20, max_depth=5, n_jobs=-1)
regressor.fit(X_train, y_train)
results_cross = cross_val_score(regressor, X, y, cv=10)

In [73]:
regressor_oob = RandomForestRegressor( oob_score=True,n_estimators=20, max_depth=5, n_jobs=-1)
regressor_oob.fit(X_train, y_train)

RandomForestRegressor(max_depth=5, n_estimators=20, n_jobs=-1, oob_score=True)

In [82]:
print('OOB'+ ' ' + str(regressor_oob.oob_score_))
print('Cross'+ ' ' + str(np.mean(results)))
print('Score'+ ' ' + str(regressor.score(X_test, y_test)))

OOB 0.8154516333238006
Cross 0.43042844787868245
Score 0.8924004833348245
