# Взять boston house-prices datase (sklearn.datasets.load_boston) и сделать тоже самое для задачи регрессии (попробовать разные алгоритмы, поподбирать параметры, вывести итоговое качество)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import load_boston

data = load_boston()

In [3]:
print(data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
df = pd.DataFrame(data = np.c_[data['data'], data['target']],
                     columns = list(data['feature_names']) + ['MEDV'])

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_val, y_train, y_val = train_test_split(df.iloc[:,0:-1],df.iloc[:,-1], test_size=0.2, random_state=10)

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)

### (попробовать разные алгоритмы, поподбирать параметры)

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
from sklearn import metrics
metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [12]:
from sklearn.linear_model import LinearRegression

# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [13]:
params_lr = {
    'fit_intercept' : [False, True],
    'normalize' : [False, True]
    }

In [14]:
grid_lr = GridSearchCV(LinearRegression(n_jobs=-1), param_grid=params_lr, cv=5, scoring='neg_mean_absolute_error')
grid_lr.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LinearRegression(n_jobs=-1),
             param_grid={'fit_intercept': [False, True],
                         'normalize': [False, True]},
             scoring='neg_mean_absolute_error')

In [15]:
print(grid_lr.best_params_)
print(grid_lr.best_score_)

{'fit_intercept': True, 'normalize': False}
-3.228030667863712


In [16]:
from sklearn.tree import DecisionTreeRegressor

# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

In [17]:
params_dt = {
    'criterion' : ['mse', 'friedman_mse', 'mae'], 
    'splitter' : ['best', 'random'],
    'max_features' : ['auto', 'sqrt', 'log2'],
    'max_depth' : list(range(1,20))
}

In [18]:
grid_dt = GridSearchCV(DecisionTreeRegressor(), param_grid=params_dt, cv=5, scoring='neg_mean_absolute_error')
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'criterion': ['mse', 'friedman_mse', 'mae'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             scoring='neg_mean_absolute_error')

In [19]:
print(grid_dt.best_params_)
print(grid_dt.best_score_)

{'criterion': 'mae', 'max_depth': 5, 'max_features': 'auto', 'splitter': 'best'}
-2.7118765432098764


In [20]:
from sklearn.ensemble import RandomForestRegressor

# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [21]:
params_rf = {
    'n_estimators' : [1000,5000,10000],
    'max_depth' : list(range(5,15,5)),
    'max_features' : ['auto', 'sqrt', 'log2']
}

In [22]:
grid_rf = GridSearchCV(RandomForestRegressor(n_jobs=-1), param_grid=params_rf, cv=5, scoring='neg_mean_absolute_error')
grid_rf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(n_jobs=-1),
             param_grid={'max_depth': [5, 10],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [1000, 5000, 10000]},
             scoring='neg_mean_absolute_error')

In [23]:
print(grid_rf.best_params_)
print(grid_rf.best_score_)

{'max_depth': 10, 'max_features': 'log2', 'n_estimators': 1000}
-2.16800060585317


In [24]:
from sklearn.neighbors import KNeighborsRegressor

# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html

In [25]:
params_knn = {
    'n_neighbors' : list(range(3,15)),
    'weights' : ['uniform','distance'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [26]:
grid_knn = GridSearchCV(KNeighborsRegressor(n_jobs=-1), param_grid=params_knn, cv=5, scoring='neg_mean_absolute_error')
grid_knn.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsRegressor(n_jobs=-1),
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                         14],
                         'weights': ['uniform', 'distance']},
             scoring='neg_mean_absolute_error')

In [27]:
print(grid_knn.best_params_)
print(grid_knn.best_score_)

{'algorithm': 'auto', 'n_neighbors': 6, 'weights': 'distance'}
-2.6299206361848406


In [28]:
from sklearn.linear_model import SGDRegressor

# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html

In [47]:
params_sgd = {
    'penalty' : ['l2','l1','elasticnet'],
    'max_iter' : [50,100,300,700,1000,5000],
    'alpha' : [0.0001, 0.01, 0.1,0.2]
}

In [48]:
grid_sgd = GridSearchCV(SGDRegressor(), param_grid=params_sgd, cv=5, scoring='neg_mean_absolute_error')
grid_sgd.fit(X_train, y_train)



GridSearchCV(cv=5, estimator=SGDRegressor(),
             param_grid={'alpha': [0.0001, 0.01, 0.1, 0.2],
                         'max_iter': [50, 100, 300, 700, 1000, 5000],
                         'penalty': ['l2', 'l1', 'elasticnet']},
             scoring='neg_mean_absolute_error')

In [49]:
print(grid_sgd.best_params_)
print(grid_sgd.best_score_)

{'alpha': 0.1, 'max_iter': 50, 'penalty': 'l2'}
-3.1399413856323357


In [50]:
from sklearn.svm import SVR

# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

In [55]:
params_svr = {
    'kernel' : ['linear', 'poly', 'rbf', 'sigmoid']
}

In [56]:
grid_svr = GridSearchCV(SVR(), param_grid=params_svr, cv=5, scoring='neg_mean_absolute_error')
grid_svr.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             scoring='neg_mean_absolute_error')

In [57]:
print(grid_svr.best_params_)
print(grid_svr.best_score_)

{'kernel': 'linear'}
-3.0272902454368262


### (вывести итоговое качество)

In [70]:
print(grid_lr.estimator, '\t', grid_lr.score(X_val, y_val))
print(grid_dt.estimator, '\t', grid_dt.score(X_val, y_val))
print(grid_rf.estimator, '\t', grid_rf.score(X_val, y_val))
print(grid_knn.estimator, '\t', grid_knn.score(X_val, y_val))
print(grid_sgd.estimator, '\t', grid_sgd.score(X_val, y_val))
print(grid_svr.estimator, '\t', grid_svr.score(X_val, y_val))

LinearRegression(n_jobs=-1) 	 -4.061419182954706
DecisionTreeRegressor() 	 -3.3313725490196076
RandomForestRegressor(n_jobs=-1) 	 -2.8068578436642366
KNeighborsRegressor(n_jobs=-1) 	 -3.421233237828769
SGDRegressor() 	 -4.051379262038763
SVR() 	 -3.964104593959291
