# Домашнее задание к занятию
## "Улучшение качества модели"
Взять Boston house-prices dataset (sklearn.datasets.load_boston) и сделать то же самое для задачи регрессии (попробовать разные алгоритмы, поподбирать параметры, вывести итоговое качество).

In [1]:
# Import the required libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
import warnings
# Suppress every warning for the rest of the session to keep cell output compact.
# NOTE(review): this presumably also hides scikit-learn's warnings about failed
# fits during the parameter searches further down — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
# Load the Boston house-prices data into a DataFrame by reading the CSV file
# referenced by the scikit-learn dataset bundle (the first line of the file is
# skipped so that the second line is used as the header).
# NOTE(review): load_boston is deprecated in scikit-learn 1.0 and removed in 1.2
# — confirm the pinned scikit-learn version before re-running.
from sklearn.datasets import load_boston

boston_csv_path = load_boston()['filename']
data = pd.read_csv(boston_csv_path, skiprows=1)
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [3]:
# Print the textual description that ships with the dataset bundle
boston_description = load_boston()['DESCR']
print(boston_description)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
# Summarise the DataFrame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


Пропуски в данных отсутствуют. Типы переменных совпадают с теми данными, которые они содержат.

In [5]:
# Separate the target (MEDV, the last column) from the features and hold out
# 30% of the rows as a test set; the fixed seed keeps the split repeatable.
y = data['MEDV']
feature_columns = data.columns[:-1]
X = data[feature_columns]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Baseline: ordinary linear regression fitted on the training split and
# scored (R^2) on the held-out test split.
model_1 = LinearRegression().fit(X_train, y_train)
model_1.score(X_test, y_test)

0.7112260057484923

Качество модели недостаточно хорошее. Улучшим качество работы модели посредством подбора гиперпараметров.

### GridSearchCV

In [7]:
# Candidate models and the hyperparameter grid that is searched for each one.
# Every entry holds a display name, an unfitted estimator and a parameter grid.
# NOTE(review): copy_X and n_jobs look like execution options rather than
# quality-related hyperparameters — consider dropping them to shrink the grids.
models = [
    {'name': 'LinearRegression',
     'model': LinearRegression(),
     'params': {'fit_intercept': (True, False),
                'normalize': (True, False),
                'copy_X': (True, False),
                'n_jobs': (1, -1)}},

    {'name': 'KNeighborsRegressor',
     'model': KNeighborsRegressor(),
     'params': {'n_neighbors': range(1, 15, 2),
                'weights': ('uniform', 'distance'),
                'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
                'n_jobs': (1, -1)}},

    {'name': 'DecisionTreeRegressor',
     'model': DecisionTreeRegressor(),
     'params': {'criterion': ('squared_error', 'friedman_mse', 'absolute_error', 'poisson'),
                'splitter': ('best', 'random'),
                'max_depth': range(1, 15, 2),
                # An integer min_samples_split must be >= 2. The grid used to
                # start at 1, which scikit-learn rejects, so those candidates
                # could never be fitted. Starting at 3 keeps every previously
                # valid value (3, 5, ..., 13) and drops only the invalid one.
                'min_samples_split': range(3, 15, 2)}}
]

In [8]:
# Run an exhaustive 10-fold cross-validated grid search for every candidate
# model and keep (model name, fitted search object) pairs.
res_1 = []
for spec in models:
    grid_search = GridSearchCV(spec['model'], spec['params'], cv=10)
    grid_search.fit(X_train, y_train)
    res_1.append((spec['name'], grid_search))

In [9]:
# Report the best cross-validated score and parameter set for each model
for model_name, fitted_search in res_1:
    print(model_name, fitted_search.best_score_, fitted_search.best_params_)

LinearRegression 0.6875346951141132 {'copy_X': True, 'fit_intercept': True, 'n_jobs': 1, 'normalize': True}
KNeighborsRegressor 0.5043081561484843 {'algorithm': 'auto', 'n_jobs': 1, 'n_neighbors': 9, 'weights': 'distance'}
DecisionTreeRegressor 0.793036968798384 {'criterion': 'friedman_mse', 'max_depth': 11, 'min_samples_split': 13, 'splitter': 'random'}


Лучшее качество показала модель DecisionTreeRegressor с параметрами 'criterion': 'friedman_mse', 'max_depth': 11, 'min_samples_split': 13, 'splitter': 'random'.

### RandomizedSearch

In [10]:
# Candidate models and the hyperparameter space sampled for each one by the
# randomized search. Every entry holds a display name, an unfitted estimator
# and a parameter space.
# NOTE(review): copy_X, n_jobs and warm_start look like execution options rather
# than quality-related hyperparameters — consider dropping them.
models = [
    {'name': 'LinearRegression',
     'model': LinearRegression(),
     'params': {'fit_intercept': (True, False),
                'normalize': (True, False),
                'copy_X': (True, False),
                'n_jobs': (1, -1)}},

    {'name': 'KNeighborsRegressor',
     'model': KNeighborsRegressor(),
     'params': {'n_neighbors': range(1, 15, 2),
                'weights': ('uniform', 'distance'),
                'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
                'n_jobs': (1, -1)}},

    {'name': 'DecisionTreeRegressor',
     'model': DecisionTreeRegressor(),
     'params': {'criterion': ('squared_error', 'friedman_mse', 'absolute_error', 'poisson'),
                'splitter': ('best', 'random'),
                'max_depth': range(1, 15, 2),
                # An integer min_samples_split must be >= 2. The space used to
                # start at 1, which scikit-learn rejects, so any sampled
                # candidate with that value could never be fitted.
                'min_samples_split': range(3, 15, 2)}},

    {'name': 'Ridge',
     'model': Ridge(),
     'params': {'fit_intercept': (True, False),
                'max_iter': range(10, 1000, 20),
                'copy_X': (True, False),
                # 'lbfgs' is intentionally absent: Ridge only accepts it
                # together with positive=True, which is not set here, so any
                # candidate using it would fail to fit.
                'solver': ('auto', 'svd', 'sparse_cg', 'saga', 'sag')}},

    {'name': 'Lasso',
     'model': Lasso(),
     'params': {'fit_intercept': (True, False),
                'max_iter': range(1, 1000, 20),
                'selection': ('cyclic', 'random'),
                'warm_start': (True, False)}},

    {'name': 'ElasticNet',
     'model': ElasticNet(),
     'params': {'fit_intercept': (True, False),
                'max_iter': range(1, 1000, 20),
                'selection': ('cyclic', 'random'),
                'positive': (True, False)}}
]

In [11]:
# Run a 10-fold cross-validated randomized search for every candidate model and
# keep (model name, fitted search object) pairs.
# random_state pins which parameter combinations are sampled, so the set of
# tried candidates is the same on every run (the same seed as the train/test
# split is used). Estimators with their own randomness may still vary.
res_2 = []
for spec in models:
    random_search = RandomizedSearchCV(
        spec['model'],
        spec['params'],
        cv=10,
        random_state=42,
    )
    random_search.fit(X_train, y_train)
    res_2.append((spec['name'], random_search))

In [12]:
# Report the best cross-validated score and parameter set for each model
for model_name, fitted_search in res_2:
    print(model_name, fitted_search.best_score_, fitted_search.best_params_)

LinearRegression 0.6875346951141132 {'normalize': True, 'n_jobs': -1, 'fit_intercept': True, 'copy_X': True}
KNeighborsRegressor 0.5043081561484843 {'weights': 'distance', 'n_neighbors': 9, 'n_jobs': 1, 'algorithm': 'auto'}
DecisionTreeRegressor 0.5996839850852138 {'splitter': 'best', 'min_samples_split': 5, 'max_depth': 3, 'criterion': 'friedman_mse'}
Ridge 0.68515170909815 {'solver': 'auto', 'max_iter': 170, 'fit_intercept': True, 'copy_X': False}
Lasso 0.6519916489997861 {'warm_start': True, 'selection': 'random', 'max_iter': 781, 'fit_intercept': True}
ElasticNet 0.6594494465737253 {'selection': 'random', 'positive': False, 'max_iter': 81, 'fit_intercept': True}


Лучшее качество показала модель LinearRegression с параметрами 'normalize': True, 'n_jobs': -1, 'fit_intercept': True, 'copy_X': True.