# Modele de regresie: Boston housing

Sîrbu Matei Dan, _grupa 10LF383_

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML

In [2]:
# Column names for the Boston housing file; the file is read with `names=header`,
# so these labels are assigned to the columns in this order.
header = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# The separator is a regex (one or more whitespace characters). It must be a raw
# string: '\s' in a normal literal is an invalid escape sequence and triggers a
# DeprecationWarning/SyntaxWarning on recent Python versions.
data_housing = pd.read_csv("./Datasets/Housing/housing.data", names=header, sep=r'\s+')
display(HTML('<i>Boston housing dataset overview:</i>'))
display(data_housing)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [3]:
import sklearn
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.linear_model import Lasso, BayesianRidge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Feature matrix: every column of the dataset except the MEDV target,
# kept in the same order as they appear in `header`.
feature_columns = [name for name in header if name != 'MEDV']
X = data_housing[feature_columns].to_numpy()
# Target vector: MEDV as a flat 1-D array.
y = data_housing['MEDV'].to_numpy()
# Each model cell appends its per-fold score table here; the report cells consume the list.
score_tables = []

In [4]:
# Candidate values for the Lasso penalty: 10000 evenly spaced points in [1e-5, 1].
param_candidates = {'alpha': np.linspace(start=1e-5, stop=1, num=10000, dtype=float)}

# Inner loop: randomized search (100 draws, 3-fold CV) picks alpha by negated MSE.
# random_state pins which candidates are drawn, so a Restart & Run All reproduces
# the same table instead of a different one on every execution.
param_search = RandomizedSearchCV(
    estimator=Lasso(),
    scoring='neg_mean_squared_error',
    param_distributions=param_candidates,
    cv=KFold(3),
    n_iter=100,
    random_state=42,
)
# Outer loop: 5-fold CV evaluates the tuned estimator on three error metrics,
# for both the train and the test part of every fold.
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)
# One row per outer fold: identifying columns first, then the cross_validate results.
score_table_1 = pd.DataFrame({'Model_name': np.repeat('Lasso', 5), 'Fold': np.arange(1, 6), 'Search_strategy': np.repeat('RandomizedSearchCV', 5)})
score_table_1 = pd.concat([score_table_1, pd.DataFrame(scores)], axis=1)
display(score_table_1)
score_tables.append(score_table_1)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,Lasso,1,RandomizedSearchCV,0.310655,0.000848,-3.035318,-3.843045,-15.064119,-29.79852,-2.538048,-2.749917
1,Lasso,2,RandomizedSearchCV,0.300539,0.000585,-4.086474,-3.588598,-33.261663,-27.136945,-2.481359,-2.573424
2,Lasso,3,RandomizedSearchCV,0.282761,0.000578,-5.185735,-3.460764,-53.226085,-26.244872,-3.494891,-2.375632
3,Lasso,4,RandomizedSearchCV,0.283419,0.000569,-5.436375,-2.751116,-77.188161,-13.71794,-3.063159,-2.171184
4,Lasso,5,RandomizedSearchCV,0.283513,0.000583,-4.385414,-3.360477,-27.992657,-23.781909,-4.28817,-2.289129


In [5]:
# Candidate values for BayesianRidge's alpha_1 and alpha_2 hyper-parameters:
# 10000 evenly spaced points in [1e-6, 1] for each.
param_candidates = {
    'alpha_1': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
    'alpha_2': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
}

# Inner loop: randomized search (default number of draws, 3-fold CV) selects the
# pair by negated MSE. random_state pins the sampled candidates so the results
# are reproducible across kernel restarts.
param_search = RandomizedSearchCV(
    estimator=BayesianRidge(),
    scoring='neg_mean_squared_error',
    param_distributions=param_candidates,
    cv=KFold(3),
    random_state=42,
)
# Outer loop: 5-fold CV evaluates the tuned estimator on three error metrics,
# for both the train and the test part of every fold.
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)
# One row per outer fold: identifying columns first, then the cross_validate results.
score_table_2 = pd.DataFrame({'Model_name': np.repeat('BayesianRidge', 5), 'Fold': np.arange(1, 6), 'Search_strategy': np.repeat('RandomizedSearchCV', 5)})
score_table_2 = pd.concat([score_table_2, pd.DataFrame(scores)], axis=1)
display(score_table_2)
score_tables.append(score_table_2)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,BayesianRidge,1,RandomizedSearchCV,0.072581,0.000663,-2.458269,-3.641981,-11.581165,-26.031032,-1.69235,-2.712542
1,BayesianRidge,2,RandomizedSearchCV,0.053896,0.000751,-3.639642,-3.349663,-24.756527,-23.30966,-2.58755,-2.419532
2,BayesianRidge,3,RandomizedSearchCV,0.057489,0.000629,-3.957304,-3.252824,-30.558525,-23.237184,-2.688478,-2.203321
3,BayesianRidge,4,RandomizedSearchCV,0.069999,0.000619,-5.510999,-2.722108,-79.670062,-13.159449,-2.768604,-2.163332
4,BayesianRidge,5,RandomizedSearchCV,0.0552,0.000611,-4.40917,-3.333501,-28.372947,-23.309659,-4.465688,-2.309659


In [6]:
# Candidate values for SVR: four regularisation strengths C and 10000 evenly
# spaced epsilon values in [1e-6, 1].
param_candidates = {
    'C': [0.001, 0.01, 0.1, 1],
    'epsilon': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
}

# Inner loop: randomized search (default number of draws, 3-fold CV) selects the
# pair by negated MSE; the SVR solver is capped at 1000 iterations.
# random_state pins the sampled candidates so the results are reproducible.
# NOTE(review): the features are passed to SVR unscaled — confirm this is intended.
param_search = RandomizedSearchCV(
    estimator=SVR(max_iter=1000),
    scoring='neg_mean_squared_error',
    param_distributions=param_candidates,
    cv=KFold(3),
    random_state=42,
)
# Outer loop: 5-fold CV evaluates the tuned estimator on three error metrics,
# for both the train and the test part of every fold.
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)
# One row per outer fold: identifying columns first, then the cross_validate results.
score_table_3 = pd.DataFrame({'Model_name': np.repeat('SVR', 5), 'Fold': np.arange(1, 6), 'Search_strategy': np.repeat('RandomizedSearchCV', 5)})
score_table_3 = pd.concat([score_table_3, pd.DataFrame(scores)], axis=1)
display(score_table_3)
score_tables.append(score_table_3)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,SVR,1,RandomizedSearchCV,0.136864,0.001457,-4.192632,-5.618406,-30.046633,-73.94625,-3.639465,-3.701679
1,SVR,2,RandomizedSearchCV,0.130222,0.001513,-6.629958,-5.615544,-94.382722,-68.928251,-3.786341,-3.712423
2,SVR,3,RandomizedSearchCV,0.131707,0.001599,-7.943106,-4.745354,-132.988945,-55.983782,-4.05678,-3.147712
3,SVR,4,RandomizedSearchCV,0.129353,0.001502,-6.041042,-5.108304,-82.238414,-61.378929,-4.404465,-3.095978
4,SVR,5,RandomizedSearchCV,0.128382,0.001331,-4.080288,-5.673193,-26.403907,-78.736578,-3.502766,-3.666738


In [7]:
# Candidate values for ElasticNet: penalty strength alpha and L1/L2 mix l1_ratio,
# each drawn from 10000 evenly spaced points in [1e-6, 1].
param_candidates = {
    'alpha': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
    'l1_ratio': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
}

# Inner loop: randomized search (default number of draws, 3-fold CV) selects the
# pair by negated MSE. random_state pins the sampled candidates so the results
# are reproducible across kernel restarts.
param_search = RandomizedSearchCV(
    estimator=ElasticNet(),
    scoring='neg_mean_squared_error',
    param_distributions=param_candidates,
    cv=KFold(3),
    random_state=42,
)
# Outer loop: 5-fold CV evaluates the tuned estimator on three error metrics,
# for both the train and the test part of every fold.
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)
# One row per outer fold: identifying columns first, then the cross_validate results.
score_table_4 = pd.DataFrame({'Model_name': np.repeat('ElasticNet', 5), 'Fold': np.arange(1, 6), 'Search_strategy': np.repeat('RandomizedSearchCV', 5)})
score_table_4 = pd.concat([score_table_4, pd.DataFrame(scores)], axis=1)
display(score_table_4)
score_tables.append(score_table_4)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,ElasticNet,1,RandomizedSearchCV,0.035721,0.0006,-3.009716,-3.83213,-14.876532,-29.615245,-2.489595,-2.750463
1,ElasticNet,2,RandomizedSearchCV,0.033324,0.000597,-3.895006,-3.484463,-29.888683,-25.590314,-2.340087,-2.565406
2,ElasticNet,3,RandomizedSearchCV,0.031472,0.000584,-5.119052,-3.439054,-51.861121,-26.013857,-3.506936,-2.415317
3,ElasticNet,4,RandomizedSearchCV,0.032813,0.000633,-5.389243,-2.756141,-74.478903,-13.804993,-3.075021,-2.193361
4,ElasticNet,5,RandomizedSearchCV,0.031587,0.000588,-3.425298,-3.711596,-19.87402,-27.300789,-3.177395,-2.560129


In [8]:
# scikit-learn 1.0 renamed the 'mse'/'mae' split criteria to
# 'squared_error'/'absolute_error' and 1.2 removed the old spellings, so use the
# names the installed version accepts.
if int(sklearn.__version__.split('.')[0]) >= 1:
    tree_criteria = ['squared_error', 'friedman_mse', 'absolute_error']
else:
    tree_criteria = ['mse', 'friedman_mse', 'mae']

# Candidate values: the three split criteria and tree depths 1..10.
param_candidates = {'criterion': tree_criteria, 'max_depth': np.linspace(start=1, stop=10, num=10, dtype=int)}

# Inner loop: randomized search (default number of draws, 3-fold CV) selects the
# pair by negated MSE. Both the search and the tree are seeded so the results
# are reproducible across kernel restarts.
param_search = RandomizedSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    scoring='neg_mean_squared_error',
    param_distributions=param_candidates,
    cv=KFold(3),
    random_state=42,
)
# Outer loop: 5-fold CV evaluates the tuned estimator on three error metrics,
# for both the train and the test part of every fold.
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)
# One row per outer fold: identifying columns first, then the cross_validate results.
score_table_5 = pd.DataFrame({'Model_name': np.repeat('DecisionTreeRegressor', 5), 'Fold': np.arange(1, 6), 'Search_strategy': np.repeat('RandomizedSearchCV', 5)})
score_table_5 = pd.concat([score_table_5, pd.DataFrame(scores)], axis=1)
display(score_table_5)
score_tables.append(score_table_5)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,DecisionTreeRegressor,1,RandomizedSearchCV,0.117792,0.000628,-2.642044,-0.777253,-11.907856,-1.438408,-2.066746,-0.469474
1,DecisionTreeRegressor,2,RandomizedSearchCV,0.079965,0.000614,-3.819336,-3.658659,-25.777347,-26.2465,-2.929687,-2.770313
2,DecisionTreeRegressor,3,RandomizedSearchCV,0.079865,0.000623,-3.902986,-1.261149,-26.147605,-2.98181,-2.899,-0.922222
3,DecisionTreeRegressor,4,RandomizedSearchCV,0.162969,0.000684,-4.227167,-2.590354,-53.326858,-10.992454,-2.613333,-1.993333
4,DecisionTreeRegressor,5,RandomizedSearchCV,0.175786,0.000734,-4.15099,-1.48321,-46.003787,-4.848605,-2.3,-1.0


# Generarea raportului

In [9]:
# The scorers used above report negated errors ('neg_*'). Flip the sign so the
# report shows plain errors, and drop the 'neg_' part from the column names.
# Columns are selected by prefix, so re-running this cell is a no-op for tables
# that were already converted (the original hard-coded names raised KeyError on
# a second run because the columns had been renamed).
for table in score_tables:
    negated_columns = [col for col in table.columns if col.startswith(('test_neg_', 'train_neg_'))]
    if not negated_columns:
        continue  # already converted by an earlier run of this cell
    table[negated_columns] = -table[negated_columns]
    # Tables are modified in place because the objects in score_tables are also
    # bound to the score_table_N names created in the model cells.
    table.rename(columns={col: col.replace('_neg_', '_', 1) for col in negated_columns}, inplace=True)

In [10]:
def highlight(s):
    """Return one CSS style string per element of a Series, for use with Styler.apply.

    Elements equal to the maximum get a red background, elements equal to the
    minimum get a green background, and everything else gets an empty style.
    When an element is both (all values equal), the maximum rule wins.

    Parameters
    ----------
    s : pandas.Series
        Values of one column.

    Returns
    -------
    list of str
        Style strings, in the same order as the elements of ``s``.
    """
    is_max = s == s.max()
    is_min = s == s.min()
    # Iterate over the values positionally. Indexing the masks with is_max[i]
    # is a label lookup and fails whenever the index is not 0..n-1.
    return [
        'background-color: red' if at_max
        else 'background-color: green' if at_min
        else ''
        for at_max, at_min in zip(is_max, is_min)
    ]

# Stack the per-model tables into a single frame with a fresh 0..n-1 index.
all_scores = pd.concat(score_tables, ignore_index=True)

# Error columns to colour; each column is styled independently by `highlight`.
highlighted_columns = [
    'test_mean_absolute_error',
    'train_mean_absolute_error',
    'test_mean_squared_error',
    'train_mean_squared_error',
    'test_median_absolute_error',
    'train_median_absolute_error',
]
all_scores_styled = all_scores.style.apply(highlight, subset=highlighted_columns)
display(all_scores_styled)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_mean_absolute_error,train_mean_absolute_error,test_mean_squared_error,train_mean_squared_error,test_median_absolute_error,train_median_absolute_error
0,Lasso,1,RandomizedSearchCV,0.310655,0.000848,3.035318,3.843045,15.064119,29.79852,2.538048,2.749917
1,Lasso,2,RandomizedSearchCV,0.300539,0.000585,4.086474,3.588598,33.261663,27.136945,2.481359,2.573424
2,Lasso,3,RandomizedSearchCV,0.282761,0.000578,5.185735,3.460764,53.226085,26.244872,3.494891,2.375632
3,Lasso,4,RandomizedSearchCV,0.283419,0.000569,5.436375,2.751116,77.188161,13.71794,3.063159,2.171184
4,Lasso,5,RandomizedSearchCV,0.283513,0.000583,4.385414,3.360477,27.992657,23.781909,4.28817,2.289129
5,BayesianRidge,1,RandomizedSearchCV,0.072581,0.000663,2.458269,3.641981,11.581165,26.031032,1.69235,2.712542
6,BayesianRidge,2,RandomizedSearchCV,0.053896,0.000751,3.639642,3.349663,24.756527,23.30966,2.58755,2.419532
7,BayesianRidge,3,RandomizedSearchCV,0.057489,0.000629,3.957304,3.252824,30.558525,23.237184,2.688478,2.203321
8,BayesianRidge,4,RandomizedSearchCV,0.069999,0.000619,5.510999,2.722108,79.670062,13.159449,2.768604,2.163332
9,BayesianRidge,5,RandomizedSearchCV,0.0552,0.000611,4.40917,3.333501,28.372947,23.309659,4.465688,2.309659


# Exportarea raportului

In [11]:
# Report body: a heading followed by the styled score table rendered as HTML.
html_str = """
<h1><u>Boston Housing</u></h1>
"""
# Styler.render() was deprecated in pandas 1.4 and removed in 2.0; to_html() is
# its replacement. Fall back to render() only on older pandas without to_html().
render_table = getattr(all_scores_styled, 'to_html', None) or all_scores_styled.render
html_str = html_str + render_table()
# The context manager closes the file even if the write fails; the explicit
# encoding keeps the output independent of the platform default.
with open("./Reports/housing_reg_report.html", "w", encoding="utf-8") as html_file:
    html_file.write(html_str)