# Modele de regresie: Boston housing

Sîrbu Matei Dan, _grupa 10LF383_

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML

In [2]:
# Column names applied to the data file via `names=`; MEDV is the column
# later used as the regression target.
header = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# The separator is a regex (one or more whitespace characters). It is written
# as a raw string so the backslash is not parsed as a Python escape sequence,
# which emits an invalid-escape warning on current Python versions.
data_housing = pd.read_csv("./Datasets/Housing/housing.data", names=header, sep=r'\s+')
display(HTML('<i>Boston housing dataset overview:</i>'))
display(data_housing)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [3]:
import sklearn
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.linear_model import Lasso, BayesianRidge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Predictor columns, in the order they appear in the dataset; every column
# except MEDV.
feature_columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]
# Feature matrix (one row per sample) and the flat target vector.
X = data_housing[feature_columns].to_numpy()
y = data_housing[['MEDV']].to_numpy().reshape(-1)
# Per-model score tables are collected here by the cells that follow.
score_tables = list()

In [4]:
# Candidate regularisation strengths for Lasso: 10000 evenly spaced values
# between 1e-5 and 1.
param_candidates = {'alpha': np.linspace(start=1e-5, stop=1, num=10000, dtype=float)}

# Nested cross-validation: the inner 3-fold randomized search selects alpha by
# (negated) MSE, the outer 5-fold cross_validate scores the tuned model.
# random_state fixes which candidates are sampled, so a re-run of this cell
# reproduces the same table.
param_search = RandomizedSearchCV(estimator=Lasso(), scoring='neg_mean_squared_error', param_distributions=param_candidates, cv=KFold(3), n_iter=100, random_state=42)
scores = cross_validate(param_search, X, y, cv=KFold(5), scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)

# One row per outer fold. Fold numbers are derived from the number of result
# rows so they stay in step with the outer CV setting.
fold_scores = pd.DataFrame(scores)
score_table_1 = pd.DataFrame({'Model_name': 'Lasso', 'Fold': np.arange(1, len(fold_scores) + 1), 'Search_strategy': 'RandomizedSearchCV'})
score_table_1 = pd.concat([score_table_1, fold_scores], axis=1)
display(score_table_1)
score_tables.append(score_table_1)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,Lasso,1,RandomizedSearchCV,0.322133,0.000626,-3.040944,-3.845501,-15.106689,-29.839852,-2.551733,-2.76563
1,Lasso,2,RandomizedSearchCV,0.304299,0.000583,-4.08918,-3.590369,-33.312672,-27.163298,-2.48597,-2.579176
2,Lasso,3,RandomizedSearchCV,0.297352,0.000583,-5.18441,-3.460317,-53.198864,-26.240197,-3.495315,-2.37455
3,Lasso,4,RandomizedSearchCV,0.371675,0.000795,-5.444537,-2.750418,-77.446727,-13.707906,-3.06858,-2.192848
4,Lasso,5,RandomizedSearchCV,0.321185,0.000619,-4.370471,-3.363105,-27.811092,-23.798142,-4.255982,-2.273063


In [5]:
# Candidate values for BayesianRidge's alpha_1 and alpha_2 hyper-parameters:
# 10000 evenly spaced values between 1e-6 and 1 for each.
param_candidates = {'alpha_1': np.linspace(start=1e-6, stop=1, num=10000, dtype=float), 'alpha_2': np.linspace(start=1e-6, stop=1, num=10000, dtype=float)}

# Nested cross-validation: the inner 3-fold randomized search selects the
# hyper-parameters by (negated) MSE, the outer 5-fold cross_validate scores the
# tuned model. random_state fixes which candidates are sampled, so a re-run of
# this cell reproduces the same table.
param_search = RandomizedSearchCV(estimator=BayesianRidge(), scoring='neg_mean_squared_error', param_distributions=param_candidates, cv=KFold(3), random_state=42)
scores = cross_validate(param_search, X, y, cv=KFold(5), scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)

# One row per outer fold. Fold numbers are derived from the number of result
# rows so they stay in step with the outer CV setting.
fold_scores = pd.DataFrame(scores)
score_table_2 = pd.DataFrame({'Model_name': 'BayesianRidge', 'Fold': np.arange(1, len(fold_scores) + 1), 'Search_strategy': 'RandomizedSearchCV'})
score_table_2 = pd.concat([score_table_2, fold_scores], axis=1)
display(score_table_2)
score_tables.append(score_table_2)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,BayesianRidge,1,RandomizedSearchCV,0.07696,0.000841,-2.458265,-3.641977,-11.58112,-26.030987,-1.692319,-2.71255
1,BayesianRidge,2,RandomizedSearchCV,0.056544,0.000625,-3.639594,-3.349645,-24.755839,-23.309407,-2.587417,-2.419693
2,BayesianRidge,3,RandomizedSearchCV,0.056992,0.000617,-3.957318,-3.252826,-30.558774,-23.237216,-2.688393,-2.203323
3,BayesianRidge,4,RandomizedSearchCV,0.070016,0.000858,-5.510968,-2.722113,-79.668801,-13.159652,-2.768886,-2.162885
4,BayesianRidge,5,RandomizedSearchCV,0.06037,0.000615,-4.409364,-3.333445,-28.375338,-23.309379,-4.466453,-2.309457


In [6]:
# Search space for SVR: four values of C on a log scale and 10000 evenly
# spaced epsilon values between 1e-6 and 1.
param_candidates = {'C': [0.001, 0.01, 0.1, 1], 'epsilon': np.linspace(start=1e-6, stop=1, num=10000, dtype=float)}

# Nested cross-validation: the inner 3-fold randomized search selects the
# hyper-parameters by (negated) MSE, the outer 5-fold cross_validate scores the
# tuned model. random_state fixes which candidates are sampled, so a re-run of
# this cell reproduces the same table. SVR is built with max_iter=1000.
param_search = RandomizedSearchCV(estimator=SVR(max_iter=1000), scoring='neg_mean_squared_error', param_distributions=param_candidates, cv=KFold(3), random_state=42)
scores = cross_validate(param_search, X, y, cv=KFold(5), scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)

# One row per outer fold. Fold numbers are derived from the number of result
# rows so they stay in step with the outer CV setting.
fold_scores = pd.DataFrame(scores)
score_table_3 = pd.DataFrame({'Model_name': 'SVR', 'Fold': np.arange(1, len(fold_scores) + 1), 'Search_strategy': 'RandomizedSearchCV'})
score_table_3 = pd.concat([score_table_3, fold_scores], axis=1)
display(score_table_3)
score_tables.append(score_table_3)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,SVR,1,RandomizedSearchCV,0.130543,0.001403,-4.227072,-5.606854,-30.156399,-73.499137,-3.730416,-3.66645
1,SVR,2,RandomizedSearchCV,0.134056,0.00163,-6.090175,-5.134438,-87.642218,-62.169859,-3.463028,-3.591983
2,SVR,3,RandomizedSearchCV,0.137415,0.001727,-7.879197,-4.741554,-131.387892,-55.93059,-3.903261,-3.198133
3,SVR,4,RandomizedSearchCV,0.134555,0.001466,-6.056886,-5.116578,-82.461986,-61.105464,-4.422708,-3.105338
4,SVR,5,RandomizedSearchCV,0.137346,0.001449,-4.085762,-5.674818,-26.497525,-78.68299,-3.499857,-3.696604


In [7]:
# Search space for ElasticNet: 10000 evenly spaced values between 1e-6 and 1
# for both alpha and l1_ratio.
param_candidates = {'alpha': np.linspace(start=1e-6, stop=1, num=10000, dtype=float), 'l1_ratio': np.linspace(start=1e-6, stop=1, num=10000, dtype=float)}

# Nested cross-validation: the inner 3-fold randomized search selects the
# hyper-parameters by (negated) MSE, the outer 5-fold cross_validate scores the
# tuned model. random_state fixes which candidates are sampled, so a re-run of
# this cell reproduces the same table.
param_search = RandomizedSearchCV(estimator=ElasticNet(), scoring='neg_mean_squared_error', param_distributions=param_candidates, cv=KFold(3), random_state=42)
scores = cross_validate(param_search, X, y, cv=KFold(5), scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)

# One row per outer fold. Fold numbers are derived from the number of result
# rows so they stay in step with the outer CV setting.
fold_scores = pd.DataFrame(scores)
score_table_4 = pd.DataFrame({'Model_name': 'ElasticNet', 'Fold': np.arange(1, len(fold_scores) + 1), 'Search_strategy': 'RandomizedSearchCV'})
score_table_4 = pd.concat([score_table_4, fold_scores], axis=1)
display(score_table_4)
score_tables.append(score_table_4)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,ElasticNet,1,RandomizedSearchCV,0.035668,0.000597,-2.961781,-3.81507,-14.588823,-29.30358,-2.377637,-2.73724
1,ElasticNet,2,RandomizedSearchCV,0.031571,0.001299,-3.871292,-3.519651,-30.508791,-26.037138,-2.310901,-2.613771
2,ElasticNet,3,RandomizedSearchCV,0.031362,0.000592,-5.080921,-3.429606,-51.098106,-25.892491,-3.471006,-2.417245
3,ElasticNet,4,RandomizedSearchCV,0.031922,0.000591,-5.163052,-2.822201,-65.905238,-14.586249,-2.879497,-2.216872
4,ElasticNet,5,RandomizedSearchCV,0.031022,0.000586,-3.532948,-3.635839,-20.478447,-26.315777,-3.126598,-2.528097


In [8]:
# Search space for the decision tree: three split criteria and ten integer
# depths spanning 1 to 10.
# NOTE(review): 'mse' and 'mae' are the older scikit-learn spellings; newer
# releases name them 'squared_error' and 'absolute_error' -- confirm against
# the installed version before re-running.
param_candidates = {'criterion': ['mse', 'friedman_mse', 'mae'], 'max_depth': np.linspace(start=1, stop=10, num=10, dtype=int)}

# Nested cross-validation: the inner 3-fold randomized search selects the
# hyper-parameters by (negated) MSE, the outer 5-fold cross_validate scores the
# tuned model. Both the search and the tree are seeded so a re-run of this cell
# reproduces the same table.
param_search = RandomizedSearchCV(estimator=DecisionTreeRegressor(random_state=42), scoring='neg_mean_squared_error', param_distributions=param_candidates, cv=KFold(3), random_state=42)
scores = cross_validate(param_search, X, y, cv=KFold(5), scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)

# One row per outer fold. Fold numbers are derived from the number of result
# rows so they stay in step with the outer CV setting.
fold_scores = pd.DataFrame(scores)
score_table_5 = pd.DataFrame({'Model_name': 'DecisionTreeRegressor', 'Fold': np.arange(1, len(fold_scores) + 1), 'Search_strategy': 'RandomizedSearchCV'})
score_table_5 = pd.concat([score_table_5, fold_scores], axis=1)
display(score_table_5)
score_tables.append(score_table_5)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,DecisionTreeRegressor,1,RandomizedSearchCV,0.093173,0.000646,-2.482865,-1.314845,-11.039039,-3.29442,-1.73619,-0.951875
1,DecisionTreeRegressor,2,RandomizedSearchCV,0.091215,0.000612,-4.136143,-2.186944,-36.982743,-8.153316,-2.997802,-1.802198
2,DecisionTreeRegressor,3,RandomizedSearchCV,0.134014,0.001109,-4.04061,-1.261149,-31.411912,-2.98181,-3.1,-0.922222
3,DecisionTreeRegressor,4,RandomizedSearchCV,0.107306,0.000682,-3.989405,-2.071758,-52.667296,-7.687298,-2.162963,-1.55
4,DecisionTreeRegressor,5,RandomizedSearchCV,0.102668,0.000646,-4.731403,-1.882764,-55.571147,-6.229496,-3.294444,-1.544186


# Generarea raportului

In [9]:
# The scorers used above are the negated ('neg_') variants, so every metric
# column holds a negative error. Flip the sign back and drop the 'neg_' marker
# from the column names. Tables are updated in place because later cells read
# them through `score_tables`.
negated_to_plain = {
    f'{split}_neg_{metric}': f'{split}_{metric}'
    for metric in ('mean_absolute_error', 'mean_squared_error', 'median_absolute_error')
    for split in ('test', 'train')
}

for table in score_tables:
    # Only touch columns that still carry the negated name. A table that was
    # already converted is left alone, so re-running this cell is a no-op
    # instead of a KeyError.
    pending = [column for column in negated_to_plain if column in table.columns]
    if not pending:
        continue
    table[pending] = -table[pending]
    table.rename(columns=negated_to_plain, inplace=True)

In [10]:
def highlight(s):
    """Build per-cell CSS styles marking the extremes of a column.

    Intended for use with ``Styler.apply``: every element equal to the
    column maximum gets a red background, every element equal to the column
    minimum gets a green background, and all others get no style. When an
    element is both the maximum and the minimum (all values equal), red wins.

    Parameters
    ----------
    s : pandas.Series
        The column of values to style.

    Returns
    -------
    list of str
        One CSS declaration (or an empty string) per element of ``s``, in
        the same order.
    """
    is_max = s == s.max()
    is_min = s == s.min()
    styles = []
    # Walk the two masks positionally. Indexing them with `mask[i]` would be a
    # label lookup and fail for any Series whose index is not 0..n-1.
    for at_max, at_min in zip(is_max, is_min):
        if at_max:
            styles.append('background-color: red')
        elif at_min:
            styles.append('background-color: green')
        else:
            styles.append('')
    return styles

# Stack every per-model table into one frame with a fresh running index.
all_scores = pd.concat(score_tables, ignore_index=True)

# Only the six error-metric columns are coloured; timing and label columns are
# left unstyled.
error_columns = [
    f'{split}_{metric}'
    for metric in ('mean_absolute_error', 'mean_squared_error', 'median_absolute_error')
    for split in ('test', 'train')
]
all_scores_styled = all_scores.style.apply(highlight, subset=error_columns)
display(all_scores_styled)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_mean_absolute_error,train_mean_absolute_error,test_mean_squared_error,train_mean_squared_error,test_median_absolute_error,train_median_absolute_error
0,Lasso,1,RandomizedSearchCV,0.322133,0.000626,3.040944,3.845501,15.106689,29.839852,2.551733,2.76563
1,Lasso,2,RandomizedSearchCV,0.304299,0.000583,4.08918,3.590369,33.312672,27.163298,2.48597,2.579176
2,Lasso,3,RandomizedSearchCV,0.297352,0.000583,5.18441,3.460317,53.198864,26.240197,3.495315,2.37455
3,Lasso,4,RandomizedSearchCV,0.371675,0.000795,5.444537,2.750418,77.446727,13.707906,3.06858,2.192848
4,Lasso,5,RandomizedSearchCV,0.321185,0.000619,4.370471,3.363105,27.811092,23.798142,4.255982,2.273063
5,BayesianRidge,1,RandomizedSearchCV,0.07696,0.000841,2.458265,3.641977,11.58112,26.030987,1.692319,2.71255
6,BayesianRidge,2,RandomizedSearchCV,0.056544,0.000625,3.639594,3.349645,24.755839,23.309407,2.587417,2.419693
7,BayesianRidge,3,RandomizedSearchCV,0.056992,0.000617,3.957318,3.252826,30.558774,23.237216,2.688393,2.203323
8,BayesianRidge,4,RandomizedSearchCV,0.070016,0.000858,5.510968,2.722113,79.668801,13.159652,2.768886,2.162885
9,BayesianRidge,5,RandomizedSearchCV,0.06037,0.000615,4.409364,3.333445,28.375338,23.309379,4.466453,2.309457


# Exportarea raportului

In [11]:
# Page heading placed before the styled score table in the exported report.
html_str = """
<h1><u>Boston Housing</u></h1>
"""
# Newer pandas exposes Styler.to_html() and no longer ships Styler.render().
# Prefer to_html() and fall back to render() only where it is missing.
styler_to_html = getattr(all_scores_styled, 'to_html', None) or all_scores_styled.render
html_str = html_str + styler_to_html()
# The context manager closes the file even if the write raises.
with open("./Reports/housing_reg_report.html", "w") as html_file:
    html_file.write(html_str)