# Modele de regresie: CPU Computer Hardware

Sîrbu Matei Dan, _grupa 10LF383_

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML

In [2]:
# Column labels supplied via `names=`, so every line of machine.data is read as a data row.
header = [
    'Vendor Name', 'Model Name',
    'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX',
    'PRP', 'ERP',
]
data_cpu_raw = pd.read_csv("./Datasets/Computer Hardware/machine.data", names=header)

# Caption first, then the raw frame, rendered through the rich display machinery.
display(HTML('<i>CPU Computer Hardware dataset overview:</i>'), data_cpu_raw)

Unnamed: 0,Vendor Name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...,...,...
204,sperry,80/8,124,1000,8000,0,1,8,42,37
205,sperry,90/80-model-3,98,1000,8000,32,2,8,46,50
206,sratus,32,125,2000,8000,0,2,14,52,41
207,wang,vs-100,480,512,8000,32,0,0,67,47


In [3]:
# Keep only the numeric hardware attributes plus the published performance (PRP);
# the vendor/model labels and ERP are left behind in data_cpu_raw. Cast everything to int.
data_cpu = data_cpu_raw[
    ['MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP']
].astype(int)

display(HTML('<i>Dataset to be analyzed:</i>'), data_cpu)

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP
0,125,256,6000,256,16,128,198
1,29,8000,32000,32,8,32,269
2,29,8000,32000,32,8,32,220
3,29,8000,32000,32,8,32,172
4,29,8000,16000,32,8,16,132
...,...,...,...,...,...,...,...
204,124,1000,8000,0,1,8,42
205,98,1000,8000,32,2,8,46
206,125,2000,8000,0,2,14,52
207,480,512,8000,32,0,0,67


In [4]:
import sklearn
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.linear_model import Lasso, BayesianRidge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler

# Predictors: the six hardware attributes, as a 2-D NumPy array.
X = data_cpu.loc[:, ['MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX']].to_numpy()
# Target: PRP as a flat 1-D array, the shape scikit-learn regressors expect.
y = data_cpu.loc[:, 'PRP'].to_numpy()
# Collects one per-fold score table per model; filled by the cells below.
score_tables = list()

In [5]:
# Lasso, evaluated with nested cross-validation:
#   * inner loop  - RandomizedSearchCV picks `alpha` on 3 folds by negative MSE;
#   * outer loop  - cross_validate scores the tuned search on 5 folds with three error metrics.
param_candidates = {'alpha': np.linspace(start=1e-5, stop=1, num=10000, dtype=float)}

# random_state fixes which 100 candidates are sampled, so re-running the notebook
# reproduces the same score table instead of a different one on every run.
param_search = RandomizedSearchCV(
    estimator=Lasso(),
    param_distributions=param_candidates,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=KFold(3),
    random_state=0,
)
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)

# One row per outer fold: identifying columns first, then timings and scores.
score_table_1 = pd.DataFrame({'Model_name': 'Lasso', 'Fold': np.arange(1, 6), 'Search_strategy': 'RandomizedSearchCV'})
score_table_1 = pd.concat([score_table_1, pd.DataFrame(scores)], axis=1)
display(score_table_1)
score_tables.append(score_table_1)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,Lasso,1,RandomizedSearchCV,0.292598,0.001208,-61.304661,-33.542528,-7123.301757,-2659.100585,-42.570747,-24.181218
1,Lasso,2,RandomizedSearchCV,0.268929,0.000549,-31.934621,-40.921976,-2305.983487,-3845.283963,-22.403702,-30.681254
2,Lasso,3,RandomizedSearchCV,0.253522,0.000546,-27.965143,-42.677278,-1495.42145,-4046.452768,-22.038221,-33.221794
3,Lasso,4,RandomizedSearchCV,0.251216,0.000553,-35.272056,-38.429216,-2319.323288,-3823.11597,-23.84241,-23.644019
4,Lasso,5,RandomizedSearchCV,0.252699,0.00056,-60.277228,-27.861359,-18644.538361,-1844.738314,-24.16682,-16.210339


In [6]:
# BayesianRidge, evaluated with nested cross-validation:
#   * inner loop  - RandomizedSearchCV picks `alpha_1` / `alpha_2` on 3 folds by negative MSE;
#   * outer loop  - cross_validate scores the tuned search on 5 folds with three error metrics.
param_candidates = {
    'alpha_1': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
    'alpha_2': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
}

# random_state fixes which candidates are sampled (n_iter is left at its default),
# so re-running the notebook reproduces the same score table.
param_search = RandomizedSearchCV(
    estimator=BayesianRidge(),
    param_distributions=param_candidates,
    scoring='neg_mean_squared_error',
    cv=KFold(3),
    random_state=0,
)
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)

# One row per outer fold: identifying columns first, then timings and scores.
score_table_2 = pd.DataFrame({'Model_name': 'BayesianRidge', 'Fold': np.arange(1, 6), 'Search_strategy': 'RandomizedSearchCV'})
score_table_2 = pd.concat([score_table_2, pd.DataFrame(scores)], axis=1)
display(score_table_2)
score_tables.append(score_table_2)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,BayesianRidge,1,RandomizedSearchCV,0.083732,0.000622,-60.351027,-33.60088,-6964.290748,-2674.779209,-42.34044,-23.932244
1,BayesianRidge,2,RandomizedSearchCV,0.043363,0.000696,-31.430214,-40.787084,-2061.482395,-3864.37132,-23.323312,-29.653194
2,BayesianRidge,3,RandomizedSearchCV,0.046825,0.000576,-27.283274,-42.663372,-1411.817179,-4070.810701,-22.106036,-33.431901
3,BayesianRidge,4,RandomizedSearchCV,0.041856,0.000625,-35.311584,-38.31235,-2215.93885,-3842.745187,-26.148412,-23.493802
4,BayesianRidge,5,RandomizedSearchCV,0.045548,0.000771,-56.450443,-28.222315,-17892.9036,-1952.947497,-19.799159,-17.390982


In [7]:
# SVR, evaluated with nested cross-validation:
#   * inner loop  - RandomizedSearchCV picks `C` / `epsilon` on 3 folds by negative MSE;
#   * outer loop  - cross_validate scores the tuned search on 5 folds with three error metrics.
# NOTE(review): X is passed in unscaled and the solver is capped at 1000 iterations;
# feature scaling may be worth adding here - confirm whether this was intended.
param_candidates = {
    'C': [0.001, 0.01, 0.1, 1],
    'epsilon': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
}

# random_state fixes which candidates are sampled (n_iter is left at its default),
# so re-running the notebook reproduces the same score table.
param_search = RandomizedSearchCV(
    estimator=SVR(max_iter=1000),
    param_distributions=param_candidates,
    scoring='neg_mean_squared_error',
    cv=KFold(3),
    random_state=0,
)
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)

# One row per outer fold: identifying columns first, then timings and scores.
score_table_3 = pd.DataFrame({'Model_name': 'SVR', 'Fold': np.arange(1, 6), 'Search_strategy': 'RandomizedSearchCV'})
score_table_3 = pd.concat([score_table_3, pd.DataFrame(scores)], axis=1)
display(score_table_3)
score_tables.append(score_table_3)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,SVR,1,RandomizedSearchCV,0.046386,0.000748,-107.440941,-57.206267,-49463.265523,-19798.745596,-27.258434,-18.220664
1,SVR,2,RandomizedSearchCV,0.041643,0.000709,-24.711845,-75.967345,-1589.055698,-30288.534327,-16.091488,-21.891159
2,SVR,3,RandomizedSearchCV,0.041187,0.000705,-61.254642,-80.899617,-11984.889351,-32465.336972,-28.526068,-25.80557
3,SVR,4,RandomizedSearchCV,0.041455,0.000705,-61.110636,-68.904558,-14297.941479,-28519.001942,-20.728564,-19.247339
4,SVR,5,RandomizedSearchCV,0.041009,0.000688,-98.832994,-60.323948,-55025.223835,-18847.89555,-22.358735,-19.084554


In [8]:
# ElasticNet, evaluated with nested cross-validation:
#   * inner loop  - RandomizedSearchCV picks `alpha` / `l1_ratio` on 3 folds by negative MSE;
#   * outer loop  - cross_validate scores the tuned search on 5 folds with three error metrics.
param_candidates = {
    'alpha': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
    'l1_ratio': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
}

# random_state fixes which candidates are sampled (n_iter is left at its default),
# so re-running the notebook reproduces the same score table.
param_search = RandomizedSearchCV(
    estimator=ElasticNet(),
    param_distributions=param_candidates,
    scoring='neg_mean_squared_error',
    cv=KFold(3),
    random_state=0,
)
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)

# One row per outer fold: identifying columns first, then timings and scores.
score_table_4 = pd.DataFrame({'Model_name': 'ElasticNet', 'Fold': np.arange(1, 6), 'Search_strategy': 'RandomizedSearchCV'})
score_table_4 = pd.concat([score_table_4, pd.DataFrame(scores)], axis=1)
display(score_table_4)
score_tables.append(score_table_4)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,ElasticNet,1,RandomizedSearchCV,0.032785,0.000559,-61.298248,-33.544207,-7122.543061,-2659.099053,-42.567671,-24.19278
1,ElasticNet,2,RandomizedSearchCV,0.029563,0.000669,-31.934193,-40.924946,-2304.342996,-3845.272006,-22.413,-30.677393
2,ElasticNet,3,RandomizedSearchCV,0.02803,0.000551,-27.975239,-42.679003,-1496.539291,-4046.44421,-22.048111,-33.224472
3,ElasticNet,4,RandomizedSearchCV,0.027928,0.000548,-35.285309,-38.431147,-2321.13747,-3823.090942,-23.868144,-23.66941
4,ElasticNet,5,RandomizedSearchCV,0.027902,0.000559,-58.686494,-27.848728,-18388.328708,-1845.95424,-23.477137,-16.393197


In [9]:
# DecisionTreeRegressor, evaluated with nested cross-validation:
#   * inner loop  - RandomizedSearchCV picks `criterion` / `max_depth` on 3 folds by negative MSE;
#   * outer loop  - cross_validate scores the tuned search on 5 folds with three error metrics.
#
# scikit-learn 1.0 renamed the 'mse' / 'mae' criteria to 'squared_error' / 'absolute_error'
# and later releases dropped the old spellings, so pick the names the installed version accepts.
if int(sklearn.__version__.split('.')[0]) >= 1:
    tree_criteria = ['squared_error', 'friedman_mse', 'absolute_error']
else:
    tree_criteria = ['mse', 'friedman_mse', 'mae']
param_candidates = {
    'criterion': tree_criteria,
    'max_depth': np.linspace(start=1, stop=10, num=10, dtype=int),
}

# Seed both the tree and the candidate sampling so re-running the notebook
# reproduces the same score table.
param_search = RandomizedSearchCV(
    estimator=DecisionTreeRegressor(random_state=0),
    param_distributions=param_candidates,
    scoring='neg_mean_squared_error',
    cv=KFold(3),
    random_state=0,
)
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)

# One row per outer fold: identifying columns first, then timings and scores.
score_table_5 = pd.DataFrame({'Model_name': 'DecisionTreeRegressor', 'Fold': np.arange(1, 6), 'Search_strategy': 'RandomizedSearchCV'})
score_table_5 = pd.concat([score_table_5, pd.DataFrame(scores)], axis=1)
display(score_table_5)
score_tables.append(score_table_5)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,DecisionTreeRegressor,1,RandomizedSearchCV,0.038138,0.000595,-50.940476,-2.622754,-8413.839286,-63.131737,-20.0,-0.0
1,DecisionTreeRegressor,2,RandomizedSearchCV,0.036082,0.000586,-19.181349,-4.621158,-1197.728948,-137.576547,-9.75,-0.333333
2,DecisionTreeRegressor,3,RandomizedSearchCV,0.031029,0.000587,-20.467262,-6.506977,-964.948537,-194.877272,-11.520833,-2.333333
3,DecisionTreeRegressor,4,RandomizedSearchCV,0.035362,0.000597,-36.575397,-4.842382,-2217.198743,-133.610113,-26.5,-1.0
4,DecisionTreeRegressor,5,RandomizedSearchCV,0.031996,0.00059,-80.8268,-7.879578,-33260.464466,-184.688131,-20.0,-4.3125


# Generarea raportului

In [10]:
# The scorers used above report negated errors ("neg_*"). Flip the sign and drop the
# "neg_" marker from the column names so the report shows ordinary, positive error values.
#
# Each table is updated in place because later cells read `score_tables` directly.
# Columns are selected by the "_neg_" marker, so a table that was already converted
# is skipped and re-running this cell is a harmless no-op rather than a KeyError.
for table in score_tables:
    negated_columns = [column for column in table.columns if '_neg_' in column]
    if not negated_columns:
        continue  # already converted on a previous run of this cell
    table[negated_columns] = -table[negated_columns]
    table.rename(
        columns={column: column.replace('_neg_', '_', 1) for column in negated_columns},
        inplace=True,
    )

In [11]:
def highlight(s):
    """Build per-cell CSS for one column: worst (max) red, best (min) green.

    Intended for ``Styler.apply``; works for any Series index because the
    values are iterated directly instead of being looked up by label.

    Parameters
    ----------
    s : pandas.Series
        Column of error values.

    Returns
    -------
    list of str
        One CSS string per element of ``s``, in order. Every element equal to
        the maximum gets a red background, every element equal to the minimum
        gets a green background, all others get ``''``. When the maximum and
        minimum coincide, red wins.
    """
    highest = s.max()
    lowest = s.min()
    return [
        'background-color: red' if value == highest
        else 'background-color: green' if value == lowest
        else ''
        for value in s
    ]

# Stack every model's per-fold table into a single frame with a fresh 0..n-1 index.
all_scores = pd.concat(objs=score_tables, ignore_index=True)

# Colour each error column independently via `highlight`.
all_scores_styled = all_scores.style.apply(
    highlight,
    subset=[
        'test_mean_absolute_error',
        'train_mean_absolute_error',
        'test_mean_squared_error',
        'train_mean_squared_error',
        'test_median_absolute_error',
        'train_median_absolute_error',
    ],
)
display(all_scores_styled)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_mean_absolute_error,train_mean_absolute_error,test_mean_squared_error,train_mean_squared_error,test_median_absolute_error,train_median_absolute_error
0,Lasso,1,RandomizedSearchCV,0.292598,0.001208,61.304661,33.542528,7123.301757,2659.100585,42.570747,24.181218
1,Lasso,2,RandomizedSearchCV,0.268929,0.000549,31.934621,40.921976,2305.983487,3845.283963,22.403702,30.681254
2,Lasso,3,RandomizedSearchCV,0.253522,0.000546,27.965143,42.677278,1495.42145,4046.452768,22.038221,33.221794
3,Lasso,4,RandomizedSearchCV,0.251216,0.000553,35.272056,38.429216,2319.323288,3823.11597,23.84241,23.644019
4,Lasso,5,RandomizedSearchCV,0.252699,0.00056,60.277228,27.861359,18644.538361,1844.738314,24.16682,16.210339
5,BayesianRidge,1,RandomizedSearchCV,0.083732,0.000622,60.351027,33.60088,6964.290748,2674.779209,42.34044,23.932244
6,BayesianRidge,2,RandomizedSearchCV,0.043363,0.000696,31.430214,40.787084,2061.482395,3864.37132,23.323312,29.653194
7,BayesianRidge,3,RandomizedSearchCV,0.046825,0.000576,27.283274,42.663372,1411.817179,4070.810701,22.106036,33.431901
8,BayesianRidge,4,RandomizedSearchCV,0.041856,0.000625,35.311584,38.31235,2215.93885,3842.745187,26.148412,23.493802
9,BayesianRidge,5,RandomizedSearchCV,0.045548,0.000771,56.450443,28.222315,17892.9036,1952.947497,19.799159,17.390982


# Exportarea raportului

In [12]:
# Assemble the report: a heading followed by the styled score table rendered as HTML.
html_str = """
<h1><u>CPU computer hardware</u></h1>
"""
# Styler.render() was deprecated and later removed from pandas; to_html() is its
# replacement. Fall back to render() only on pandas versions that predate to_html().
if hasattr(all_scores_styled, 'to_html'):
    html_str = html_str + all_scores_styled.to_html()
else:
    html_str = html_str + all_scores_styled.render()

# The context manager closes the file even if the write fails.
with open("./Reports/cpu_reg_report.html", "w", encoding="utf-8") as html_file:
    html_file.write(html_str)