# Modele de regresie: CPU Computer Hardware

Sîrbu Matei Dan, _grupa 10LF383_

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML

In [2]:
# Column labels passed to read_csv via names=, in file order.
header = [
    'Vendor Name', 'Model Name',
    'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX',
    'PRP', 'ERP',
]
data_cpu_raw = pd.read_csv(
    "./Datasets/Computer Hardware/machine.data",
    names=header,
)
# Show a caption followed by the raw frame.
display(HTML('<i>CPU Computer Hardware dataset overview:</i>'), data_cpu_raw)

Unnamed: 0,Vendor Name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...,...,...
204,sperry,80/8,124,1000,8000,0,1,8,42,37
205,sperry,90/80-model-3,98,1000,8000,32,2,8,46,50
206,sratus,32,125,2000,8000,0,2,14,52,41
207,wang,vs-100,480,512,8000,32,0,0,67,47


In [3]:
# Keep the six hardware attributes plus PRP and cast them to int;
# the vendor/model name columns and ERP are left out of the analysis frame.
analysis_columns = ['MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP']
data_cpu = data_cpu_raw[analysis_columns].astype(int)
display(HTML('<i>Dataset to be analyzed:</i>'), data_cpu)

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP
0,125,256,6000,256,16,128,198
1,29,8000,32000,32,8,32,269
2,29,8000,32000,32,8,32,220
3,29,8000,32000,32,8,32,172
4,29,8000,16000,32,8,16,132
...,...,...,...,...,...,...,...
204,124,1000,8000,0,1,8,42
205,98,1000,8000,32,2,8,46
206,125,2000,8000,0,2,14,52
207,480,512,8000,32,0,0,67


In [4]:
import sklearn
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.linear_model import Lasso, BayesianRidge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler

# Feature matrix (2-D) and PRP target vector (1-D) as NumPy arrays.
feature_columns = ['MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX']
X = data_cpu[feature_columns].to_numpy()
y = data_cpu['PRP'].to_numpy()
# Collects one per-model score table; filled by the model cells below.
score_tables = []

In [5]:
# Lasso: candidate values for the regularisation strength alpha.
param_candidates = {'alpha': np.linspace(start=1e-5, stop=1, num=10000, dtype=float)}

# Nested cross-validation: the inner 3-fold randomized search picks alpha by
# (negated) MSE, the outer 5-fold cross_validate scores the tuned estimator.
# random_state fixes which candidates are sampled so the table is reproducible
# after Restart & Run All.
param_search = RandomizedSearchCV(
    estimator=Lasso(),
    scoring='neg_mean_squared_error',
    param_distributions=param_candidates,
    cv=KFold(3),
    n_iter=100,
    random_state=0,
)
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)

# One row per outer fold: identifying columns first, then the timing/score columns.
score_table_1 = pd.DataFrame({'Model_name': np.repeat('Lasso', 5), 'Fold': np.arange(1, 6), 'Search_strategy': np.repeat('RandomizedSearchCV', 5)})
score_table_1 = pd.concat([score_table_1, pd.DataFrame(scores)], axis=1)
display(score_table_1)
score_tables.append(score_table_1)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,Lasso,1,RandomizedSearchCV,0.271561,0.000601,-61.305867,-33.54289,-7123.602957,-2659.09702,-42.570775,-24.179804
1,Lasso,2,RandomizedSearchCV,0.262462,0.000552,-31.934554,-40.921927,-2305.962094,-3845.284247,-22.403228,-30.68129
2,Lasso,3,RandomizedSearchCV,0.252442,0.000578,-27.965063,-42.677265,-1495.411138,-4046.45292,-22.038143,-33.221831
3,Lasso,4,RandomizedSearchCV,0.250499,0.000551,-35.272646,-38.429436,-2319.461841,-3823.113818,-23.843033,-23.645018
4,Lasso,5,RandomizedSearchCV,0.255944,0.000547,-60.27901,-27.861408,-18644.910224,-1844.73831,-24.167595,-16.210464


In [6]:
# BayesianRidge: candidate values for the alpha_1 / alpha_2 hyper-parameters.
param_candidates = {
    'alpha_1': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
    'alpha_2': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
}

# Nested cross-validation: inner 3-fold randomized search (selected by negated
# MSE), outer 5-fold scoring. random_state fixes the sampled candidates so the
# table is reproducible after Restart & Run All.
param_search = RandomizedSearchCV(
    estimator=BayesianRidge(),
    scoring='neg_mean_squared_error',
    param_distributions=param_candidates,
    cv=KFold(3),
    random_state=0,
)
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)

# One row per outer fold: identifying columns first, then the timing/score columns.
score_table_2 = pd.DataFrame({'Model_name': np.repeat('BayesianRidge', 5), 'Fold': np.arange(1, 6), 'Search_strategy': np.repeat('RandomizedSearchCV', 5)})
score_table_2 = pd.concat([score_table_2, pd.DataFrame(scores)], axis=1)
display(score_table_2)
score_tables.append(score_table_2)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,BayesianRidge,1,RandomizedSearchCV,0.055011,0.000602,-60.349225,-33.601253,-6963.987456,-2674.849623,-42.339617,-23.937021
1,BayesianRidge,2,RandomizedSearchCV,0.044367,0.000589,-31.430321,-40.787111,-2061.55099,-3864.361418,-23.323074,-29.653463
2,BayesianRidge,3,RandomizedSearchCV,0.046741,0.000701,-27.282863,-42.663295,-1411.783889,-4070.859705,-22.105047,-33.43149
3,BayesianRidge,4,RandomizedSearchCV,0.048407,0.000728,-35.311586,-38.312349,-2215.938458,-3842.745386,-26.148431,-23.493798
4,BayesianRidge,5,RandomizedSearchCV,0.04221,0.000686,-56.451432,-28.222265,-17893.236336,-1952.97596,-19.798522,-17.390695


In [7]:
# SVR: candidate values for the penalty C and the epsilon-tube width.
param_candidates = {
    'C': [0.001, 0.01, 0.1, 1],
    'epsilon': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
}

# NOTE(review): X is passed unscaled although MinMaxScaler is imported in this
# notebook; SVR is usually sensitive to feature scale -- confirm whether
# scaling was intended before comparing these scores with the other models.

# Nested cross-validation: inner 3-fold randomized search (selected by negated
# MSE), outer 5-fold scoring. random_state fixes the sampled candidates so the
# table is reproducible after Restart & Run All.
param_search = RandomizedSearchCV(
    estimator=SVR(max_iter=1000),
    scoring='neg_mean_squared_error',
    param_distributions=param_candidates,
    cv=KFold(3),
    random_state=0,
)
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)

# One row per outer fold: identifying columns first, then the timing/score columns.
score_table_3 = pd.DataFrame({'Model_name': np.repeat('SVR', 5), 'Fold': np.arange(1, 6), 'Search_strategy': np.repeat('RandomizedSearchCV', 5)})
score_table_3 = pd.concat([score_table_3, pd.DataFrame(scores)], axis=1)
display(score_table_3)
score_tables.append(score_table_3)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,SVR,1,RandomizedSearchCV,0.045469,0.000712,-107.344447,-57.113726,-49411.376489,-19764.736405,-27.492795,-18.212323
1,SVR,2,RandomizedSearchCV,0.04271,0.00072,-24.798887,-75.964988,-1588.354594,-30257.711682,-16.352613,-21.630033
2,SVR,3,RandomizedSearchCV,0.041527,0.000711,-48.862505,-72.51341,-9264.485153,-29385.540133,-20.508142,-22.389687
3,SVR,4,RandomizedSearchCV,0.041737,0.000709,-61.072164,-68.927711,-14271.456001,-28486.971122,-21.051624,-19.572271
4,SVR,5,RandomizedSearchCV,0.041605,0.000747,-109.671007,-68.703339,-59492.232761,-21399.46487,-26.670889,-24.320282


In [8]:
# ElasticNet: candidate values for the overall penalty alpha and the L1/L2 mix.
param_candidates = {
    'alpha': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
    'l1_ratio': np.linspace(start=1e-6, stop=1, num=10000, dtype=float),
}

# Nested cross-validation: inner 3-fold randomized search (selected by negated
# MSE), outer 5-fold scoring. random_state fixes the sampled candidates so the
# table is reproducible after Restart & Run All.
param_search = RandomizedSearchCV(
    estimator=ElasticNet(),
    scoring='neg_mean_squared_error',
    param_distributions=param_candidates,
    cv=KFold(3),
    random_state=0,
)
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)

# One row per outer fold: identifying columns first, then the timing/score columns.
score_table_4 = pd.DataFrame({'Model_name': np.repeat('ElasticNet', 5), 'Fold': np.arange(1, 6), 'Search_strategy': np.repeat('RandomizedSearchCV', 5)})
score_table_4 = pd.concat([score_table_4, pd.DataFrame(scores)], axis=1)
display(score_table_4)
score_tables.append(score_table_4)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,ElasticNet,1,RandomizedSearchCV,0.033046,0.000586,-61.293317,-33.544796,-7121.798476,-2659.102338,-42.565863,-24.200876
1,ElasticNet,2,RandomizedSearchCV,0.029246,0.000559,-31.94836,-40.933474,-2309.774782,-3845.240286,-22.506308,-30.672267
2,ElasticNet,3,RandomizedSearchCV,0.028997,0.000929,-27.974507,-42.678875,-1496.464212,-4046.444434,-22.047392,-33.224029
3,ElasticNet,4,RandomizedSearchCV,0.028455,0.000548,-35.274293,-38.430011,-2319.825072,-3823.108306,-23.845133,-23.64877
4,ElasticNet,5,RandomizedSearchCV,0.028367,0.000556,-58.957417,-27.848123,-18424.263825,-1845.576245,-23.595672,-16.455677


In [9]:
# scikit-learn 1.0 renamed the 'mse'/'mae' split criteria to
# 'squared_error'/'absolute_error' and 1.2 removed the old spellings, so pick
# the names the installed version accepts; otherwise those candidates fail to
# fit inside the search.
if int(sklearn.__version__.split('.')[0]) >= 1:
    tree_criteria = ['squared_error', 'friedman_mse', 'absolute_error']
else:
    tree_criteria = ['mse', 'friedman_mse', 'mae']

# DecisionTreeRegressor: split criterion and maximum depth 1..10.
param_candidates = {
    'criterion': tree_criteria,
    'max_depth': np.linspace(start=1, stop=10, num=10, dtype=int),
}

# Nested cross-validation: inner 3-fold randomized search (selected by negated
# MSE), outer 5-fold scoring. random_state is fixed on both the tree and the
# search so the table is reproducible after Restart & Run All.
param_search = RandomizedSearchCV(
    estimator=DecisionTreeRegressor(random_state=0),
    scoring='neg_mean_squared_error',
    param_distributions=param_candidates,
    cv=KFold(3),
    random_state=0,
)
scores = cross_validate(
    param_search, X, y,
    cv=KFold(5),
    scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'],
    return_train_score=True,
)

# One row per outer fold: identifying columns first, then the timing/score columns.
score_table_5 = pd.DataFrame({'Model_name': np.repeat('DecisionTreeRegressor', 5), 'Fold': np.arange(1, 6), 'Search_strategy': np.repeat('RandomizedSearchCV', 5)})
score_table_5 = pd.concat([score_table_5, pd.DataFrame(scores)], axis=1)
display(score_table_5)
score_tables.append(score_table_5)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_median_absolute_error
0,DecisionTreeRegressor,1,RandomizedSearchCV,0.038723,0.000605,-51.738095,-6.706587,-8541.642857,-206.640719,-21.5,-2.0
1,DecisionTreeRegressor,2,RandomizedSearchCV,0.03921,0.000592,-23.571429,-15.389222,-1623.190476,-670.850299,-15.5,-7.0
2,DecisionTreeRegressor,3,RandomizedSearchCV,0.033809,0.000602,-30.453571,-15.577763,-2765.247393,-518.177718,-13.722222,-9.222222
3,DecisionTreeRegressor,4,RandomizedSearchCV,0.038073,0.000591,-42.099206,-4.842382,-3078.627315,-133.610113,-30.5,-1.0
4,DecisionTreeRegressor,5,RandomizedSearchCV,0.032823,0.0006,-82.582898,-7.879578,-33459.098613,-184.688131,-20.0,-4.3125


# Generarea raportului

In [10]:
# The score columns were produced with 'neg_*' scorers, so they hold negated
# errors. Flip the sign and drop the 'neg_' part of the column name so the
# report shows plain positive errors.
#
# Columns are discovered by prefix instead of being listed one by one, and a
# table whose columns were already converted is skipped, so re-running this
# cell is a no-op rather than a KeyError.
for table in score_tables:
    neg_columns = [col for col in table.columns
                   if str(col).startswith(('test_neg_', 'train_neg_'))]
    if not neg_columns:
        # Already converted on a previous run of this cell.
        continue
    table[neg_columns] = -table[neg_columns]
    # 'test_neg_mean_squared_error' -> 'test_mean_squared_error', etc.
    table.rename(columns={col: col.replace('_neg_', '_', 1) for col in neg_columns},
                 inplace=True)

In [11]:
def highlight(s):
    """Build per-element CSS styles marking the extremes of a column.

    Intended as the callable for ``Styler.apply``: it receives one column and
    returns one style string per element.

    Parameters
    ----------
    s : pandas.Series
        Column of comparable values.

    Returns
    -------
    list of str
        ``'background-color: red'`` for elements equal to the maximum,
        ``'background-color: green'`` for elements equal to the minimum and
        ``''`` otherwise. An element that is both (e.g. in a constant column)
        is marked red, because the maximum is checked first.
    """
    is_max = s == s.max()
    is_min = s == s.min()
    styles = []
    # Walk the values directly instead of indexing with range(len(s)):
    # integer lookups on a Series are label-based, so is_max[i] only works
    # when the index happens to be exactly 0..n-1.
    for at_max, at_min in zip(is_max, is_min):
        if at_max:
            styles.append('background-color: red')
        elif at_min:
            styles.append('background-color: green')
        else:
            styles.append('')
    return styles

# Error columns whose per-column max/min get coloured by highlight().
error_columns = [
    'test_mean_absolute_error', 'train_mean_absolute_error',
    'test_mean_squared_error', 'train_mean_squared_error',
    'test_median_absolute_error', 'train_median_absolute_error',
]

# Stack every per-model table into one frame with a fresh 0..n-1 index,
# then attach the highlighting and show the styled result.
all_scores = pd.concat(score_tables, ignore_index=True)
all_scores_styled = all_scores.style.apply(highlight, subset=error_columns)
display(all_scores_styled)

Unnamed: 0,Model_name,Fold,Search_strategy,fit_time,score_time,test_mean_absolute_error,train_mean_absolute_error,test_mean_squared_error,train_mean_squared_error,test_median_absolute_error,train_median_absolute_error
0,Lasso,1,RandomizedSearchCV,0.271561,0.000601,61.305867,33.54289,7123.602957,2659.09702,42.570775,24.179804
1,Lasso,2,RandomizedSearchCV,0.262462,0.000552,31.934554,40.921927,2305.962094,3845.284247,22.403228,30.68129
2,Lasso,3,RandomizedSearchCV,0.252442,0.000578,27.965063,42.677265,1495.411138,4046.45292,22.038143,33.221831
3,Lasso,4,RandomizedSearchCV,0.250499,0.000551,35.272646,38.429436,2319.461841,3823.113818,23.843033,23.645018
4,Lasso,5,RandomizedSearchCV,0.255944,0.000547,60.27901,27.861408,18644.910224,1844.73831,24.167595,16.210464
5,BayesianRidge,1,RandomizedSearchCV,0.055011,0.000602,60.349225,33.601253,6963.987456,2674.849623,42.339617,23.937021
6,BayesianRidge,2,RandomizedSearchCV,0.044367,0.000589,31.430321,40.787111,2061.55099,3864.361418,23.323074,29.653463
7,BayesianRidge,3,RandomizedSearchCV,0.046741,0.000701,27.282863,42.663295,1411.783889,4070.859705,22.105047,33.43149
8,BayesianRidge,4,RandomizedSearchCV,0.048407,0.000728,35.311586,38.312349,2215.938458,3842.745386,26.148431,23.493798
9,BayesianRidge,5,RandomizedSearchCV,0.04221,0.000686,56.451432,28.222265,17893.236336,1952.97596,19.798522,17.390695


# Exportarea raportului

In [12]:
# Report = a heading followed by the styled score table rendered as HTML.
html_str = """
<h1><u>CPU computer hardware</u></h1>
"""
# Styler.render() was deprecated in pandas 1.3 and removed in 2.0 in favour of
# Styler.to_html(); use to_html when available and fall back to render on
# older pandas versions.
render_styled = getattr(all_scores_styled, 'to_html', None) or all_scores_styled.render
html_str = html_str + render_styled()
# The with-block closes the file even if write() raises.
with open("./Reports/cpu_reg_report.html", "w", encoding="utf-8") as html_file:
    html_file.write(html_str)