In [1]:
import numpy as np
print(f'NumPy version: {np.__version__}')

import pandas as pd
print(f'Pandas version: {pd.__version__}')

import sklearn
print(f'Sklearn version: {sklearn.__version__}')

from typing import List, Dict, Tuple, Any, Callable

NumPy version: 1.18.5
Pandas version: 1.0.5
Sklearn version: 0.23.1


# <u> Citirea seturilor de date </u> 

## CPU Computer Hardware

In [2]:
names_list: List[str] = ['vendor name', 'Model Name', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP']
machine_data: pd.DataFrame = pd.read_csv('./data/machine.data', names = names_list, delimiter = ',')

machine_data = machine_data.drop(columns = ['vendor name', 'Model Name', 'ERP'])

machine_data.head()

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP
0,125,256,6000,256,16,128,198
1,29,8000,32000,32,8,32,269
2,29,8000,32000,32,8,32,220
3,29,8000,32000,32,8,32,172
4,29,8000,16000,32,8,16,132


In [3]:
X_machine_data: np.ndarray = machine_data.values[:, :-1]
y_machine_data: np.ndarray = machine_data.values[:, -1]

## Boston Housing

In [4]:
names_list: List[str] = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 
                            'B', 'LSTAT', 'MEDV']
housing_data: pd.DataFrame = pd.read_csv('./data/housing.data', names = names_list, delimiter = r'\s+')

housing_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [5]:
X_housing_data: np.ndarray = housing_data.values[:, :-1]
y_housing_data: np.ndarray = housing_data.values[:, -1]

## Wisconsin Breast Cancer

In [6]:
 r_wpbc_data: pd.DataFrame = pd.read_csv('./data/r_wpbc.data', header = None, delimiter = ',')

r_wpbc_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,5,18.02,27.6,117.5,1013.0,0.09489,0.1036,0.1086,0.07055,0.1865,...,139.7,1436.0,0.1195,0.1926,0.314,0.117,0.2677,0.08113,5.0,31
1,2,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,3.0,61
2,0,21.37,17.44,137.5,1373.0,0.08836,0.1189,0.1255,0.0818,0.2333,...,159.1,1949.0,0.1188,0.3449,0.3414,0.2032,0.4334,0.09067,2.5,116
3,0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2.0,123
4,0,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,3.5,27


In [7]:
X_r_wpbc_data: np.ndarray = r_wpbc_data.values[:, :-1]
y_r_wpbc_data: np.ndarray = r_wpbc_data.values[:, -1]

## Communities and Crime

In [8]:
communities_data: pd.DataFrame = pd.read_csv('./data/communities.data', header = None, delimiter = ',', na_values = '?')

communities_data = communities_data.drop(columns = [0, 1, 2, 3, 4])
communities_data.dropna(axis = 1, inplace = True)

communities_data.head()

Unnamed: 0,5,6,7,8,9,10,11,12,13,14,...,96,97,98,99,100,118,119,120,125,127
0,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,0.29,0.32,...,0.12,0.42,0.5,0.51,0.64,0.12,0.26,0.2,0.32,0.2
1,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.21,0.5,0.34,0.6,0.52,0.02,0.12,0.45,0.0,0.67
2,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.0,0.43
3,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,0.34,0.21,...,0.19,0.3,0.73,0.64,0.65,0.02,0.39,0.28,0.0,0.12
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.0,0.03


In [9]:
X_communities_data: np.ndarray = communities_data.values[:, :-1]
y_communities_data: np.ndarray = communities_data.values[:, -1]

# <u> Modele de regresie </u>

In [10]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor
from sklearn.svm import SVR

from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [11]:
def get_mean_scores(scores: Dict[str, List[float]]) -> np.ndarray:
    """
    Transformes the values (list of numbers) of the given dictionary into their average. 
    
    :param scores: a dictionary that has list values
    
    :returns: the transformed dictionary
    """
    for key in scores.keys():
        scores[key] = scores[key].mean()
    return scores

In [12]:
def model_result(pipe, X: np.ndarray, y: np.ndarray, 
                 parameter_grid: Dict[str, Any], n_iter: int) -> Tuple[Dict[str, List[float]], Dict[str, List[float]]]:
        
    """
    Applies cross_validate (5 fold cross validation) and report mean squared error, mean absolute error 
    and median absolute error.
    The optimal hyperparameters are being found using both GridSearchCV and
    RandomizedSearchedCV with cv = 3 and metrics beeing mean squared error.

    :param pipe: the pipeline used in both GridSearchCV and RandomizedSearchedCV
    :param X: dataset feautures
    :param y: dataset labels
    :param parameter_grid: a dictionary with the possible values for the hyperparameters
    :n_iter: the number of iterations for RandomizedSearchCV
    
    :returns: 2 dictionaries containing the results of the model
    """    
    scores: List[str] = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error']
    grid_search: Dict[str, List[float]] = cross_validate(GridSearchCV(pipe, 
                                                                  param_grid = parameter_grid, 
                                                                  scoring = 'neg_mean_squared_error', 
                                                                  cv = n_iter), 
                                                  X, y, cv = 5, 
                                                  scoring = scores, 
                                                  return_train_score = True)
        
    randomized_search: Dict[str, List[float]] = cross_validate(RandomizedSearchCV(pipe, 
                                                                  param_distributions = parameter_grid, 
                                                                  scoring = 'neg_mean_squared_error', 
                                                                  cv = 3, 
                                                                  n_iter = n_iter), 
                                                  X, y, cv = 5, 
                                                  scoring = scores, 
                                                  return_train_score = True)
                                                
    grid_search: Dict[str, List[float]] = get_mean_scores(grid_search)                                            
    randomized_search: Dict[str, List[float]] = get_mean_scores(randomized_search) 
    return grid_search, randomized_search

## SVR

In [13]:
def svr(X: np.ndarray, y: np.ndarray) -> Tuple[Dict[str, List[float]], Dict[str, List[float]]]:
    """
    Applies SVR regression on a data set 
    
    :param X: dataset feautures
    :param y: dataset labels
    
    :returns: 2 dictionaries containing the results of the model, results which are
              found in get_model_results function
    """
    pipe: sklearn.Pipeline = Pipeline([('scaler', MinMaxScaler()), ('svr_reg', SVR())])
    
    parameter_grid: Dict[str, List[Any]] = {'svr_reg__kernel': ['linear', 'poly'], 
                      'svr_reg__gamma': ['scale', 'auto']}    
    
    return model_result(pipe, X, y, parameter_grid, n_iter = 3)

## Lasso

In [14]:
def lasso(X: np.ndarray, y: np.ndarray) -> Tuple[Dict[str, List[float]], Dict[str, List[float]]]:
    """
    Applies Lasso regression on a data set 
    
    :param X: dataset feautures
    :param y: dataset labels
    
    :returns: 2 dictionaries containing the results of the model, results which are
              found in get_model_results function
    """
    pipe: sklearn.Pipeline = Pipeline([('scaler', MinMaxScaler()), ('lasso', Lasso(max_iter=20000))])
    
    parameter_grid: Dict[str, List[Any]] = { 'lasso__fit_intercept': [True, False], 
                      'lasso__normalize': [True, False]}   
                                                
    return model_result(pipe, X, y, parameter_grid, n_iter = 4)

## Ridge

In [15]:
def ridge(X: np.ndarray, y: np.ndarray) -> Tuple[Dict[str, List[float]], Dict[str, List[float]]]:
    """
    Applies Ridge regression on a data set 
    
    :param X: dataset feautures
    :param y: dataset labels
    
    :returns: 2 dictionaries containing the results of the model, results which are
              found in get_model_results function
    """
    pipe: sklearn.Pipeline = Pipeline([('scaler', MinMaxScaler()), ('ridge', Ridge())])
    
    parameter_grid: Dict[str, List[Any]] = {'ridge__alpha': [0.001, 0.01, 0.1],
                      'ridge__solver': ['auto', 'svd', 'cholesky'], 
                      'ridge__normalize': [True, False]}    
                                                
    return model_result(pipe, X, y, parameter_grid, n_iter = 10)

## ElasticNet

In [16]:
def elastic_net(X: np.ndarray, y: np.ndarray) -> Tuple[Dict[str, List[float]], Dict[str, List[float]]]:
    """
    Applies ElasticNet regression on a data set 
    
    :param X: dataset feautures
    :param y: dataset labels
    
    :returns: 2 dictionaries containing the results of the model, results which are
              found in get_model_results function
    """
    pipe: sklearn.Pipeline = Pipeline([('scaler', MinMaxScaler()), ('elastic_net', ElasticNet())])
    
    parameter_grid: Dict[str, List[Any]] = {'elastic_net__alpha': [0.001, 0.01, 0.1], 
                      'elastic_net__l1_ratio': [0.3, 0.5, 0.7], 
                      'elastic_net__selection': ['cyclic', 'random']} 
                                                
    return model_result(pipe, X, y, parameter_grid, n_iter = 6)

## SGDRegressor

In [17]:
def sgd_regressor(X: np.ndarray, y: np.ndarray) -> Tuple[Dict[str, List[float]], Dict[str, List[float]]]:
    """
    Applies SGDRegressor regression on a data set 
    
    :param X: dataset feautures
    :param y: dataset labels
    
    :returns: 2 dictionaries containing the results of the model, results which are
              found in get_model_results function
    """
    pipe: sklearn.Pipeline = Pipeline([('scaler',  MinMaxScaler()), ('sgd_reg', SGDRegressor(max_iter=20000))])
    
    parameter_grid: Dict[str, List[Any]] = {'sgd_reg__loss': ['squared_loss', 'huber'], 
                      'sgd_reg__penalty': ['l2', 'l1']} 
                                            
    return model_result(pipe, X, y, parameter_grid, n_iter = 3)

# <u> Aplicarea modelelor pe seturile de date </u>

In [18]:
def apply_models_on_data(X: np.ndarray, y: np.ndarray, models: List[Any]):
    """
    Applies the given models on the given dataset and creates a data frame with the results  
    
    :param X: dataset feautures
    :param y: dataset labels
    :param models: list of models to be called
    
    :return: data frame with each model's results using GridSearchCV and RandomizedSearchedCV
    """
    table: pd.DataFrame = pd.DataFrame()
    
    for model in models:
        X_copy, y_copy = X.copy(), y.copy()
        grid, randomized = model(X_copy, y_copy) 
        
        gridCV: Dict[str, Any] = {'Search_srategy' : 'GridSearchCV', 'Model_name' : models_names[model.__name__]}
        gridCV.update(grid)
        randomizedCV: Dict[str, Any] = {'Search_srategy' : 'RandomizedSearchCV', 'Model_name' : models_names[model.__name__]}
        randomizedCV.update(randomized)
        
        table = table.append(gridCV, ignore_index=True)
        table = table.append(randomizedCV, ignore_index=True)
        
    return table

In [19]:
models: List[Callable] = [svr, lasso, ridge, elastic_net, sgd_regressor]
models_names: Dict[str, str] = { 'svr': 'SVR',
                                'lasso': 'Lasso',
                                 'ridge': 'Ridge',
                                 'elastic_net': 'ElasticNet',
                                 'sgd_regressor': 'SGDRegressor'}

## CPU Computer Hardware

In [20]:
machine_data_table: pd.DataFrame = apply_models_on_data(X_machine_data, y_machine_data, models)
display(machine_data_table)

Unnamed: 0,Model_name,Search_srategy,fit_time,score_time,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error
0,SVR,GridSearchCV,0.052858,0.001597,-75.591254,-55328.814704,-22.793382,-65.054634,-22537.388389,-21.072565
1,SVR,RandomizedSearchCV,0.03012,0.001595,-72.801387,-26466.247064,-23.363063,-72.52665,-26372.176891,-21.924749
2,Lasso,GridSearchCV,0.041564,0.000798,-41.382581,-6382.713437,-25.105926,-35.874061,-3424.060352,-22.963974
3,Lasso,RandomizedSearchCV,0.039094,0.0002,-41.382581,-6382.713437,-25.105926,-35.874061,-3424.060352,-22.963974
4,Ridge,GridSearchCV,0.597796,0.000199,-41.756737,-6161.458456,-24.05009,-35.533432,-3310.720695,-22.962888
5,Ridge,RandomizedSearchCV,0.103053,0.000598,-41.741437,-6234.030659,-23.856883,-35.463698,-3318.029748,-22.755905
6,ElasticNet,GridSearchCV,0.25619,0.0,-40.719804,-6088.776828,-23.53724,-35.018865,-3663.906375,-20.623071
7,ElasticNet,RandomizedSearchCV,0.040621,0.0,-41.928784,-6241.881147,-24.602068,-35.548489,-3999.476994,-20.951418
8,SGDRegressor,GridSearchCV,0.631102,0.0,-44.677158,-6373.01087,-28.934983,-37.145343,-3279.752912,-26.168195
9,SGDRegressor,RandomizedSearchCV,0.51238,0.0,-44.802206,-6377.647128,-28.99212,-37.185133,-3285.866103,-26.184317


## Boston Housing

In [21]:
housing_data_table: pd.DataFrame = apply_models_on_data(X_housing_data, y_housing_data, models)
display(housing_data_table)

Unnamed: 0,Model_name,Search_srategy,fit_time,score_time,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error
0,SVR,GridSearchCV,0.103337,0.001197,-3.852212,-36.430662,-2.37175,-3.352318,-29.982779,-1.958605
1,SVR,RandomizedSearchCV,0.068727,0.0,-3.852212,-36.430662,-2.37175,-3.352318,-29.982779,-1.958605
2,Lasso,GridSearchCV,0.050415,0.0002,-5.244676,-59.182797,-3.328131,-4.69381,-48.241963,-2.897592
3,Lasso,RandomizedSearchCV,0.033448,0.0,-5.244676,-59.182797,-3.328131,-4.69381,-48.241963,-2.897592
4,Ridge,GridSearchCV,0.581108,0.0,-3.892439,-32.936868,-2.891887,-3.178637,-21.578798,-2.247513
5,Ridge,RandomizedSearchCV,0.096852,0.003124,-3.892439,-32.936868,-2.891887,-3.178637,-21.578798,-2.247513
6,ElasticNet,GridSearchCV,0.281191,0.003118,-3.890242,-32.51612,-2.797108,-3.222652,-22.025292,-2.230382
7,ElasticNet,RandomizedSearchCV,0.046864,0.0,-4.466425,-42.31833,-3.238854,-3.769753,-30.679706,-2.654402
8,SGDRegressor,GridSearchCV,0.228078,0.003124,-3.709193,-31.594458,-2.499722,-3.232163,-22.594661,-2.24993
9,SGDRegressor,RandomizedSearchCV,0.159337,0.003125,-3.712045,-31.138235,-2.534461,-3.233052,-22.571975,-2.243376


## Wisconsin Breast Cancer

In [22]:
r_wpbc_data_table: pd.DataFrame = apply_models_on_data(X_r_wpbc_data, y_r_wpbc_data, models)
display(r_wpbc_data_table)

Unnamed: 0,Model_name,Search_srategy,fit_time,score_time,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error
0,SVR,GridSearchCV,0.049546,0.001197,-32.241031,-1468.057133,-28.09736,-23.583796,-889.788593,-21.349809
1,SVR,RandomizedSearchCV,0.029698,0.001809,-32.087469,-1444.508609,-28.740919,-24.464151,-931.072517,-22.093609
2,Lasso,GridSearchCV,0.033445,0.00333,-31.634567,-1423.42329,-29.563356,-27.007991,-1043.746477,-24.823834
3,Lasso,RandomizedSearchCV,0.031243,0.0,-31.634567,-1423.42329,-29.563356,-27.007991,-1043.746477,-24.823834
4,Ridge,GridSearchCV,0.474734,0.0,-28.921968,-1206.37708,-27.624639,-23.508781,-814.23909,-21.260202
5,Ridge,RandomizedSearchCV,0.071858,0.003124,-28.921968,-1206.37708,-27.624639,-23.508781,-814.23909,-21.260202
6,ElasticNet,GridSearchCV,0.319182,0.0,-29.230011,-1233.986002,-27.516844,-24.767572,-880.007362,-22.285832
7,ElasticNet,RandomizedSearchCV,0.046864,0.003124,-29.751248,-1249.114374,-28.394692,-25.056995,-896.912825,-23.148582
8,SGDRegressor,GridSearchCV,0.124964,0.003124,-28.844975,-1193.453498,-27.254648,-24.415004,-881.294906,-21.513568
9,SGDRegressor,RandomizedSearchCV,0.090615,0.0,-28.982762,-1204.702427,-27.5033,-24.492466,-886.134102,-21.799197


## Communities and Crime

In [23]:
communities_data_table: pd.DataFrame = apply_models_on_data(X_communities_data, y_communities_data, models)
display(communities_data_table)

Unnamed: 0,Model_name,Search_srategy,fit_time,score_time,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error
0,SVR,GridSearchCV,2.073046,0.021864,-0.096783,-0.019204,-0.067397,-0.09166,-0.016837,-0.067394
1,SVR,RandomizedSearchCV,1.609014,0.021869,-0.096783,-0.019204,-0.067397,-0.09166,-0.016837,-0.067394
2,Lasso,GridSearchCV,0.124965,0.0,-0.178362,-0.054329,-0.151979,-0.178297,-0.054246,-0.155004
3,Lasso,RandomizedSearchCV,0.090612,0.0,-0.178362,-0.054329,-0.151979,-0.178297,-0.054246,-0.155004
4,Ridge,GridSearchCV,1.776364,0.000605,-0.094858,-0.018699,-0.064638,-0.090924,-0.016866,-0.063372
5,Ridge,RandomizedSearchCV,0.330811,0.004121,-0.094632,-0.01872,-0.065038,-0.091368,-0.017162,-0.063287
6,ElasticNet,GridSearchCV,1.554894,0.003723,-0.094379,-0.018774,-0.062791,-0.092482,-0.017815,-0.062251
7,ElasticNet,RandomizedSearchCV,0.282163,0.003723,-0.094366,-0.018787,-0.062444,-0.092734,-0.017918,-0.062167
8,SGDRegressor,GridSearchCV,0.163973,0.000998,-0.098063,-0.019485,-0.067607,-0.097068,-0.018988,-0.067013
9,SGDRegressor,RandomizedSearchCV,0.127485,0.000798,-0.097633,-0.019746,-0.065602,-0.096402,-0.019097,-0.064497


# Generarea rapoartelor

In [24]:
def highlight(data_frame: pd.DataFrame) -> List[str]:
    '''
    Searches the maximum and minumum in a data frame and creates a list with the color of each cell 
    based on the searches. The cells colors are red if they contain a maximum, lime is they contain
    a minimum, transparent otherwise.
    
    :param data_frame: the given data_frame
    
    :return: the list of colors
    '''
    is_max: pd.Series = data_frame == data_frame.max()
    is_min: pd.Series = data_frame == data_frame.min()  
    
    styles: List[str] = ['background-color: lime' if is_min[i] 
              else 'background-color: red' if is_max[i] 
              else '' 
              for i in range(len(data_frame))]
    return styles

In [25]:
def style_data_frame(data_frame: pd.DataFrame):
    """
    Highlights the maximums and minimums in the given dataframe and transforms negative values into positive values,
    changing also the name of the column (removes the "_neg" from its name).
    
    :param data_frame: the dateframe to be styled
    
    :return: the styled dataframe
    """
    for column in data_frame.columns[2:]:
        data_frame[column] = data_frame[column].abs()
        
    for col in data_frame.columns:
        data_frame.rename(columns={col:col.replace("_neg","")}, inplace=True)
        
    return data_frame.style.apply(highlight, subset=data_frame.columns[2:])

In [26]:
def generate_report(data_frame: pd.DataFrame, title: str) -> None:
    """
    Generates a HTML report with the given title and data_frame.
    
    :param dataframe: the data to be written in the report
    :param title: the title of the HMTL page 
    
    :return: Nothing
    """
    html_code: str = "<h1><center>" + title + "</center></h1>" + data_frame.hide_index().render()
    with open("./Reports/" + title + ".html", "w") as file:
        file.write(html_code)

## CPU Computer Hardware

In [27]:
machine_data_styled: pd.io.formats.style.Styler = style_data_frame(machine_data_table)
display(machine_data_styled)

generate_report(machine_data_styled, "CPU Computer Hardware")

Unnamed: 0,Model_name,Search_srategy,fit_time,score_time,test_mean_absolute_error,test_mean_squared_error,test_median_absolute_error,train_mean_absolute_error,train_mean_squared_error,train_median_absolute_error
0,SVR,GridSearchCV,0.052858,0.001597,75.591254,55328.814704,22.793382,65.054634,22537.388389,21.072565
1,SVR,RandomizedSearchCV,0.03012,0.001595,72.801387,26466.247064,23.363063,72.52665,26372.176891,21.924749
2,Lasso,GridSearchCV,0.041564,0.000798,41.382581,6382.713437,25.105926,35.874061,3424.060352,22.963974
3,Lasso,RandomizedSearchCV,0.039094,0.0002,41.382581,6382.713437,25.105926,35.874061,3424.060352,22.963974
4,Ridge,GridSearchCV,0.597796,0.000199,41.756737,6161.458456,24.05009,35.533432,3310.720695,22.962888
5,Ridge,RandomizedSearchCV,0.103053,0.000598,41.741437,6234.030659,23.856883,35.463698,3318.029748,22.755905
6,ElasticNet,GridSearchCV,0.25619,0.0,40.719804,6088.776828,23.53724,35.018865,3663.906375,20.623071
7,ElasticNet,RandomizedSearchCV,0.040621,0.0,41.928784,6241.881147,24.602068,35.548489,3999.476994,20.951418
8,SGDRegressor,GridSearchCV,0.631102,0.0,44.677158,6373.01087,28.934983,37.145343,3279.752912,26.168195
9,SGDRegressor,RandomizedSearchCV,0.51238,0.0,44.802206,6377.647128,28.99212,37.185133,3285.866103,26.184317


## Boston Housing

In [28]:
housing_data_styled: pd.io.formats.style.Styler = style_data_frame(housing_data_table)
display(housing_data_styled)

generate_report(housing_data_styled, "Boston Housing")

Unnamed: 0,Model_name,Search_srategy,fit_time,score_time,test_mean_absolute_error,test_mean_squared_error,test_median_absolute_error,train_mean_absolute_error,train_mean_squared_error,train_median_absolute_error
0,SVR,GridSearchCV,0.103337,0.001197,3.852212,36.430662,2.37175,3.352318,29.982779,1.958605
1,SVR,RandomizedSearchCV,0.068727,0.0,3.852212,36.430662,2.37175,3.352318,29.982779,1.958605
2,Lasso,GridSearchCV,0.050415,0.0002,5.244676,59.182797,3.328131,4.69381,48.241963,2.897592
3,Lasso,RandomizedSearchCV,0.033448,0.0,5.244676,59.182797,3.328131,4.69381,48.241963,2.897592
4,Ridge,GridSearchCV,0.581108,0.0,3.892439,32.936868,2.891887,3.178637,21.578798,2.247513
5,Ridge,RandomizedSearchCV,0.096852,0.003124,3.892439,32.936868,2.891887,3.178637,21.578798,2.247513
6,ElasticNet,GridSearchCV,0.281191,0.003118,3.890242,32.51612,2.797108,3.222652,22.025292,2.230382
7,ElasticNet,RandomizedSearchCV,0.046864,0.0,4.466425,42.31833,3.238854,3.769753,30.679706,2.654402
8,SGDRegressor,GridSearchCV,0.228078,0.003124,3.709193,31.594458,2.499722,3.232163,22.594661,2.24993
9,SGDRegressor,RandomizedSearchCV,0.159337,0.003125,3.712045,31.138235,2.534461,3.233052,22.571975,2.243376


## Wisconsin Breast Cancer

In [29]:
r_wpbc_data_styled: pd.io.formats.style.Styler = style_data_frame(r_wpbc_data_table)
display(r_wpbc_data_styled)

generate_report(r_wpbc_data_styled, "Wisconsin Breast Cancer")

Unnamed: 0,Model_name,Search_srategy,fit_time,score_time,test_mean_absolute_error,test_mean_squared_error,test_median_absolute_error,train_mean_absolute_error,train_mean_squared_error,train_median_absolute_error
0,SVR,GridSearchCV,0.049546,0.001197,32.241031,1468.057133,28.09736,23.583796,889.788593,21.349809
1,SVR,RandomizedSearchCV,0.029698,0.001809,32.087469,1444.508609,28.740919,24.464151,931.072517,22.093609
2,Lasso,GridSearchCV,0.033445,0.00333,31.634567,1423.42329,29.563356,27.007991,1043.746477,24.823834
3,Lasso,RandomizedSearchCV,0.031243,0.0,31.634567,1423.42329,29.563356,27.007991,1043.746477,24.823834
4,Ridge,GridSearchCV,0.474734,0.0,28.921968,1206.37708,27.624639,23.508781,814.23909,21.260202
5,Ridge,RandomizedSearchCV,0.071858,0.003124,28.921968,1206.37708,27.624639,23.508781,814.23909,21.260202
6,ElasticNet,GridSearchCV,0.319182,0.0,29.230011,1233.986002,27.516844,24.767572,880.007362,22.285832
7,ElasticNet,RandomizedSearchCV,0.046864,0.003124,29.751248,1249.114374,28.394692,25.056995,896.912825,23.148582
8,SGDRegressor,GridSearchCV,0.124964,0.003124,28.844975,1193.453498,27.254648,24.415004,881.294906,21.513568
9,SGDRegressor,RandomizedSearchCV,0.090615,0.0,28.982762,1204.702427,27.5033,24.492466,886.134102,21.799197


## Communities and Crime

In [30]:
communities_data_styled: pd.io.formats.style.Styler = style_data_frame(communities_data_table)
display(communities_data_styled)

generate_report(communities_data_styled, "Communities and Crime")

Unnamed: 0,Model_name,Search_srategy,fit_time,score_time,test_mean_absolute_error,test_mean_squared_error,test_median_absolute_error,train_mean_absolute_error,train_mean_squared_error,train_median_absolute_error
0,SVR,GridSearchCV,2.073046,0.021864,0.096783,0.019204,0.067397,0.09166,0.016837,0.067394
1,SVR,RandomizedSearchCV,1.609014,0.021869,0.096783,0.019204,0.067397,0.09166,0.016837,0.067394
2,Lasso,GridSearchCV,0.124965,0.0,0.178362,0.054329,0.151979,0.178297,0.054246,0.155004
3,Lasso,RandomizedSearchCV,0.090612,0.0,0.178362,0.054329,0.151979,0.178297,0.054246,0.155004
4,Ridge,GridSearchCV,1.776364,0.000605,0.094858,0.018699,0.064638,0.090924,0.016866,0.063372
5,Ridge,RandomizedSearchCV,0.330811,0.004121,0.094632,0.01872,0.065038,0.091368,0.017162,0.063287
6,ElasticNet,GridSearchCV,1.554894,0.003723,0.094379,0.018774,0.062791,0.092482,0.017815,0.062251
7,ElasticNet,RandomizedSearchCV,0.282163,0.003723,0.094366,0.018787,0.062444,0.092734,0.017918,0.062167
8,SGDRegressor,GridSearchCV,0.163973,0.000998,0.098063,0.019485,0.067607,0.097068,0.018988,0.067013
9,SGDRegressor,RandomizedSearchCV,0.127485,0.000798,0.097633,0.019746,0.065602,0.096402,0.019097,0.064497


# Documentarea modelelor

## Epsilon-Support Vector Regression

class `sklearn.svm.SVR` <i>(*, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1)</i>

Scopul lui ε-SVR este de a estima o funcție, fiind constrâns ca estimarea de către funcție a fiecărei intrări să aibă o deviație de cel mult ε față de intrare, deci va penaliza predicțiile care sunt mai depărtate de ε decât rezultatul așteptat. În cazul unei funcții liniare $f$, aceasta va avea forma:

$$\large y = w^\top X + b$$

ε-SVR găsește o aproximație a funcției $f$ prin identificarea unui "tub insensibil la epsilon cât mai plat", cu alte cuvinte, caută ponderi cât mai mici. În final, funcția care va aproxima $f $va avea următoarea formă: 

$$\large \min_w \frac{1}{2} \left \lVert w \right \rVert^2, \text{unde} 
     \begin{cases}
       y_i - w^\top X_i - b \leq \epsilon \\
       w^\top X_i + b - y_i \leq \epsilon \\ 
     \end{cases}$$
     
<img src = "./images/svr.png"></img>

În domeniul învățării automate, un algoritm de regresie a vectorilor de suport poate fi, în unele cazuri, mai potrivit pentru probleme de regresie decât alți algoritmi obișnuiți și populari. Mai jos sunt cazurile în care o regresie a vectorului suport este avantajoasă față de alți algoritmi de regresie:
<ol>
<li>SVM este eficient din punct de vedere al memoriei, ceea ce înseamnă că este nevoie de o cantitate relativ mai mică de resurse de calcul pentru a instrui modelul. Acest lucru se datorează faptului că prezentarea soluției prin intermediul unui subset mic de puncte de instruire oferă avantaje de calcul enorme.</li>
<li>Există relații neliniare sau complexe între caracteristici și etichete. Acest lucru se datorează faptului că avem opțiunea de a converti relațiile neliniare la probleme cu dimensiuni superioare în cazul regresiei vectorului suport.</li>
</ol>

Există trei implementări diferite ale regresiei vectoriale de suport: SVR, NuSVR și LinearSVR. LinearSVR oferă o implementare mai rapidă decât SVR, dar are în vedere doar nucleul liniar, în timp ce NuSVR implementează o formulare ușor diferită de SVR și LinearSVR. 

### Bibliografie
    
#### https://en.wikipedia.org/wiki/Support-vector_machine#Regression)
#### https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR
#### https://heartbeat.fritz.ai/support-vector-regression-in-python-using-scikit-learn-89cc18e933b7

## Lasso

class `sklearn.linear_model.Lasso` <i>(alpha=1.0, *, fit_intercept=True, normalize=False, precompute=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')</i> 

<b>LASSO</b> înseamnă „Operator de selecție și contracție cel mai puțin absolut”.

Regresia lazo este o extensie a regresiei liniare care adaugă o penalizare de regularizare funcției de pierdere în timpul antrenamentului. Efectuează regularizarea L1, adică adaugă penalizare echivalentă cu valoarea absolută a magnitudinii coeficienților.

Obiectivul de optimizare pentru Lasso este:
     
$$\large \frac{1}{2 \cdot \text{number of samples}} \cdot \left\lVert y - Xw \right\rVert^2 + \alpha \cdot \left\lVert w \right \rVert_1 $$

Lasso  este  un  model  liniar  care  estimează  coeficienții  rari.  Este  util  în  unele  contexte  datorită  tendinței  sale  de  a  prefera  soluții  cu  mai  puțini  coeficienți  diferiți  de  zero,  reducând  efectiv  numărul  de  caracteristici  de  care  depinde  soluția  dată.  Din  acest  motiv,  Lasso  și  variantele  sale  sunt  fundamentale  pentru  câmpul  de  detectare  comprimat.  În  anumite  condiții,  poate  recupera  setul  exact  de  coeficienți  diferiți  de  zero.

Regresia  lasso  este  un  tip  de  regresie  liniară  care  folosește  contracția.
Reducerea  este  locul  în  care  valorile  datelor  sunt  micșorate  către  un  punct  central,  cum  ar  fi  media.

Procedura  lasso  încurajează  modele  simple,  rare  (adică  modele  cu  mai  puțini  parametri).  Acest  tip  special  de  regresie  este  potrivit  pentru  modelele  care  prezintă  niveluri  ridicate  de  muticoliniaritate  sau  când  doriți  să  automatizați  anumite  părți  ale  selecției  modelului,  cum  ar  fi  selecția  variabilă  /  eliminarea  parametrilor.


<img src = "./images/lasso.png"></img>


### Bibliografie
    
#### https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html
#### https://www.statisticshowto.com/lasso-regression/

## Ridge

class `sklearn.linear_model.Ridge` <i>(alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto', random_state=None)</i>

Acest model rezolvă un model de regresie în care funcția de pierdere este funcția liniară a celor mai mici pătrate și regularizarea este dată de norma l2. Cunoscută și sub denumirea de Ridge Regression sau Tikhonov regularization. 
Acest estimator are suport încorporat pentru regresia multi-variabilă (adică, când y este o matrice 2d de formă (n_samples, n_targets)).

Regresia Ridge este o metodă de estimare a coeficienților modelelor cu regresie multiplă în scenarii în care variabilele independente sunt puternic corelate.

Regresia Ridge a fost dezvoltată ca o posibilă soluție la imprecizia estimatorilor cu cel mai mic pătrat atunci când modelele de regresie liniară au unele variabile independente multicoliniare (foarte corelate) - prin crearea unui estimator de regresie a crestei (RR). Aceasta oferă o estimare mai precisă a parametrilor de creastă, deoarece varianța și estimatorul pătrat mediu sunt adesea mai mici decât estimatorii cei mai puțin pătrați derivați anterior.

<img src = "./images/ridge.png"></img>
<img src = "./images/rl.png"></img>

### Bibliografie

#### https://scikit-learn.org/stable/modules/linear_model.html#ridge-regression
#### https://en.wikipedia.org/wiki/Ridge_regression


## ElasticNet

class `sklearn.linear_model.ElasticNet` <i>(alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize=False, precompute=False, max_iter=1000, copy_X=True, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')</i>

În anul 2005, Hui Zou și Trevor Hastie au creat modelul de regresie ElasticNet pentru a-l îmbunătăți pe cel de Lasso. Estimatorul Lasso funcționează cel mai bine atunci când setul de date conține multe trăsături inutile, acestea din urmă fiind eliminate, simplificând funcția obiectiv. În schimb, estimatorul Ridge este deseori utilizat atunci când majoritatea trăsăturilor sunt utile. În cazul unor seturi de date cu foarte multe trăsături, alegerea unui estimator poate fi o problemă, deoarece nu putem cunoaște toate trăsăturile și nu le putem estima importanța. ElasticNet rezolvă problema, oferind regularizare flexibilă (`l1_ratio` în `sklearn`) și profitând de avantajele fiecărui regresor. 

ElasticNet extinde Lasso adăugându-se încă un termen de penalizare L2 funcției obiectiv, combinând astfel penalizările L1 și L2 ale regresorilor Lasso și Ridge. Practic, Scikit-learn minimizează următoarea funcție:

$$\large \frac{1}{2 \cdot \text{number of samples}} \cdot \left\lVert y - Xw \right\rVert^2_2 + \alpha \cdot \text{L1 ratio} \cdot \left\lVert w \right \rVert_1 + 0.5 \cdot \alpha \cdot (1 - \text{L1 ratio}) \cdot \left\lVert w \right \rVert^2_2,$$

În statistici și, particular, în potrivirea modelelor de regresie liniară sau logistică, elastic net este o metodă de regresie regularizată care combină liniar penalitățile L1 și L2 ale metodelor Lasso și Ridge. Insa metoda elastic net depaseste limitele metodei Lasso (cel mai mic operator de contractie si selectie absoluta), care utilizeaza o functie de penalizare.

Utilizarea acestei funcții de penalizare are mai multe limitări:

De exemplu, în cazul „p mare, mic n” (date cu dimensiuni ridicate, cu câteva exemple), LASSO selectează cel mult n variabile înainte de a se satura. De asemenea, dacă există un grup de variabile foarte corelate, atunci LASSO tinde să selecteze o variabilă dintr-un grup și să le ignore pe celelalte. Pentru a depăși aceste limitări, elastic net adaugă o parte pătratică la penalizare, care atunci când este utilizată singură este o regresie Ridge.

<img src = "./images/lren.png"></img>

### Bibliografie
 
#### https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
#### https://en.wikipedia.org/wiki/Elastic_net_regularization


## SGDRegressor

class `sklearn.linear_model.SGDRegressor` <i>(loss='squared_loss', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=False, average=False)</i>
SGD înseamnă Stochastic Gradient Descent: gradientul pierderii este estimat la fiecare eșantion la un moment dat și modelul este actualizat pe parcurs cu un program de rezistență descrescător (alias rata de învățare).

Pașii algoritmului sunt: 
<ol>
<li>Găsiți panta funcției obiective în raport cu fiecare parametru / caracteristică. Cu alte cuvinte, calculați gradientul funcției.</li>
<li>Alegeți o valoare inițială aleatorie pentru parametri. (Pentru a clarifica, în exemplul parabolei, diferențiați „y” față de „x”. Dacă am avea mai multe caracteristici precum x1, x2 etc., luăm derivata parțială a „y” față de fiecare dintre caracteristici.)</li>
<li>Actualizați funcția de gradient conectând valorile parametrilor.</li>
<li>Calculați dimensiunile pașilor pentru fiecare caracteristică astfel: mărimea pasului = gradient * rata de învățare.</li>
<li>Calculați noii parametri ca: parametri noi = parametri vechi - dimensiunea pasului</li>
<li>Repetați pașii de la 3 la 5 până când gradientul este aproape de 0.</li>
</ol>

SGDRegressor este model liniar adaptat prin minimizarea unei pierderi empirice regularizate cu SGD.

Regularizatorul este o penalizare adăugată funcției de pierdere care micșorează parametrii modelului către vectorul zero folosind fie norma euclidiană pătrată L2, fie norma absolută L1 sau o combinație a ambelor (Elastic Net). Dacă actualizarea parametrului depășește valoarea 0,0 din cauza regulatorului, actualizarea este trunchiată la 0,0 pentru a permite învățarea modelelor rare și pentru a realiza selectarea caracteristicilor online. 

![SegmentLocal](images/sgd.gif "segment")

### Bibliografie

#### https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html
#### https://www.datatechnotes.com/2020/09/regression-example-with-sgdregressor-in-python.html
