Folosiți următoarele seturi de date:
1. [CPU Computer Hardware](https://archive.ics.uci.edu/ml/datasets/Computer+Hardware); excludeți din dataset coloanele: vendor name, model name, estimated relative performance; se va estima coloana "published relative performance".
1. [Boston Housing](http://archive.ics.uci.edu/ml/machine-learning-databases/housing/)
1. [Wisconsin Breast Cancer](http://www.dcc.fc.up.pt/~ltorgo/Regression/DataSets.html); căutați în panoul din stânga Wisconsin Breast Cancer și urmați pașii din "My personal Notes".
1. [Communities and Crime](http://archive.ics.uci.edu/ml/datasets/communities+and+crime); ștergeți primele 5 dimensiuni și trăsăturile cu missing values.

In [786]:
# Import the numerical / ML stack used by the notebook and print each
# library's version so that a run can be tied to a concrete environment.
import numpy as np
print(f'NumPy version: {np.__version__}')

import pandas as pd
print(f'Pandas version: {pd.__version__}')

# Regressors, hyper-parameter search helpers and the imputer from scikit-learn.
import sklearn as sk
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
print(f'Sklearn version: {sk.__version__}')

# Versions noted by the author (presumably from an earlier run -- TODO confirm;
# the captured cell output reports NumPy 1.20.1 instead of 1.19.2):
#NumPy version: 1.19.2
#Pandas version: 1.2.3
#Sklearn version: 0.24.1

NumPy version: 1.20.1
Pandas version: 1.2.3
Sklearn version: 0.24.1


In [787]:
# CPU Computer Hardware data, read without a header row.
# Columns 2..7 are used as features and column 8 as the regression target;
# columns 0, 1 and 9 are left out (the task statement excludes vendor name,
# model name and estimated relative performance).
machine: pd.DataFrame = pd.read_csv("Data/machine.data", header=None)
x_machine: np.ndarray = machine.iloc[:, 2:8].to_numpy()
# A single selected column is already one-dimensional: shape (n_samples,).
y_machine: np.ndarray = machine.iloc[:, 8].to_numpy()

In [788]:
# Boston Housing data: whitespace-separated values, no header row.
# `sep=r"\s+"` is the documented equivalent of `delim_whitespace=True`, which
# is deprecated in recent pandas releases; the regex separator works on both
# old and new versions.
housing: pd.core.frame.DataFrame = pd.read_csv("Data/housing.data", sep=r"\s+", header=None)
# Every column except the last one is a feature; the last column is the target.
x_housing: np.ndarray = housing.iloc[:, :-1].values
# Selecting one column yields a 1-D array, so no reshape is needed.
y_housing: np.ndarray = housing.iloc[:, -1].values

In [789]:
# Wisconsin (prognostic) Breast Cancer data, read without a header row.
r_wpbc: pd.core.frame.DataFrame = pd.read_csv("Data/r_wpbc.data", header=None)
# Column 1 is the regression target.
y_r_wpbc: np.ndarray = r_wpbc.iloc[:, 1].values
# Features start at column 2. The previous slice `1:` also contained column 1,
# i.e. the target itself, which leaked the answer into the feature matrix and
# made every reported error meaningless.
# NOTE(review): column 0 is skipped as before -- presumably an ID/outcome
# column; confirm against the layout of Data/r_wpbc.data.
x_r_wpbc: np.ndarray = r_wpbc.iloc[:, 2:].values

In [790]:
# Communities and Crime data, read without a header row. Missing values are
# written as '?' in the file; parsing them as NaN at read time keeps the
# numeric columns numeric instead of turning them into object columns.
communities: pd.core.frame.DataFrame = pd.read_csv(
    "Data/communities.data", header=None, na_values="?"
)

# Follow the task statement: drop the first 5 (non-predictive) columns and
# every feature that contains missing values. The previous version kept
# columns 1, 2 and 4 as features, imputed the gaps with the median and used
# column 0 as the target.
# NOTE(review): assumes the unmodified UCI file, where the goal attribute is
# the last column -- confirm against Data/communities.data.
x_communities: np.ndarray = communities.iloc[:, 5:-1].dropna(axis=1).values
y_communities: np.ndarray = communities.iloc[:, -1].values

In [791]:
def show_metrics_regression(reg, parameters: dict, x: np.ndarray, y: np.ndarray) -> pd.core.frame.DataFrame:
    """
    Evaluate a regressor with two hyper-parameter search strategies.

    The regressor is wrapped once in a GridSearchCV (3 inner folds) and once
    in a RandomizedSearchCV (15 sampled candidates, default inner folds).
    Each wrapped search is then scored with an outer 5-fold cross_validate on
    the negated mean absolute, mean squared and median absolute errors.

    Args:
        reg: the regressor to tune and evaluate
        parameters: dictionary mapping hyper-parameter names to candidate values
        x: np.ndarray with the feature matrix
        y: np.ndarray with the regression targets

    Returns:
        a pandas dataframe with one row per search strategy holding the
        fold-averaged train and test scores, preceded by the columns
        'Model_name' and 'Search_strategy'
    """
    metric_names = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error']
    # Result columns: all train metrics first, then all test metrics.
    score_columns = [f'{split}_{metric}' for split in ('train', 'test') for metric in metric_names]

    # Both searches are created up front, keyed by the label used in the result.
    searches = {
        'GridSearchCV': GridSearchCV(
            estimator=reg, param_grid=parameters, cv=3, n_jobs=-1, return_train_score=True),
        'RandomizedSearchCV': RandomizedSearchCV(
            estimator=reg, param_distributions=parameters, n_iter=15, n_jobs=-1, return_train_score=True),
    }

    averaged_rows = []
    for search in searches.values():
        fold_scores = cross_validate(search, x, y, cv=5, scoring=metric_names, return_train_score=True)
        per_fold = pd.DataFrame(data={column: fold_scores[column] for column in score_columns})
        # Collapse the per-fold scores into a single row of means.
        averaged_rows.append(per_fold.mean())

    summary = pd.DataFrame(averaged_rows)
    summary.insert(0, 'Model_name', [reg] * len(averaged_rows))
    summary.insert(1, 'Search_strategy', list(searches))
    return summary

In [792]:
# Hyper-parameter search spaces, one dictionary per regressor. Every key is a
# constructor argument of the corresponding estimator and every value is the
# list of candidates explored by the searches.

# Linear model trained with stochastic gradient descent.
parameters_SGDRegressor: dict = {
    'max_iter': [10000],
    'loss': [
        'squared_loss',
        'huber',
        'epsilon_insensitive',
        'squared_epsilon_insensitive',
    ],
    'penalty': ['l1', 'l2'],
    'alpha': [0.001, 0.01, 0.1, 1],
}

# Random forest; 'n_jobs' has a single candidate, so it is fixed at -1.
parameters_RandomForestRegressor: dict = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [10, 50, 100],
    'n_jobs': [-1],
}

# L1-regularised linear regression.
parameters_Lasso: dict = {
    'alpha': [0.01, 0.1, 1],
    'tol': [0.0001, 0.001, 0.01, 0.1],
    'selection': ['cyclic', 'random'],
}

# Multi-layer perceptron.
parameters_MLPRegressor: dict = {
    'max_iter': [10000],
    'solver': ['sgd', 'adam', 'lbfgs'],
    'alpha': [0.001, 0.01, 0.1, 1],
    'tol': [0.0001, 0.001, 0.01],
}

# L2-regularised linear regression.
parameters_Ridge: dict = {
    'alpha': [0.001, 0.01, 0.1, 1],
    'fit_intercept': [True, False],
    'solver': ['svd', 'lsqr', 'sag', 'saga'],
}

In [793]:
def dataset_regression(name:str, x:np.ndarray, y:np.ndarray):
    """
    Benchmark every configured regressor on one dataset.

    Args:
        name: label stored as the name of the result's column index
        x: np.ndarray with the feature matrix
        y: np.ndarray with the regression targets

    Returns:
        a pandas dataframe stacking the rows produced by
        show_metrics_regression for each regressor, in the order
        SGDRegressor, RandomForestRegressor, Lasso, MLPRegressor, Ridge
    """
    # Each estimator class is paired with its hyper-parameter search space.
    benchmark_plan = (
        (SGDRegressor, parameters_SGDRegressor),
        (RandomForestRegressor, parameters_RandomForestRegressor),
        (Lasso, parameters_Lasso),
        (MLPRegressor, parameters_MLPRegressor),
        (Ridge, parameters_Ridge),
    )

    # A fresh, default-constructed estimator is created right before its run.
    per_model = [
        show_metrics_regression(estimator_cls(), search_space, x, y)
        for estimator_cls, search_space in benchmark_plan
    ]

    combined = pd.concat(per_model, axis=0, ignore_index=True)
    combined.columns.name = name
    return combined

In [794]:
def highlight_max(s)->list:
    """
    Build per-cell CSS that highlights the extremes of a column.

    Cells equal to the maximum are coloured red (#ff6666) and cells equal to
    the minimum are coloured green (#bdfcc2); all other cells get an empty
    style. When the maximum and the minimum coincide the cell is red, because
    the maximum is checked first.

    Args:
        s: the values of one column (anything offering max(), min() and
           iteration, e.g. a pandas Series)

    Returns:
        a list with one CSS string per value of s, in the same order
    """
    max_val:float = s.max()
    min_val:float = s.min()
    # The former `and type(v)` operand was always truthy and has been removed.
    return ['background-color: #ff6666' if v == max_val
            else 'background-color: #bdfcc2' if v == min_val
            else ''
            for v in s]

def finishing(df:pd.core.frame.DataFrame)->"pd.io.formats.style.Styler":
    """
    Turn the negated scores into positive errors and highlight the extremes.

    The dataframe is modified in place: every column from the third one
    onwards is replaced by its absolute value and the columns are renamed
    without the 'neg_' marker. The name of the column index is kept.

    Args:
        df: a dataframe with exactly 8 columns: the model, the search strategy
            and six score columns

    Returns:
        a Styler over df in which, for every score column, the maximum is
        highlighted in red and the minimum in green
    """
    df.iloc[:, 2:] = df.iloc[:, 2:].abs()
    # Assigning a plain list would reset the name of the column index; build
    # an explicit Index so the existing name survives the renaming.
    df.columns = pd.Index(
        ['Model_name', 'Search_strategy',
         'train_mean_absolute_error', 'train_mean_squared_error',
         'train_median_absolute_error', 'test_mean_absolute_error',
         'test_mean_squared_error', 'test_median_absolute_error'],
        name=df.columns.name,
    )

    return df.style.apply(highlight_max, subset=df.columns[2:])

In [None]:
# Benchmark all regressors on the CPU hardware data. The first display shows
# the scores as returned (negated errors); finishing() then converts them to
# positive errors in place and returns the highlighted table.
df_machine: pd.DataFrame = dataset_regression(name='Machine_Dataset', x=x_machine, y=y_machine)
display(df_machine)
display(finishing(df_machine))

In [None]:
# Benchmark all regressors on the Boston housing data. The first display shows
# the scores as returned (negated errors); finishing() then converts them to
# positive errors in place and returns the highlighted table.
df_housing: pd.DataFrame = dataset_regression(name='Housing_Dataset', x=x_housing, y=y_housing)
display(df_housing)
display(finishing(df_housing))

In [None]:
# Benchmark all regressors on the Wisconsin breast cancer data. The first
# display shows the scores as returned (negated errors); finishing() then
# converts them to positive errors in place and returns the highlighted table.
df_r_wpbc: pd.DataFrame = dataset_regression(name='WPBC_Dataset', x=x_r_wpbc, y=y_r_wpbc)
display(df_r_wpbc)
display(finishing(df_r_wpbc))

In [None]:
# Benchmark all regressors on the Communities and Crime data. The first
# display shows the scores as returned (negated errors); finishing() then
# converts them to positive errors in place and returns the highlighted table.
df_communities: pd.DataFrame = dataset_regression(name='Communities_Dataset', x=x_communities, y=y_communities)
display(df_communities)
display(finishing(df_communities))