Folositi urmatoarele seturi de date:
1. [CPU Computer Hardware](https://archive.ics.uci.edu/ml/datasets/Computer+Hardware); excludeti din dataset coloanele: vendor name, model name, estimated relative performance; se va estima coloana "published relative performance".
1. [Boston Housing](http://archive.ics.uci.edu/ml/machine-learning-databases/housing/)
1. [Wisconsin Breast Cancer](http://www.dcc.fc.up.pt/~ltorgo/Regression/DataSets.html); cautati in panelul din stanga Wisconsin Breast Cancer si urmati pasii din "My personal Notes"
1. [Communities and Crime](http://archive.ics.uci.edu/ml/datasets/communities+and+crime); stergeti primele 5 dimensiuni si trasaturile cu missing values.

In [274]:
import numpy as np
print(f'NumPy version: {np.__version__}')

import pandas as pd
print(f'Pandas version: {pd.__version__}')

import sklearn as sk
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
print(f'Sklearn version: {sk.__version__}')

#NumPy version: 1.19.2
#Pandas version: 1.2.3
#Sklearn version: 0.24.1

NumPy version: 1.20.1
Pandas version: 1.2.3
Sklearn version: 0.24.1


In [275]:
machine: pd.core.frame.DataFrame = pd.read_csv("Data/machine.data", header=None)
x_machine: np.array = machine.iloc[:, 2:8].values
y_machine: np.array = machine.iloc[:, 8].values
y_machine = y_machine.reshape(y_machine.shape[0])

In [276]:
housing: pd.core.frame.DataFrame = pd.read_csv("Data/housing.data", delim_whitespace=True, header=None)
x_housing: np.array = housing.iloc[:, :-1].values
y_housing: np.array = housing.iloc[:, -1].values
y_housing = y_housing.reshape(y_housing.shape[0])

In [277]:
r_wpbc: pd.core.frame.DataFrame = pd.read_csv("Data/r_wpbc.data", header=None)
x_r_wpbc: np.array = r_wpbc.iloc[:, 1:].values
y_r_wpbc: np.array = r_wpbc.iloc[:, 1].values
y_r_wpbc = y_r_wpbc.reshape(y_r_wpbc.shape[0])

In [278]:
communities: pd.core.frame.DataFrame = pd.read_csv("Data/communities.data", header=None)
communities = communities.replace('?', np.nan)    
x_communities: np.array = communities.iloc[:, 1:].drop([3], axis=1).values
y_communities: np.array = communities.iloc[:, 0].values
y_communities = y_communities.reshape(y_communities.shape[0])

imp = SimpleImputer(missing_values = np.nan, strategy="median")
x_communities = imp.fit_transform(x_communities)

In [279]:
def show_metrics_regression(reg, parameter_grid: dict, x: np.array, y: np.array) -> None:
    """
    Shows the metrics(accuracy, precision, f1, recall) of a classifier.
    
    Args:
        clf: a classifier
        x: np.array containning the dataset information
        y: np.array containning the classification of the data
        
    Returns:
        None
    """
    src = GridSearchCV(estimator = reg, 
            param_grid=parameter_grid, cv=3, return_train_score=True)
    
    scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error']
    scores = cross_validate(src, x, y, cv=5, scoring=scoring, return_train_score=True)
    df = pd.DataFrame(data={'train_mean_absolute_error': scores['train_neg_mean_absolute_error'],
                            'train_mean_squared_error': scores['train_neg_mean_squared_error'],
                            'train_median_absolute_error': scores['train_neg_median_absolute_error'],                           
                            'test_mean_absolute_error': scores['test_neg_mean_absolute_error'],
                            'test_mean_squared_error': scores['test_neg_mean_squared_error'],
                            'test_median_absolute_error': scores['test_neg_median_absolute_error'],                           
                           })
    df.loc['mean'] = df.mean()
    #with pd.option_context('expand_frame_repr', False):
    print (df)

In [280]:
print("Metrics for SGDRegressor:")
parameter_grid_SGDRegressor = {'max_iter':[2000],
            'loss': ['squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive']}
print("\nMachine_Dataset:")
show_metrics_regression(SGDRegressor(), parameter_grid_SGDRegressor, x_machine, y_machine);
print("\nWPBC_Dataset:")
show_metrics_regression(SGDRegressor(), parameter_grid_SGDRegressor, x_r_wpbc, y_r_wpbc);
print("\nHousing_Dataset:")
show_metrics_regression(SGDRegressor(), parameter_grid_SGDRegressor, x_housing, y_housing);
print("\nCommunities_Dataset:")
show_metrics_regression(SGDRegressor(), parameter_grid_SGDRegressor, x_communities, y_communities);

Metrics for SGDRegressor:

Machine_Dataset:
      train_mean_absolute_error  train_mean_squared_error  \
0                 -18743.622230             -6.552600e+08   
1                  -9006.914201             -1.625023e+08   
2                 -36259.311146             -2.662193e+09   
3                  -6639.252161             -9.938287e+07   
4                  -5518.129658             -5.745560e+07   
mean              -15233.445879             -7.273587e+08   

      train_median_absolute_error  test_mean_absolute_error  \
0                   -13425.696922             -23656.240652   
1                    -5603.727098              -5289.994570   
2                   -24031.548401             -31413.671054   
3                    -4563.821870              -7610.564811   
4                    -3991.880113              -8170.894850   
mean                -10323.334881             -15228.273187   

      test_mean_squared_error  test_median_absolute_error  
0               -1.337825e