Folositi urmatoarele seturi de date:
1. [CPU Computer Hardware](https://archive.ics.uci.edu/ml/datasets/Computer+Hardware); excludeti din dataset coloanele: vendor name, model name, estimated relative performance; se va estima coloana "published relative performance".
1. [Boston Housing](http://archive.ics.uci.edu/ml/machine-learning-databases/housing/)
1. [Wisconsin Breast Cancer](http://www.dcc.fc.up.pt/~ltorgo/Regression/DataSets.html); cautati in panelul din stanga Wisconsin Breast Cancer si urmati pasii din "My personal Notes"
1. [Communities and Crime](http://archive.ics.uci.edu/ml/datasets/communities+and+crime); stergeti primele 5 dimensiuni si trasaturile cu missing values.

In [64]:
import numpy as np
print(f'NumPy version: {np.__version__}')

import pandas as pd
print(f'Pandas version: {pd.__version__}')

import sklearn as sk
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
print(f'Sklearn version: {sk.__version__}')

#NumPy version: 1.19.2
#Pandas version: 1.2.3
#Sklearn version: 0.24.1

NumPy version: 1.19.2
Pandas version: 1.2.3
Sklearn version: 0.24.1


In [65]:
machine: pd.core.frame.DataFrame = pd.read_csv("Data/machine.data", header=None)
x_machine: np.array = machine.iloc[:, 2:8].values
y_machine: np.array = machine.iloc[:, 8].values
y_machine = y_machine.reshape(y_machine.shape[0])

In [66]:
housing: pd.core.frame.DataFrame = pd.read_csv("Data/housing.data", delim_whitespace=True, header=None)
x_housing: np.array = housing.iloc[:, :-1].values
y_housing: np.array = housing.iloc[:, -1].values
y_housing = y_housing.reshape(y_housing.shape[0])

In [67]:
r_wpbc: pd.core.frame.DataFrame = pd.read_csv("Data/r_wpbc.data", header=None)
x_r_wpbc: np.array = r_wpbc.iloc[:, 1:].values
y_r_wpbc: np.array = r_wpbc.iloc[:, 1].values
y_r_wpbc = y_r_wpbc.reshape(y_r_wpbc.shape[0])

In [68]:
communities: pd.core.frame.DataFrame = pd.read_csv("Data/communities.data", header=None)
communities = communities.replace('?', np.nan)    
x_communities: np.array = communities.iloc[:, 1:].drop([3], axis=1).values
y_communities: np.array = communities.iloc[:, 0].values
y_communities = y_communities.reshape(y_communities.shape[0])

imp = SimpleImputer(missing_values = np.nan, strategy="median")
x_communities = imp.fit_transform(x_communities)

In [69]:
def show_metrics_regression(reg, parameter_grid: dict, x: np.array, y: np.array) -> None:
    """
    Shows the metrics(accuracy, precision, f1, recall) of a classifier.
    
    Args:
        clf: a classifier
        x: np.array containning the dataset information
        y: np.array containning the classification of the data
        
    Returns:
        None
    """
    gridsrc = GridSearchCV(estimator=reg, 
            param_grid=parameter_grid, cv=3, return_train_score=True)
    randsrc = RandomizedSearchCV(estimator=reg,
            param_distributions=parameter_grid, n_iter=15, return_train_score=True)
    
    scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error']
    
    scores1 = cross_validate(gridsrc, x, y, cv=5, scoring=scoring, return_train_score=True)
    scores2 = cross_validate(randsrc, x, y, cv=5, scoring=scoring, return_train_score=True)
    
    df1 = pd.DataFrame(data={'train_mean_absolute_error': scores1['train_neg_mean_absolute_error'],
                            'train_mean_squared_error': scores1['train_neg_mean_squared_error'],
                            'train_median_absolute_error': scores1['train_neg_median_absolute_error'],                           
                            'test_mean_absolute_error': scores1['test_neg_mean_absolute_error'],
                            'test_mean_squared_error': scores1['test_neg_mean_squared_error'],
                            'test_median_absolute_error':scores1['test_neg_median_absolute_error']                          
                           })
    df2 = pd.DataFrame(data={'train_mean_absolute_error': scores2['train_neg_mean_absolute_error'],
                            'train_mean_squared_error': scores2['train_neg_mean_squared_error'],
                            'train_median_absolute_error': scores2['train_neg_median_absolute_error'],                           
                            'test_mean_absolute_error':scores2['test_neg_mean_absolute_error'],
                            'test_mean_squared_error':scores2['test_neg_mean_squared_error'],
                            'test_median_absolute_error': scores2['test_neg_median_absolute_error']                           
                           })
    
    result = pd.DataFrame([df1.mean(), df2.mean()])
    result.insert(0, 'Model_name', [reg, reg])
    result.insert(1, 'Search_strategy', ['GridSearchCV', 'RandomizedSearchCV'])
    return result

In [70]:
parameter_grid_SGDRegressor = {
    'max_iter':[10000],
    'loss': ['squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive'],
    'penalty' : ['l1', 'l2'],
    'alpha' : [0.001, 0.01, 0.1, 1]
}
parameter_grid_RandomForestRegressor = {
    'max_depth': [80, 90, 100],
    'min_samples_split': [8, 10, 12, 14],
    'n_estimators': [100, 200],
    'n_jobs':[-1]
}

In [71]:
print("Machine_Dataset:")
df1 = show_metrics_regression(SGDRegressor(), 
        parameter_grid_SGDRegressor, x_machine, y_machine)
df2 = show_metrics_regression(RandomForestRegressor(),
        parameter_grid_RandomForestRegressor, x_machine, y_machine)

df3 = show_metrics_regression(SGDRegressor(), 
    parameter_grid_SGDRegressor, x_machine, y_machine)
df4 = show_metrics_regression(SGDRegressor(), 
    parameter_grid_SGDRegressor, x_machine, y_machine)
df5 = show_metrics_regression(SGDRegressor(), 
    parameter_grid_SGDRegressor, x_machine, y_machine)

display(pd.concat([df1, df2, df3, df4, df5], axis=0, ignore_index=True))

Machine_Dataset:


Unnamed: 0,Model_name,Search_strategy,train_mean_absolute_error,train_mean_squared_error,train_median_absolute_error,test_mean_absolute_error,test_mean_squared_error,test_median_absolute_error
0,SGDRegressor(),GridSearchCV,-4518.439283,-88081730.0,-2938.248058,-4141.332374,-68114610.0,-2812.269716
1,SGDRegressor(),RandomizedSearchCV,-9356.602132,-266752300.0,-6422.02904,-10226.444493,-315716100.0,-7865.90309
2,RandomForestRegressor(),GridSearchCV,-17.521812,-1440.166,-7.435922,-34.104718,-6741.475,-17.328763
3,RandomForestRegressor(),RandomizedSearchCV,-17.638221,-1513.047,-7.073661,-35.403982,-7516.754,-17.988717
4,SGDRegressor(),GridSearchCV,-10833.48471,-825067300.0,-7155.094729,-12780.7359,-777778900.0,-12011.998966
5,SGDRegressor(),RandomizedSearchCV,-10134.751432,-357241100.0,-7024.523878,-11207.842129,-402408900.0,-8367.75441
6,SGDRegressor(),GridSearchCV,-23366.700956,-3148108000.0,-14850.997995,-16537.107597,-860314400.0,-15639.741163
7,SGDRegressor(),RandomizedSearchCV,-12850.226912,-416337500.0,-8411.413911,-12507.089772,-422732800.0,-7597.08103
8,SGDRegressor(),GridSearchCV,-3392.963766,-65375630.0,-2120.748315,-2153.670052,-14889560.0,-1193.394043
9,SGDRegressor(),RandomizedSearchCV,-26149.309344,-2318002000.0,-16966.686531,-23215.508007,-1251650000.0,-17626.718813
