Folositi urmatoarele seturi de date:
1. [CPU Computer Hardware](https://archive.ics.uci.edu/ml/datasets/Computer+Hardware); excludeti din dataset coloanele: vendor name, model name, estimated relative performance; se va estima coloana "published relative performance".
1. [Boston Housing](http://archive.ics.uci.edu/ml/machine-learning-databases/housing/)
1. [Wisconsin Breast Cancer](http://www.dcc.fc.up.pt/~ltorgo/Regression/DataSets.html); cautati in panelul din stanga Wisconsin Breast Cancer si urmati pasii din "My personal Notes"
1. [Communities and Crime](http://archive.ics.uci.edu/ml/datasets/communities+and+crime); stergeti primele 5 dimensiuni si trasaturile cu missing values.

In [258]:
import numpy as np
print(f'NumPy version: {np.__version__}')

import pandas as pd
print(f'Pandas version: {pd.__version__}')

import sklearn as sk
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
print(f'Sklearn version: {sk.__version__}')

#NumPy version: 1.19.2
#Pandas version: 1.2.3
#Sklearn version: 0.24.1

NumPy version: 1.20.1
Pandas version: 1.2.3
Sklearn version: 0.24.1


In [259]:
machine: pd.core.frame.DataFrame = pd.read_csv("Data/machine.data", header=None)
x_machine: np.array = machine.iloc[:, 2:8].values
y_machine: np.array = machine.iloc[:, 8].values
y_machine = y_machine.reshape(y_machine.shape[0])

In [260]:
housing: pd.core.frame.DataFrame = pd.read_csv("Data/housing.data", delim_whitespace=True, header=None)
x_housing: np.array = housing.iloc[:, :-1].values
y_housing: np.array = housing.iloc[:, -1].values
y_housing = y_housing.reshape(y_housing.shape[0])

In [261]:
r_wpbc: pd.core.frame.DataFrame = pd.read_csv("Data/r_wpbc.data", header=None)
x_r_wpbc: np.array = r_wpbc.iloc[:, 1:].values
y_r_wpbc: np.array = r_wpbc.iloc[:, 1].values
y_r_wpbc = y_r_wpbc.reshape(y_r_wpbc.shape[0])

In [262]:
communities: pd.core.frame.DataFrame = pd.read_csv("Data/communities.data", header=None)
communities = communities.replace('?', np.nan)    
x_communities: np.array = communities.iloc[:, 1:].drop([3], axis=1).values
y_communities: np.array = communities.iloc[:, 0].values
y_communities = y_communities.reshape(y_communities.shape[0])

imp = SimpleImputer(missing_values = np.nan, strategy="median")
x_communities = imp.fit_transform(x_communities)

In [263]:
def show_metrics_regression(reg, parameter_grid: dict, x: np.array, y: np.array) -> None:
    """
    Shows the metrics(accuracy, precision, f1, recall) of a classifier.
    
    Args:
        clf: a classifier
        x: np.array containning the dataset information
        y: np.array containning the classification of the data
        
    Returns:
        None
    """
    gridsrc = GridSearchCV(estimator=reg, 
            param_grid=parameter_grid, cv=3, return_train_score=True)
    randsrc=RandomizedSearchCV(estimator=reg, param_distributions=parameter_grid, n_iter=15, return_train_score=True)
    scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error']
    scores1 = cross_validate(gridsrc, x, y, cv=5, scoring=scoring, return_train_score=True)
    scores2 = cross_validate(randsrc, x, y, cv=5, scoring=scoring, return_train_score=True)
    df1 = pd.DataFrame(data={'train_mean_absolute_error': scores1['train_neg_mean_absolute_error'],
                            'train_mean_squared_error': scores1['train_neg_mean_squared_error'],
                            'train_median_absolute_error': scores1['train_neg_median_absolute_error'],                           
                            'test_mean_absolute_error': scores1['test_neg_mean_absolute_error'],
                            'test_mean_squared_error': scores1['test_neg_mean_squared_error'],
                            'test_median_absolute_error':scores1['test_neg_median_absolute_error']                          
                           })
    df2 = pd.DataFrame(data={'train_mean_absolute_error': scores2['train_neg_mean_absolute_error'],
                            'train_mean_squared_error': scores2['train_neg_mean_squared_error'],
                            'train_median_absolute_error': scores2['train_neg_median_absolute_error'],                           
                            'test_mean_absolute_error':scores2['test_neg_mean_absolute_error'],
                            'test_mean_squared_error':scores2['test_neg_mean_squared_error'],
                            'test_median_absolute_error': scores2['test_neg_median_absolute_error']                           
                           })
    #with pd.option_context('expand_frame_repr', False):
    return pd.concat([df1.mean(),df2.mean()])

In [264]:
parameter_grid_SGDRegressor = {'max_iter':[10000],
            'loss': ['squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive'],
            'penalty' : ['l1', 'l2'],
            'alpha' : [0.001, 0.01, 0.1, 1]}
parameter_grid_RandomForestRegressor = {
    'max_depth': [80, 90, 100],
    'min_samples_split': [8, 10, 12,14],
    'n_estimators': [100, 200],
    'n_jobs':[-1]
}

In [265]:
print("\nMachine_Dataset:")
print("Metrics for SGDRegressor:")
display(show_metrics_regression(SGDRegressor(), parameter_grid_SGDRegressor, x_machine, y_machine))
print("Metrics for RandomForestRegressor:")
display(show_metrics_regression(RandomForestRegressor(), parameter_grid_RandomForestRegressor, x_machine, y_machine))
# print("\nWPBC_Dataset:")
# show_metrics_regression(SGDRegressor(), parameter_grid_SGDRegressor, x_r_wpbc, y_r_wpbc);
# print("\nHousing_Dataset:")
# show_metrics_regression(SGDRegressor(), parameter_grid_SGDRegressor, x_housing, y_housing);
# print("\nCommunities_Dataset:")
# show_metrics_regression(SGDRegressor(), parameter_grid_SGDRegressor, x_communities, y_communities);


Machine_Dataset:
Metrics for SGDRegressor:


train_mean_absolute_error     -7.799862e+03
train_mean_squared_error      -2.245471e+08
train_median_absolute_error   -5.435283e+03
test_mean_absolute_error      -9.493461e+03
test_mean_squared_error       -3.750837e+08
test_median_absolute_error    -6.569526e+03
train_mean_absolute_error     -5.460299e+03
train_mean_squared_error      -1.223301e+08
train_median_absolute_error   -3.714240e+03
test_mean_absolute_error      -5.985722e+03
test_mean_squared_error       -1.650507e+08
test_median_absolute_error    -3.544712e+03
dtype: float64

Metrics for RandomForestRegressor:


KeyboardInterrupt: 

In [None]:
print("Metrics for RandomForestRegressor:")

print("\nMachine_Dataset:")
show_metrics_regression(RandomForestRegressor(), parameter_grid_RandomForestRegressor, x_machine, y_machine);
print("\nWPBC_Dataset:")
show_metrics_regression(RandomForestRegressor(), parameter_grid_RandomForestRegressor, x_r_wpbc, y_r_wpbc);
print("\nHousing_Dataset:")
show_metrics_regression(RandomForestRegressor(), parameter_grid_RandomForestRegressor, x_housing, y_housing);
print("\nCommunities_Dataset:")
show_metrics_regression(RandomForestRegressor(), parameter_grid_RandomForestRegressor, x_communities, y_communities);