In [42]:
# Third-party stack: array/dataframe libraries first, then the scikit-learn pieces.
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Lasso, SGDRegressor
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
)
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder

# Report the library versions the notebook is running against.
print(f'NumPy version: {np.__version__}')
print(f'Pandas version: {pd.__version__}')
print(f'Sklearn version: {sk.__version__}')

#NumPy version: 1.20.1
#Pandas version: 1.2.3
#Sklearn version: 0.24.1

NumPy version: 1.20.1
Pandas version: 1.2.3
Sklearn version: 0.24.1


In [43]:
# Read the tab-separated SMS file; the two column names are supplied explicitly via `names`.
smsdata = pd.read_csv(
    'data/SMSSpamCollection',
    sep='\t',
    names=['label', 'sms'],
)

In [44]:
# Raw message texts and their labels as NumPy arrays, taken from the `smsdata`
# frame loaded above (no `dataset` variable is defined in this notebook).
text_x_sms = smsdata["sms"].values
text_y_sms = smsdata["label"].values

In [45]:
# enc = OneHotEncoder(handle_unknown='ignore')
# enc.fit(text_Y.reshape(-1,1))
# Y = enc.transform(text_Y.reshape(-1,1)).toarray()

In [46]:
# Bag-of-words features: at most 1500 terms, each occurring in at least 5
# messages and in no more than 70% of them.
vect = CountVectorizer(max_features=1500, min_df=5, max_df=0.7)
x_sms = vect.fit_transform(text_x_sms).toarray()

# Encode the target explicitly (1 where the label is "spam", 0 otherwise)
# instead of re-fitting `vect` on the label strings: a re-fit replaces the
# vocabulary learned from the messages, and it only produced a single column as
# a side effect of the min_df/max_df pruning.
# NOTE(review): assumes labels are the lowercase strings "ham"/"spam" - confirm.
# Kept as an (n_samples, 1) int64 column so `y_sms.reshape(-1)` keeps working.
y_sms = (text_y_sms == "spam").astype(np.int64).reshape(-1, 1)

In [47]:
def show_metrics_regression(reg, parameters: dict, x: np.ndarray, y: np.ndarray) -> pd.DataFrame:
    """
    Cross-validates a grid-searched regressor and summarises its error metrics
    ('neg_mean_absolute_error', 'neg_mean_squared_error').

    The regressor is wrapped in a GridSearchCV (3 folds) over `parameters`, and
    that search is scored as a whole with a 5-fold cross_validate.

    Args:
        reg: a regressor (scikit-learn estimator)
        parameters: a dictionary mapping hyperparameter names to the candidate values
        x: np.ndarray containing the features of the dataset
        y: np.ndarray containing the target values

    Returns:
        a one-row pandas dataframe with the columns 'Model_name' (the `reg`
        object), 'Search_strategy' and the train/test scores averaged over the
        cross-validation folds. The scores are the negated errors as reported
        by scikit-learn.
    """
    scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error']

    # Hyperparameter search; no `scoring` is passed here, so candidates are
    # ranked with the estimator's own default score.
    gridsrc = GridSearchCV(estimator=reg,
            param_grid=parameters, cv=3, n_jobs=-1, return_train_score=True)

    # Score the whole search procedure on held-out folds.
    scores = cross_validate(gridsrc, x, y, cv=5, scoring=scoring, return_train_score=True)

    # Train columns first, then test columns, each in `scoring` order.
    columns = [f'{split}_{metric}' for split in ('train', 'test') for metric in scoring]
    per_fold = pd.DataFrame(data={column: scores[column] for column in columns})

    result = pd.DataFrame([per_fold.mean()])
    result.insert(0, 'Model_name', [reg])
    result.insert(1, 'Search_strategy', ['GridSearchCV'])
    return result

In [48]:
# Candidate hyperparameter values searched for each regressor.

# NOTE(review): 'squared_loss' matches the scikit-learn 0.24 API this notebook
# reports running on; confirm the spelling before upgrading scikit-learn.
parameters_SGDRegressor: dict = dict(
    max_iter=[10000],
    loss=['squared_loss', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    penalty=['l1', 'l2'],
    alpha=[0.001, 0.01, 0.1, 1],
)

parameters_RandomForestRegressor: dict = dict(
    max_depth=[None, 10, 20],
    min_samples_split=[8, 10, 12],
    n_estimators=[10, 50, 100],
    n_jobs=[-1],
)

parameters_Lasso: dict = dict(
    alpha=[0.01, 0.1, 1],
    tol=[0.0001, 0.001, 0.01, 0.1],
    selection=['cyclic', 'random'],
)

parameters_MLPRegressor: dict = dict(
    max_iter=[10000],
    solver=['sgd', 'adam'],
    alpha=[0.001, 0.01, 0.1, 1],
    tol=[0.0001, 0.001, 0.01],
)

In [49]:
def dataset_regression(name:str, x:np.ndarray, y:np.ndarray):
    """
    Evaluates every configured regressor on one dataset and stacks the results.

    Args:
        name: label stored as the name of the resulting frame's column index
        x: np.ndarray handed to each evaluation as the features
        y: np.ndarray handed to each evaluation as the targets

    Returns:
        a pandas dataframe concatenating the frames returned by
        show_metrics_regression, in the order SGDRegressor,
        RandomForestRegressor, Lasso, MLPRegressor
    """
    # (estimator class, hyperparameter grid) pairs, evaluated in this order.
    candidates = (
        (SGDRegressor, parameters_SGDRegressor),
        (RandomForestRegressor, parameters_RandomForestRegressor),
        (Lasso, parameters_Lasso),
        (MLPRegressor, parameters_MLPRegressor),
    )

    frames = []
    for estimator_cls, grid in candidates:
        # A fresh, default-configured estimator for every evaluation.
        frames.append(show_metrics_regression(estimator_cls(), grid, x, y))

    summary = pd.concat(frames, axis=0, ignore_index=True)
    summary.columns.name = name
    return summary

In [50]:
def highlight_max(s)->list:
    """
    Builds the cell styles for one column: red background for the maximum,
    green background for the minimum.

    Args:
        s: the column values; must provide max(), min() and iteration
           (e.g. the Series handed over by Styler.apply)

    Returns:
        a list with one CSS string per value of `s`; '' for values that are
        neither the maximum nor the minimum. Ties are all highlighted, and the
        maximum is checked first, so a constant column is entirely red.
    """
    max_style = 'background-color: #ff6666'
    min_style = 'background-color: #bdfcc2'

    max_val = s.max()
    min_val = s.min()

    styles = []
    for v in s:
        if v == max_val:
            styles.append(max_style)
        elif v == min_val:
            styles.append(min_style)
        else:
            styles.append('')
    return styles

def finishing(df:pd.core.frame.DataFrame)->"pd.io.formats.style.Styler":
    """
    Turns a metrics dataframe into a styled table of positive error values.

    The first two columns are kept as they are; every column from the third
    one on is replaced by its absolute value and loses the 'neg_' part of its
    name (e.g. 'train_neg_mean_absolute_error' -> 'train_mean_absolute_error').
    The name of the column index is preserved.

    Args:
        df: a dataframe whose columns from position 2 on are numeric scores;
            it is not modified

    Returns:
        a pandas Styler over the converted dataframe that highlights, per
        metric column, the maximum and the minimum (see highlight_max).
    """
    # Work on a copy so the caller's frame keeps its original (negated) scores.
    table = df.copy()
    table.iloc[:, 2:] = table.iloc[:, 2:].abs()

    # Derive the positive metric names from the existing ones instead of
    # hard-coding them, so the function follows the columns it is given.
    renamed = list(table.columns[:2]) + [
        str(column).replace('neg_', '', 1) for column in table.columns[2:]
    ]
    # Assigning new columns drops the index name, so it is restored explicitly.
    table.columns = pd.Index(renamed, name=df.columns.name)

    return table.style.apply(highlight_max, subset=table.columns[2:])

In [51]:
# Compare all regressors on the SMS data; the label array is flattened to 1-D
# before it is passed as the target.
df_smsdata:pd.core.frame.DataFrame = dataset_regression(
    'SMSSpamCollection_Dataset',
    x_sms,
    y_sms.reshape(-1),
)
display(finishing(df_smsdata))

ValueError: Length of values (1) does not match length of index (2)