In [1]:
"""
ECO930J : Données massives et apprentissage automatique avec applications en économie
TP 

Preprocess data, compute ML estimators and output the results table.

15 avril 2022

@author: LALS12039506, AGHN21599906, VAUG30119904, UQAM
"""

# Import relevant libraries
import copy 
import numpy  as np
import pandas as pd          
import os
import random
import time
from statistics import mean

# ML techniques
from sklearn.model_selection import train_test_split
from sklearn.impute          import SimpleImputer
from sklearn.compose         import ColumnTransformer
from sklearn.preprocessing   import OneHotEncoder, MinMaxScaler, StandardScaler

from sklearn.linear_model    import Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, \
                                    ElasticNetCV
from sklearn.tree            import DecisionTreeRegressor
from sklearn.ensemble        import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network  import MLPRegressor

from sklearn.metrics         import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV

# Location/path of project
# The project folder is specific to the author's machine. Only switch to it
# when it exists, so that the notebook still runs from its own directory
# elsewhere instead of stopping on a FileNotFoundError.
PROJECT_DIR = ('c:/Users/Poste/Documents/School/Maitrise Economique UQAM/'
               'ECO930J-Données massives et apprentissage automatique avec/TP')
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
else:
    print(f'Project folder not found, staying in {os.getcwd()}')

### Subfolders to create in project ###
paths = ('data/', 'output/')

# exist_ok=True makes the creation idempotent: re-running the cell is harmless.
# 'output/' is created first and 'data/' last, as before.
for newpath in reversed(paths):
    os.makedirs(newpath, exist_ok=True)

#### Following the sklearn documentation, alpha and lambda are reversed in the code compared to the project statement.

## Model definition
#### Here we define two classes that are responsible for data manipulation and for computing ML estimators.

In [2]:
class DataModel:
    """
    Class taking as input a complete dataset and preprocessing it for ML estimators.

    The raw dataset is kept untouched in `raw_data`; every preprocessing step
    works on `data`, a deep copy of it.

    Warnings
    ----------
    The class does no validation for inputs in its initialization or in its methods.
    It also does not ensure that the steps are performed in the correct order.
    There is no exception handling.
    """

    def __init__(self,raw_data, var_names, model_name):
        """
        Initialize an instance of this class.

        Parameters
        ----------
        raw_data   : pandas.DataFrame
                     Complete dataset for manipulation
        var_names  : List(string)
                     Names of the variables in dataset
        model_name : string
                     Name of the dataset
        """
        self.model_name = model_name                # Name of the dataset
        self.var_names  = var_names                 # Names of the variables

        self.raw_data   = raw_data                  # Raw dataset(untouched)
        self.data       = copy.deepcopy(raw_data)   # Dataset for manipulations

        self.num_cols         = None                # Numerical variables
        self.dic_cols         = None                # Dichotomic variables
        self.cat_cols         = None                # Categorical variables
        self.one_hot_cat_cols = None                # Dummies variables from categorical

        self.y          = pd.DataFrame({'A' : []})  # Outcome variable
        self.y_test     = pd.DataFrame({'A' : []})  # Test set of y
        self.y_train    = pd.DataFrame({'A' : []})  # Train set of y
        self.y_test_var = None                      # Variance of test set of y
        self.y_name     = None                      # Names of outcome variable

        self.X       = pd.DataFrame({'A' : []})     # Predictors variables
        self.X_test  = pd.DataFrame({'A' : []})     # Test set of X
        self.X_train = pd.DataFrame({'A' : []})     # Train set of X
        self.X_names = None                         # Names of predictors variables


    def get_feature_name(self):
        """
        Return
        ----------
        X_names : List(string)
                  Names of predictors variables
        """
        return self.X_names


    def check_missing(self):
        """
        Verify if there are missing values in the dataset and print the result.
        """
        null_mask = self.data.isnull()

        if null_mask.any(axis=None):
            num = null_mask.sum().sum()
            print(self.model_name + f' contain {num} missing values')
        else:
            print(self.model_name + ' don\'t contain missing values')


    def seperate_num_cat(self):
        """
        Separate variables into categorical, numerical and dichotomous.

        A column is numerical when its dtype is float64 or int64, dichotomous
        when it is numerical and its distinct values are exactly 0 and 1, and
        categorical otherwise.

        Warnings
        ----------
        This method should only be used once by instance.
        """
        numeric_dtypes = ['float64', 'int64']
        numeric = [col for col in self.data.columns
                   if self.data[col].dtype in numeric_dtypes]

        # Compare as a set: unique() returns values in order of appearance, so
        # a 0/1 column whose first value is 1 must still be recognised.
        # A column holding a missing value has a third distinct value and is
        # therefore left with the numerical variables.
        self.dic_cols = [col for col in numeric
                         if set(self.data[col].unique()) == {0, 1}]

        # Build the numerical list separately instead of removing items from a
        # list while iterating over it, which skipped the following column.
        self.num_cols = [col for col in numeric if col not in self.dic_cols]

        self.cat_cols = [col for col in self.data.columns
                         if self.data[col].dtype not in numeric_dtypes]


    def impute_missing_simple(self, num_miss_val=np.nan, num_strat='mean',
                               num_fill_val=None, cat_miss_val=None,
                               cat_strat='most_frequent', cat_fill_val=None):
        """
        Impute missing values. Use different strategies for numerical
        and categorical variables. Dichotomous variables use the categorical
        settings.

        Parameters
        ----------
        Represent sklearn.impute.SimpleImputer parameters.
        num_ : are for numerical variables.
        cat_ : are for categorical variables.

        Warnings
        ----------
        Should only be used after: seperate_num_cat().
        This method should only be used once by instance.
        """
        # np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
        plan = ((self.num_cols, num_miss_val, num_strat, num_fill_val),
                (self.cat_cols, cat_miss_val, cat_strat, cat_fill_val),
                (self.dic_cols, cat_miss_val, cat_strat, cat_fill_val))

        for cols, missing, strategy, fill in plan:
            if cols:
                self.data[cols] = \
                    SimpleImputer(missing_values=missing, strategy=strategy,
                                  fill_value=fill).fit_transform(self.data[cols])


    def transform_cat_col(self):
        """
        Transform categorical variable in dummies using
        sklearn.preprocessing.OneHotEncoder. The dummies are appended to the
        dataset; the original categorical columns are left in place.

        Warnings
        ----------
        Should only be used after: seperate_num_cat().
        This method should only be used once by instance.
        """
        if self.cat_cols:
            # scikit-learn 1.2 renamed `sparse` to `sparse_output`; an unknown
            # keyword raises TypeError, which selects the older spelling.
            try:
                encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            except TypeError:
                encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
            encoder.fit(self.data[self.cat_cols])

            # get_feature_names() was replaced by get_feature_names_out();
            # use whichever one the installed version provides.
            if hasattr(encoder, 'get_feature_names_out'):
                names = encoder.get_feature_names_out(self.cat_cols)
            else:
                names = encoder.get_feature_names(self.cat_cols)

            self.one_hot_cat_cols = list(names)
            self.data[self.one_hot_cat_cols] = encoder.transform(self.data[self.cat_cols])


    def transform_num_col(self, scaler):
        """
        Transform numerical variables using two strategies.

        Parameters
        ----------
        scaler : string
                 Use sklearn.preprocessing.StandardScaler if "StandardScaler"
                 Use sklearn.preprocessing.MinMaxScaler if "MinMaxScaler"

        Warnings
        ----------
        Should only be used after: seperate_num_cat().
        This method should only be used once by instance.
        """
        if self.num_cols:
            scalers = {"MinMaxScaler": MinMaxScaler, "StandardScaler": StandardScaler}

            if scaler not in scalers:
                print(scaler + " is not in scaler choice")
                return

            self.data[self.num_cols] = \
                scalers[scaler]().fit_transform(self.data[self.num_cols])


    def drop_untransform_cat(self):
        """
        Drop untransformed categorical variables from the dataset.
        Numerical, dichotomous and dummy variables are kept, in the order they
        currently have in the dataset.

        Warnings
        ----------
        Should only be used after: transform_cat_col().
        This method should only be used once by instance.
        """
        if self.cat_cols:
            # Dichotomous columns are part of the kept set; selecting only the
            # numerical and dummy columns removed them from the dataset.
            wanted = set(self.num_cols) | set(self.dic_cols) | set(self.one_hot_cat_cols)
            kept   = [col for col in self.data.columns if col in wanted]

            self.data      = self.data[kept]
            self.var_names = kept
            self.cat_cols  = None


    def create_Y_X(self, y_name):
        """
        Seperate outcome and predictors variables for estimators.
        Prints a message and changes nothing when y_name is not in var_names.

        Parameters
        ----------
        y_name : string
                 Outcome variable
        """
        if y_name not in self.var_names :
            print(f'Variable {y_name}, is not in the dataset')
            return

        self.y_name  = y_name
        self.y       = self.data[y_name]
        self.X       = self.data.drop([y_name], axis=1)
        self.X_names = list(self.X.columns)


    def create_Y_X_first(self):
        """
        Seperate outcome and predictors variables for estimators.
        Use first variable in the dataset as outcome variable.
        """
        self.y_name  = self.data.columns[0]
        self.y       = self.data.iloc[:,0]
        self.X       = self.data.iloc[:,1:]
        self.X_names = list(self.X.columns)


    def _compute_y_test_var(self):
        """
        Compute variance of the outcome variable in the test subset.

        Warnings
        ----------
        Should only be used after spliting the dataset into train and test subsets.
        """
        self.y_test_var = self.y_test.var()


    def _split(self, **split_kwargs):
        """
        Split X and y with sklearn.model_selection.train_test_split, store the
        four subsets and refresh the variance of the test outcome.

        Parameters
        ----------
        split_kwargs : keyword arguments forwarded to train_test_split.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, **split_kwargs)

        self._compute_y_test_var()


    def split_random(self, size_test):
        """
        Split dataset into random train and test subsets.
        Use sklearn.model_selection.train_test_split.

        Parameters
        ----------
        size_test : int or float
                    If float, should be between 0.0 and 1.0 and represent the proportion
                    of the dataset to include in the test split.
                    If int, represents the absolute number of test samples.

        Warnings
        ----------
        Should only be used after creating outcome and predictors variables.
        """
        self._split(test_size=size_test)

    def split_random_seeded(self, size_test, seed):
        """
        Split dataset into random train and test subsets using a seed for reproducible output.
        Use sklearn.model_selection.train_test_split.

        Parameters
        ----------
        size_test : int or float
                    If float, should be between 0.0 and 1.0 and represent the proportion
                    of the dataset to include in the test split.
                    If int, represents the absolute number of test samples.
        seed      : int
                    Seed used.

        Warnings
        ----------
        Should only be used after creating outcome and predictors variables.
        """
        self._split(test_size=size_test, random_state=seed)

    def split_no_shuffle(self, test_size):
        """
        Split dataset without shuffling into train and test subsets.
        Use sklearn.model_selection.train_test_split.

        Parameters
        ----------
        test_size : int or float
                    If float, should be between 0.0 and 1.0 and represent the proportion
                    of the dataset to include in the test split.
                    If int, represents the absolute number of test samples.

        Warnings
        ----------
        Should only be used after creating outcome and predictors variables.
        """
        self._split(test_size=test_size, shuffle=False)

In [3]:
class MlModel:
    """
    Class taking for imput a DataModel and using different ML estimators
    to make predictions and outputting results.
    
    Warnings
    ----------
    The class does no validation for inputs in its initialization or in its methods. 
    It also does not ensure that the steps are performed in the correct order.
    There is no exception handling.
    """
            
    def __init__(self, DataModel):
        """
        Initialize an instance of this class.        
        
        Parameters
        ----------
        DataModel  : DataModel
                     Complete dataset preprocessed for ML estimators
        """       
        self.DataModel  = DataModel                   # DataModel of instance
        
        self.model_name = self.DataModel.model_name   # Name of model   
        self.var_names  = self.DataModel.var_names    # Names of the variables
        self.data       = self.DataModel.data         # Dataset for estimators
        
        self.y            = self.DataModel.y          # Outcome variable          
        self.y_test       = self.DataModel.y_test     # Test set of y
        self.y_train      = self.DataModel.y_train    # Train set of y
        self.y_train_full = None                      # Train set of y (untouched)
        self.y_test_var   = self.DataModel.y_test_var # Variance of test set of y
        self.y_name       = self.DataModel.y_name     # Names of outcome variable
        
        self.X            = self.DataModel.X          # Predictors variables
        self.X_test       = self.DataModel.X_test     # Test set of X
        self.X_train      = self.DataModel.X_train    # Train set of X
        self.X_train_full = None                      # Train set of X (untouched)
        self.X_names      = self.DataModel.X_names    # Names of predictors variables
        self.X_top_names  = None                      # Names of top predictors variables
      
        self.method       = None        # Selected method
        self.method_type  = None        # Selected method type
        
        self.cv_best_param      = None  # Best parameters chosen by cross-validation
        self.feature_importance = None  # Feature importance of predictors
        self.feature_import_top = None  # Top feature importance of predictors
        self.mse_var            = None  # Mean squared error of prediction / test y variance
        self.mse                = None  # Mean squared error of prediction 
        self.R2                 = None  # Score(R2) of prediction 
        self.alphas             = None  # List of alphas for bagging Lasso        
        
        self.methods = ["lasso_ridge_elastic", "lasso_ridge_CV", "elastic_net_CV",
                        "tree_forest", "neural_network"] # List of method types
        
        
    def get_mse_var(self):
        """
        Return
        ----------
        mse_var : float
                  Mean squared error of prediction
        """
        return self.mse_var
    
            
    def get_cv_best_param(self): 
        """
        Return
        ----------
        cv_best_param : Dictionary
                        Best parameters chosen by cross-validation
        """
        return self.cv_best_param
    
    
    def get_feature_importance(self):
        """
        Return
        ----------
        feature_importance : ndarray of shape (n_features)
                             Feature importance of predictors
        """
        return self.feature_importance
    
    
    def get_feature_importance_top(self):
        """
        Return
        ----------
        feature_import_top : List(float)
                             Top feature importance of predictors
        """
        return self.feature_import_top
        
    
    def get_feature_name(self):
        """
        Return
        ----------
        X_names : List(string)
                  Names of predictors variables
        """
        return self.X_names
    
    
    def get_feature_name_top(self):
        """
        Return
        ----------
        X_top_names : List(string)
                      Names of top predictors variables
        """
        return self.X_top_names
    
    
    def get_bagging_lasso_alphas(self):
        """
        Return
        ----------
        alphas : List(float)
                 List of alphas for bagging Lasso
        """
        return self.alphas
    
    
    def select_lasso(self, alpha=1.0, fit_intercept=True, precompute=False, max_iter=1000, 
                     tol=1e-4, warm_start=False, random_state=None, selection="cyclic"):
        """
        Make an unfitted Lasso the current estimator and tag it with the
        "lasso_ridge_elastic" method type.

        Parameters
        ----------
        All parameters are forwarded unchanged to sklearn.linear_model.Lasso.
        """
        settings = dict(alpha=alpha,
                        fit_intercept=fit_intercept,
                        precompute=precompute,
                        max_iter=max_iter,
                        tol=tol,
                        warm_start=warm_start,
                        random_state=random_state,
                        selection=selection)

        self.method      = Lasso(**settings)
        self.method_type = self.methods[0]
        
        
    def select_lassoCV(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, 
                       precompute='auto', max_iter=1000, tol=1e-4, cv=5, verbose=0, 
                       n_jobs=None, random_state=None, selection="cyclic"):
        """
        Make an unfitted LassoCV (Lasso with built-in cross validation) the
        current estimator and tag it with the "lasso_ridge_CV" method type.

        Parameters
        ----------
        All parameters are forwarded unchanged to sklearn.linear_model.LassoCV.
        """
        settings = dict(eps=eps,
                        n_alphas=n_alphas,
                        alphas=alphas,
                        fit_intercept=fit_intercept,
                        precompute=precompute,
                        max_iter=max_iter,
                        tol=tol,
                        cv=cv,
                        verbose=verbose,
                        n_jobs=n_jobs,
                        random_state=random_state,
                        selection=selection)

        self.method      = LassoCV(**settings)
        self.method_type = self.methods[1]
        
    
    def select_ridge(self, alpha=1.0, fit_intercept=True, max_iter=None, solver='auto', 
                     tol=1e-3, random_state=None):
        """
        Make an unfitted Ridge the current estimator and tag it with the
        "lasso_ridge_elastic" method type.

        Parameters
        ----------
        All parameters are forwarded unchanged to sklearn.linear_model.Ridge.
        """
        settings = dict(alpha=alpha,
                        fit_intercept=fit_intercept,
                        solver=solver,
                        max_iter=max_iter,
                        tol=tol,
                        random_state=random_state)

        self.method      = Ridge(**settings)
        self.method_type = self.methods[0]
        
        
    def select_ridgeCV(self, alphas=(0.1, 1.0, 10.0), fit_intercept=True, gcv_mode='auto', 
                       scoring=None, cv=None):
        """
        Make an unfitted RidgeCV (Ridge with built-in cross validation) the
        current estimator and tag it with the "lasso_ridge_CV" method type.

        Parameters
        ----------
        All parameters are forwarded unchanged to sklearn.linear_model.RidgeCV.
        """
        settings = dict(alphas=alphas,
                        fit_intercept=fit_intercept,
                        gcv_mode=gcv_mode,
                        scoring=scoring,
                        cv=cv)

        self.method      = RidgeCV(**settings)
        self.method_type = self.methods[1]
        
        
    def select_elasticNet(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True, 
                          precompute=False, max_iter=1000, tol=1e-4, warm_start=False, 
                          random_state=None, selection="cyclic"):
        """
        Make an unfitted ElasticNet the current estimator and tag it with the
        "lasso_ridge_elastic" method type.

        Parameters
        ----------
        All parameters are forwarded unchanged to sklearn.linear_model.ElasticNet.
        """
        settings = dict(alpha=alpha,
                        l1_ratio=l1_ratio,
                        fit_intercept=fit_intercept,
                        precompute=precompute,
                        max_iter=max_iter,
                        tol=tol,
                        warm_start=warm_start,
                        random_state=random_state,
                        selection=selection)

        self.method      = ElasticNet(**settings)
        self.method_type = self.methods[0]
        
        
    def select_elasticNetCV(self, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, 
                            fit_intercept=True, precompute='auto', max_iter=1000, tol=1e-4, 
                            cv=5, verbose=0, n_jobs=None, random_state=None, 
                            selection="cyclic"):        
        """
        Make an unfitted ElasticNetCV (elastic net with built-in cross
        validation) the current estimator and tag it with the
        "elastic_net_CV" method type.

        Parameters
        ----------
        All parameters are forwarded unchanged to sklearn.linear_model.ElasticNetCV.
        """
        settings = dict(l1_ratio=l1_ratio,
                        eps=eps,
                        n_alphas=n_alphas,
                        alphas=alphas,
                        fit_intercept=fit_intercept,
                        precompute=precompute,
                        max_iter=max_iter,
                        tol=tol,
                        cv=cv,
                        verbose=verbose,
                        n_jobs=n_jobs,
                        random_state=random_state,
                        selection=selection)

        self.method      = ElasticNetCV(**settings)
        self.method_type = self.methods[2]
        
        
    def select_regression_tree(self, criterion="mse", max_depth=None, max_features=None, 
                               random_state=None, splitter="best", min_samples_leaf=1, 
                               min_samples_split=2):        
        """
        Make an unfitted regression tree the current estimator and tag it with
        the "tree_forest" method type.

        Parameters
        ----------
        All parameters are forwarded unchanged to
        sklearn.tree.DecisionTreeRegressor.

        NOTE(review): scikit-learn 1.0 renamed the criterion "mse" to
        "squared_error" and 1.2 removed the old spelling -- confirm that the
        installed version still accepts this default.
        """
        settings = dict(criterion=criterion,
                        max_depth=max_depth,
                        max_features=max_features,
                        random_state=random_state,
                        splitter=splitter,
                        min_samples_leaf=min_samples_leaf,
                        min_samples_split=min_samples_split)

        self.method      = DecisionTreeRegressor(**settings)
        self.method_type = self.methods[3]
        
        
    def select_boosted_trees(self, loss="ls", learning_rate=0.1, n_estimators=100, 
                             subsample=1.0, criterion="friedman_mse", max_depth=3, 
                             min_samples_leaf=1, min_samples_split=2, random_state=None, 
                             max_features=None, alpha=0.9, verbose=0, warm_start=False, 
                             validation_fraction=0.1, n_iter_no_change=None, tol=1e-4):
        """
        Make an unfitted gradient boosting regressor (boosted trees) the
        current estimator and tag it with the "tree_forest" method type.

        Parameters
        ----------
        All parameters are forwarded unchanged to
        sklearn.ensemble.GradientBoostingRegressor.

        NOTE(review): scikit-learn 1.0 renamed the loss "ls" to
        "squared_error" and 1.2 removed the old spelling -- confirm that the
        installed version still accepts this default.
        """
        settings = dict(loss=loss,
                        learning_rate=learning_rate,
                        n_estimators=n_estimators,
                        criterion=criterion,
                        max_depth=max_depth,
                        min_samples_leaf=min_samples_leaf,
                        min_samples_split=min_samples_split,
                        random_state=random_state,
                        max_features=max_features,
                        alpha=alpha,
                        verbose=verbose,
                        subsample=subsample,
                        warm_start=warm_start,
                        tol=tol,
                        validation_fraction=validation_fraction,
                        n_iter_no_change=n_iter_no_change)

        self.method      = GradientBoostingRegressor(**settings)
        self.method_type = self.methods[3]
        
    
    def select_random_forest(self, n_estimators=100, criterion="mse", 
                             max_depth=None, max_features="auto", bootstrap=False, 
                             oob_score=False, n_jobs=None, random_state=None, verbose=0, 
                             max_samples=None, min_samples_leaf=1, min_samples_split=2):        
        """
        Make an unfitted random forest regressor the current estimator and tag
        it with the "tree_forest" method type.

        Note that bootstrap defaults to False here, so unless the caller
        overrides it the trees are not grown on bootstrap samples.

        Parameters
        ----------
        All parameters are forwarded unchanged to
        sklearn.ensemble.RandomForestRegressor.

        NOTE(review): scikit-learn 1.2 removed the criterion "mse" (renamed
        "squared_error" in 1.0) and 1.3 removed max_features="auto" --
        confirm that the installed version still accepts these defaults.
        """
        settings = dict(n_estimators=n_estimators,
                        criterion=criterion,
                        max_depth=max_depth,
                        max_features=max_features,
                        bootstrap=bootstrap,
                        oob_score=oob_score,
                        n_jobs=n_jobs,
                        random_state=random_state,
                        verbose=verbose,
                        max_samples=max_samples,
                        min_samples_leaf=min_samples_leaf,
                        min_samples_split=min_samples_split)

        self.method      = RandomForestRegressor(**settings)
        self.method_type = self.methods[3]
        
    
    def select_MLP_regressor(self, hidden_layer_sizes=(100,), activation='relu', solver='adam',
                             alpha=0.0001, learning_rate='constant', learning_rate_init=0.001,
                             max_iter=200, shuffle=True, random_state=None, tol=1e-4, 
                             warm_start=False, early_stopping=False,validation_fraction=0.1, 
                             n_iter_no_change=10, batch_size='auto'):
        """
        Make an unfitted multi-layer perceptron regressor (neural network) the
        current estimator and tag it with the "neural_network" method type.

        Parameters
        ----------
        All parameters are forwarded unchanged to
        sklearn.neural_network.MLPRegressor.
        """
        settings = dict(hidden_layer_sizes=hidden_layer_sizes,
                        activation=activation,
                        solver=solver,
                        alpha=alpha,
                        learning_rate=learning_rate,
                        learning_rate_init=learning_rate_init,
                        max_iter=max_iter,
                        shuffle=shuffle,
                        random_state=random_state,
                        tol=tol,
                        warm_start=warm_start,
                        early_stopping=early_stopping,
                        validation_fraction=validation_fraction,
                        n_iter_no_change=n_iter_no_change,
                        batch_size=batch_size)

        self.method      = MLPRegressor(**settings)
        self.method_type = self.methods[4]
        
        
    def _generate_bootstrap_data(self, n=None, frac=None, replace=False, weights=None,
                                 axis=None, ignore_index=False, random_state=None):
        """
        Genrerate a bootstraped dataset for bagging Lasso
        
        Parameters
        ----------
        frac    : float
                  % of orignial dataset to make the new one
        replace : boolean
                  Sampling with replacement if True
                  
        The rest are others pandas.DataFrame.sample parameters.     
        """        
        self.X_train = \
            self.X_train_full.sample(n=n, frac=frac, replace=replace, weights=weights, axis=axis,
                                     ignore_index=ignore_index, random_state=random_state)
        
        self.y_train = \
            self.y_train_full.sample(n=n, frac=frac, replace=replace, weights=weights, axis=axis,
                                     ignore_index=ignore_index, random_state=random_state)
       
    
    def bagging_lasso(self, repetition=200, predefined_sampling=True, print_res=True,
                      n=None, frac=None, replace=False, weights=None, random_state=None,
                      eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True,
                      precompute='auto', max_iter=1000, tol=1e-4, cv=5, verbose=0, n_jobs=None, 
                      selection="cyclic"):
        """
        Compute a baggin lasso as ML estimator. This estimators iterate a chosen
        number of time. Each iteration it takes a bootstrapped dataset and does a
        LassoCV. Output the mean prediction of all iterations.
        
        Parameters
        ----------
        repetition          : int
                              number of iteration
        predefined_sampling : boolean
                              Use a predefined pattern from a seed for reproducible 
                              output if True. If False, use random seed for each bootstrap.
        print_res           : boolean
                              Print results on console if True
        frac                : float
                              % of orignial dataset to make the new one
        replace             : boolean
                              Sampling with replacement if True
                              
        The rest are others pandas.DataFrame.sample and
        sklearn.linear_model.LassoCV parameters.        
                
        Return
        ----------
        mse     : float
                  Mean MSE
        mse_var : float
                  Mean MSE/y test variance
        R2      : float
                  Mean score(R2)
        alphas  : List(float)
                  List of alphas for each iteration              
        """  
        self.alphas = []
        mse_list    = []
        R2_list     = []
                
        if random_state is not None: 
            new_random_state = random_state + ((repetition + 5)*10)
        else: 
            new_random_state = random.randint(0,999999999)
        
        self.X_train_full = copy.deepcopy(self.X_train)
        self.y_train_full = copy.deepcopy(self.y_train)
        
        for i in range(repetition):
            
            if predefined_sampling:
                new_random_state -= 10
            else:
                new_random_state = random.randint(0,999999999)

            self._generate_bootstrap_data(n=n, frac=frac, replace=replace, weights=weights,
                                          random_state=new_random_state)

            self.select_lassoCV(eps=eps, n_alphas=n_alphas, alphas=alphas, fit_intercept= \
                                fit_intercept, precompute=precompute, max_iter=max_iter,
                                tol=tol, cv=cv, verbose=verbose, n_jobs=n_jobs, 
                                random_state=random_state, selection=selection)
            
            self.simple_fit()
            self.alphas.append(self.cv_best_param.get('alpha'))
            
            mse_var, mse, R2 = self.train_test_error(print_res=False)
            mse_list.append(mse), R2_list.append(R2)
            
        self.mse     = mean(mse_list)
        self.mse_var = self.mse/self.y_test_var
        self.R2      = mean(R2_list)
        
        self.X_train = self.X_train_full
        self.y_train = self.y_train_full
        
        if print_res == True: print(f'MSE/σ2 = {self.mse_var:.5f} | MSE = {self.mse:.5f} | '\
        f'R2(score) = {self.R2:.5f}')
        
        return self.mse, self.mse_var, self.R2, self.alphas
    
        
    def _set_feature_importance(self):
        """
        Store features importance.
        
        Warnings
        ----------
        Should only be used after fitting the model.        
        """        
        met = self.method_type
        if met in self.methods[:3]:
            self.feature_importance = self.method.coef_ 
                
        if met == self.methods[3]:
            self.feature_importance = self.method.feature_importances_ 
            
  
    def compute_top_features(self, num=10):
        """
        Compute and store the features with the largest absolute importance.

        Feature names go to self.X_top_names and the matching absolute
        importances to self.feature_import_top, both ordered from most to
        least important. Features with equal importance keep the order they
        have in self.X_names.

        Parameters
        ----------
        num : int
              Maximum number of top features to keep. Fewer are stored when
              fewer features are available; nothing is stored when num <= 0.

        Warnings
        ----------
        Should only be used after fitting the model.
        """
        # Rank (name, |importance|) pairs directly instead of going through a
        # dict: a dict collapses duplicate names, which made the ranked list
        # shorter than X_names and the index-based loop raise IndexError.
        ranked = sorted(zip(self.X_names, np.absolute(self.feature_importance)),
                        key=lambda pair: pair[1], reverse=True)

        # Slicing never runs past the end; max() keeps "num <= 0 -> empty".
        top = ranked[:max(num, 0)]

        self.X_top_names        = [name for name, _ in top]
        self.feature_import_top = [value for _, value in top]
            
        
    def simple_fit(self):
        """
        Train the selected estimator with a plain estimator.fit(X_train, y_train).

        When the method type is self.methods[1], the fitted alpha_ is saved in
        self.cv_best_param; when it is self.methods[2], both alpha_ and
        l1_ratio_ are saved. For any other type self.cv_best_param is left as
        it was. Feature importances are refreshed at the end.

        Warnings
        ----------
        Should only be used after selecting the model.
        """
        self.method.fit(self.X_train, self.y_train)

        kind = self.method_type
        if kind == self.methods[1]:
            self.cv_best_param = dict(alpha=self.method.alpha_)
        if kind == self.methods[2]:
            self.cv_best_param = dict(alpha=self.method.alpha_,
                                      l1_ratio=self.method.l1_ratio_)

        self._set_feature_importance()
        
        
    def cv_hyperpara_fit(self, param_grid, scoring='neg_mean_squared_error', cv=5, 
                         verbose=0, n_jobs=None, return_train_score=False, replace=True, 
                         print_res=True):
        """
        Cross validate chosen hyperparameters for estimations using grid search.

        For method types self.methods[0], [3] and [4] a GridSearchCV is run on
        the train set and the best parameters are stored in self.cv_best_param.
        For method types self.methods[1] and [2] no search is run and a message
        is printed instead.

        Parameters
        ----------
        param_grid : dict or list(dict)
                     Dictionary with parameters names (str) as keys and lists of parameter
                     settings to try as values, or a list of such dictionaries, in which 
                     case the grids spanned by each dictionary in the list are explored. 
        replace    : boolean
                     If True, self.method becomes the best estimator found by the
                     search and feature importances are refreshed from it.
                     If False, self.method and self.feature_importance are left
                     untouched and only self.cv_best_param is updated.
        print_res  : boolean
                     Print the best parameters on console if True.

        The rest are others sklearn.model_selection.GridSearchCV parameters.

        Warnings
        ----------
        Should only be used after selecting the model.
        """
        met     = self.method_type
        methods = self.methods

        if met == methods[0] or met == methods[3] or met == methods[4]:

            grid_search = GridSearchCV(self.method, param_grid, cv=cv, scoring=scoring,
                                       return_train_score=return_train_score, verbose=verbose,
                                       n_jobs=n_jobs)
            grid_search.fit(self.X_train, self.y_train)

            self.cv_best_param = grid_search.best_params_
            if print_res:
                print(self.cv_best_param)

            if not replace:
                # GridSearchCV fits clones, so self.method itself was not fitted
                # by the search. Reading coef_ / feature_importances_ from it
                # would raise on a never-fitted estimator, so skip the refresh.
                return

            self.method = grid_search.best_estimator_

        elif met == methods[1] or met == methods[2]:
            print("Method already cross validated")

        self._set_feature_importance()
        

    def train_test_error(self, print_res=True):
        """
        Evaluate the fitted estimator on the test set.

        Stores the estimator's score (R2), the mean squared error of its test
        predictions and that MSE divided by self.y_test_var in self.R2,
        self.mse and self.mse_var.

        Parameters
        ----------
        print_res : boolean
                    Print results on console if True.

        Return
        ----------
        mse_var : float
                  MSE/y test variance
        mse     : float
                  MSE
        R2      : float
                  Score(R2)

        Warnings
        ----------
        Should only be used after fitting the model.
        """
        self.R2 = self.method.score(self.X_test, self.y_test)

        y_hat        = self.method.predict(self.X_test)
        self.mse     = mean_squared_error(self.y_test, y_hat)
        self.mse_var = self.mse / self.y_test_var

        if print_res == True:
            print(f'MSE/σ2 = {self.mse_var:.5f} | MSE = {self.mse:.5f} | '
                  f'R2(score) = {self.R2:.5f}')

        return self.mse_var, self.mse, self.R2

## Data imports

In [4]:
try:
    # Abalone: column names are supplied here and passed to read_csv
    colnames   = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight',
                  'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']

    fname      = paths[0] + "abalone.data"
    abalone_raw = pd.read_csv(fname, sep=",", names=colnames)

    # Boston Housing: the file is read in two passes and glued back side by side.
    # The first pass keeps the even-numbered lines (0-based) as the first 11
    # columns, the second pass keeps the odd-numbered lines as the last 3.
    # sep=r"\s+" replaces the deprecated delim_whitespace=True.
    fname     = paths[0] + "Boston_Housing.data"

    colnames  = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
                 'TAX', 'PTRATIO']
    df3       = pd.read_csv(fname, sep=r"\s+", skiprows=lambda x: x%2 == 1,
                            header=None, names=colnames)

    colnames  = ['B', 'LSTAT', 'MEDV']
    df2       = pd.read_csv(fname, sep=r"\s+", skiprows=lambda x: x%2 == 0,
                            header=None, names=colnames)

    boston_raw = pd.concat([df3, df2], axis=1)

    # California Housing
    fname          = paths[0] + "CAhousing.csv"
    california_raw = pd.read_csv(fname, sep=",")

    # US Unemployment Rate (h = 1)
    # First column is dropped -- presumably a saved index/date column; verify.
    fname        = paths[0] + "US_UnempRate_h1.csv"
    unemploy_raw = pd.read_csv(fname, sep=",")
    unemploy_raw = unemploy_raw.iloc[: , 1:]

    # US Inflation Rate (h = 1)
    # First column is dropped here as well (same assumption as above).
    fname     = paths[0] + "US_infla_h1.csv"
    infl_raw = pd.read_csv(fname, sep=",")
    infl_raw = infl_raw.iloc[: , 1:]

    # White Wine
    fname     = paths[0] + "winequality-white.csv"
    wine_raw = pd.read_csv(fname, sep=";")

    # Fish Toxicity: column names are supplied here and passed to read_csv
    colnames   = ['CIC0', 'SM1_Dz(Z)', 'GATS1i', 'NdsCH', 'NdssC', 'MLOGP', 'reponse(LC50)']
    fname      = paths[0] + "qsar_fish_toxicity.csv"
    fish_raw  = pd.read_csv(fname, sep=";", names=colnames)

except FileNotFoundError:
    print(f'File {fname} don\'t exist or are unable to open')
    # Every dataset is required by the following cells: stop here with the real
    # cause instead of failing later with an unrelated NameError.
    raise

## Data pre-processing

In [5]:
# Prepare the lists shared by all models: raw frames, short model names and
# the variable names under which the DataModel objects will be published.
MlModel_names   = ["abalone", "boston", "california", "unemploy", "infl", "wine", "fish"]
DataModel_names = [short_name + "_data" for short_name in MlModel_names]

data_sets = [
    abalone_raw,
    boston_raw,
    california_raw,
    unemploy_raw,
    infl_raw,
    wine_raw,
    fish_raw,
]

In [6]:
# Build one DataModel per raw dataset, publish it as a module-level variable
# named after DataModel_names, then check every dataset for missing values.
for idx, var_name in enumerate(DataModel_names):
    raw_frame = data_sets[idx]
    globals()[var_name] = DataModel(raw_frame, list(raw_frame), MlModel_names[idx])

DataModels = [abalone_data, boston_data, california_data, unemploy_data,
              infl_data, wine_data, fish_data]

# Two separate passes: all missing-value reports are printed first.
for data_model in DataModels:
    data_model.check_missing()
for data_model in DataModels:
    data_model.seperate_num_cat()

abalone don't contain missing values
boston don't contain missing values
california contain 207 missing values
unemploy don't contain missing values
infl don't contain missing values
wine don't contain missing values
fish don't contain missing values


#### Filling missing values for california

In [7]:
# california is the only dataset reported with missing values (207) by the
# check above: impute them, then re-run the check to confirm none remain.
california_data.impute_missing_simple()
california_data.check_missing()

california don't contain missing values


#### Transforming categorical variables into dummy variables with a one-hot encoder
#### Transforming numerical variables with standardization

In [8]:
# Preprocessing, one full pass over the datasets per step:
# 1) encode categorical columns, 2) drop the untransformed categorical columns,
# 3) scale numerical columns with "StandardScaler".
for data_model in DataModels:
    data_model.transform_cat_col()

for data_model in DataModels:
    data_model.drop_untransform_cat()

for data_model in DataModels:
    data_model.transform_num_col("StandardScaler")

In [9]:
# Create X and Y datasets: each DataModel is paired with the name of its
# target column, everything else being handled by create_Y_X.
target_columns = [
    (abalone_data,    "Rings"),
    (boston_data,     "MEDV"),
    (california_data, "median_house_value"),
    (unemploy_data,   "y"),
    (infl_data,       "y"),
    (wine_data,       "quality"),
    (fish_data,       "reponse(LC50)"),
]

for data_model, target in target_columns:
    data_model.create_Y_X(target)

In [10]:
# Set the seed for the rest of the estimations.
# The value was originally drawn with random.randint(0, 999999999); it is
# pinned here so that every run reproduces the same splits and estimators.
seed = 256579770
print(f'Seed : {seed}')

Seed : 256579770


#### Splitting datasets with random shuffling, except for the time series datasets

In [11]:
# Split data in train and test set.
# The datasets listed in models_continous are split without shuffling;
# every other dataset gets a seeded random split.
models_continous = [unemploy_data, infl_data]
size_test = 0.2

for data_model in DataModels:
    if data_model not in models_continous:
        data_model.split_random_seeded(size_test, seed)
    else:
        data_model.split_no_shuffle(size_test)

In [12]:
# Create one MlModel per DataModel and publish it as a module-level variable
# named after MlModel_names (abalone, boston, ...).
for idx, var_name in enumerate(MlModel_names):
    globals()[var_name] = MlModel(DataModels[idx])

MlModels = [abalone, boston, california, unemploy, infl, wine, fish]

In [13]:
# Create results lists.
# `methods` gives the column order of the results table; each dataset gets
# its own "<name>_mse_var" and "<name>_hyperpara" list, also reachable
# through list_mse_var / list_hyperpara (same list objects).
methods = ["Lasso", "Ridge", "Elas_Net", "Reg_Tree", "Boost_Trees", 
           "RF", "NN_(5:100)", "NN_(2:5)", "Bag_Lasso"]

list_mse_var    = []
list_hyperpara  = []
list_bag_alphas = []

for model_name in MlModel_names:
    mse_var_store = []
    globals()[model_name + "_mse_var"] = mse_var_store
    list_mse_var.append(mse_var_store)

    hyperpara_store = []
    globals()[model_name + "_hyperpara"] = hyperpara_store
    list_hyperpara.append(hyperpara_store)

In [14]:
def store_results():
    """
    Append the latest estimation results of every model to the results lists.

    The i-th list of list_mse_var receives MlModels[i].get_mse_var() and the
    i-th list of list_hyperpara receives MlModels[i].get_cv_best_param().
    """
    for idx, scores in enumerate(list_mse_var):
        scores.append(MlModels[idx].get_mse_var())

    for idx, params in enumerate(list_hyperpara):
        params.append(MlModels[idx].get_cv_best_param())

## ML model estimations and predictions

In [15]:
# Lasso cross-validation and prediction
# `alphas` is the base grid (1e-5 .. 1e1); it is reused by the Elastic Net cell.
alphas = np.logspace(-5, 1, 50)
alphas_lasso = [1e-8] + alphas.tolist() + np.logspace(0.1, 1, 20).tolist()

# max_iter is passed as an int: it is an iteration count, and recent
# scikit-learn versions validate its type.
for n in MlModels:
    n.select_lassoCV(max_iter=500_000, random_state=seed, n_jobs=3,
                     alphas=alphas_lasso)

print("Lasso\n============")
for n in MlModels:
    print("-----------\n" + n.model_name )
    n.simple_fit()
    print(n.get_cv_best_param())
    n.train_test_error()

store_results()

Lasso
-----------
abalone
{'alpha': 0.0015998587196060573}
MSE/σ2 = 0.45904 | MSE = 0.47993 | R2(score) = 0.54041
-----------
boston
{'alpha': 0.0009102981779915217}
MSE/σ2 = 0.23931 | MSE = 0.24678 | R2(score) = 0.75832
-----------
california


  model = cd_fast.enet_coordinate_descent(


{'alpha': 1e-08}
MSE/σ2 = 0.34614 | MSE = 0.35362 | R2(score) = 0.65378
-----------
unemploy
{'alpha': 0.0625055192527397}
MSE/σ2 = 0.26603 | MSE = 0.35169 | R2(score) = 0.72763
-----------
infl
{'alpha': 0.04714866363457394}
MSE/σ2 = 1.10811 | MSE = 0.80872 | R2(score) = -0.13449
-----------
wine
{'alpha': 0.006551285568595509}
MSE/σ2 = 0.71985 | MSE = 0.78955 | R2(score) = 0.27941
-----------
fish
{'alpha': 1e-08}
MSE/σ2 = 0.44751 | MSE = 0.45126 | R2(score) = 0.55002


In [16]:
# Store Lasso best features.
# Must run right after the Lasso cell: the top features are taken from
# whatever estimator is currently fitted in each model.
for model in (california, unemploy, infl):
    model.compute_top_features()

lasso_california_top_names, lasso_california_top_values = (
    california.get_feature_name_top(), california.get_feature_importance_top())

lasso_unemploy_top_names, lasso_unemploy_top_values = (
    unemploy.get_feature_name_top(), unemploy.get_feature_importance_top())

lasso_infl_top_names, lasso_infl_top_values = (
    infl.get_feature_name_top(), infl.get_feature_importance_top())

# Alternating names / values rows, in the order california, unemploy, infl
Lasso_var_import = [
    lasso_california_top_names, lasso_california_top_values,
    lasso_unemploy_top_names,   lasso_unemploy_top_values,
    lasso_infl_top_names,       lasso_infl_top_values,
]

In [17]:
# Ridge cross-validation and prediction
# The grid is the Lasso grid extended with 100 log-spaced values from 1e1 to 1e3.
alpha_ridge = alphas_lasso + np.logspace(1, 3, 100).tolist()
for model in MlModels:
    model.select_ridgeCV(alphas=alpha_ridge)

print("Ridge\n============")
for model in MlModels:
    print("-----------\n" + model.model_name )
    model.simple_fit()
    print(model.get_cv_best_param())
    model.train_test_error()

store_results()

Ridge
-----------
abalone
{'alpha': 0.7906043210907702}
MSE/σ2 = 0.45584 | MSE = 0.47658 | R2(score) = 0.54362
-----------
boston
{'alpha': 8.040131611167856}
MSE/σ2 = 0.24070 | MSE = 0.24822 | R2(score) = 0.75692
-----------
california
{'alpha': 0.19306977288832497}
MSE/σ2 = 0.34610 | MSE = 0.35358 | R2(score) = 0.65381
-----------
unemploy
{'alpha': 911.1627561154896}
MSE/σ2 = 0.25941 | MSE = 0.34293 | R2(score) = 0.73442
-----------
infl
{'alpha': 162.97508346206433}
MSE/σ2 = 1.23702 | MSE = 0.90281 | R2(score) = -0.26648
-----------
wine
{'alpha': 48.62601580065353}
MSE/σ2 = 0.71757 | MSE = 0.78705 | R2(score) = 0.28170
-----------
fish
{'alpha': 20.09233002565047}
MSE/σ2 = 0.44620 | MSE = 0.44993 | R2(score) = 0.55134


In [18]:
# Elastic Net cross-validation and prediction
# Relies on the `alphas` base grid defined in the Lasso cell.
list_l1  = [0.001, .05, .1, .2, .5, .7, .8, .9, .925, .95, .975,.99, 1]
alpha_EN = alphas.tolist() + np.logspace(0.1, 1, 20).tolist() + np.logspace(1, 2, 30).tolist()

# max_iter is passed as an int: it is an iteration count, and recent
# scikit-learn versions validate its type.
for n in MlModels:
    n.select_elasticNetCV(max_iter=1_000_000, random_state=seed, n_jobs=3,
                          alphas=alpha_EN, l1_ratio=list_l1)

print("Elastict Net\n============")
for n in MlModels:
    print("-----------\n" + n.model_name )
    n.simple_fit()
    print(n.get_cv_best_param())
    n.train_test_error()

store_results()

Elastict Net
-----------
abalone
{'alpha': 0.0015998587196060573, 'l1_ratio': 1.0}
MSE/σ2 = 0.45904 | MSE = 0.47993 | R2(score) = 0.54041
-----------
boston
{'alpha': 0.026826957952797246, 'l1_ratio': 0.001}
MSE/σ2 = 0.24139 | MSE = 0.24893 | R2(score) = 0.75622
-----------
california
{'alpha': 1.3257113655901082e-05, 'l1_ratio': 0.001}
MSE/σ2 = 0.34610 | MSE = 0.35358 | R2(score) = 0.65382
-----------
unemploy
{'alpha': 6.46437163249006, 'l1_ratio': 0.001}
MSE/σ2 = 0.27311 | MSE = 0.36105 | R2(score) = 0.72039
-----------
infl
{'alpha': 0.04714866363457394, 'l1_ratio': 1.0}
MSE/σ2 = 1.10811 | MSE = 0.80872 | R2(score) = -0.13449
-----------
wine
{'alpha': 0.01151395399326447, 'l1_ratio': 0.001}
MSE/σ2 = 0.71755 | MSE = 0.78702 | R2(score) = 0.28172
-----------
fish
{'alpha': 0.03556480306223128, 'l1_ratio': 0.001}
MSE/σ2 = 0.44595 | MSE = 0.44969 | R2(score) = 0.55158


In [19]:
# Regression tree cross-validation and prediction
# Only max_depth is tuned, over every integer depth from 1 to 99.
for model in MlModels:
    model.select_regression_tree(random_state=seed)

depth_list = list(range(1, 100))
param_grid = [{'max_depth': depth_list}]

print("Regression tree\n============")
for model in MlModels:
    print("-----------\n" + model.model_name )
    model.cv_hyperpara_fit(param_grid=param_grid, verbose=0, n_jobs=3)
    model.train_test_error()

store_results()

Regression tree
-----------
abalone
{'max_depth': 5}
MSE/σ2 = 0.54285 | MSE = 0.56756 | R2(score) = 0.45650
-----------
boston
{'max_depth': 9}
MSE/σ2 = 0.16404 | MSE = 0.16916 | R2(score) = 0.83433
-----------
california
{'max_depth': 9}
MSE/σ2 = 0.27711 | MSE = 0.28310 | R2(score) = 0.72283
-----------
unemploy
{'max_depth': 4}
MSE/σ2 = 0.66018 | MSE = 0.87274 | R2(score) = 0.32410
-----------
infl
{'max_depth': 3}
MSE/σ2 = 1.16998 | MSE = 0.85387 | R2(score) = -0.19783
-----------
wine
{'max_depth': 4}
MSE/σ2 = 0.72313 | MSE = 0.79314 | R2(score) = 0.27613
-----------
fish
{'max_depth': 4}
MSE/σ2 = 0.55047 | MSE = 0.55508 | R2(score) = 0.44649


In [20]:
# Boosted trees cross-validation and prediction
# Grid: tree depth 1-4 x four learning rates x four ensemble sizes.
for model in MlModels:
    model.select_boosted_trees(random_state=seed)

param_grid = [{
    'max_depth': list(range(1, 5)),
    'learning_rate': [0.1, 0.05, 0.01, 0.005],
    'n_estimators': [100, 250, 500, 750],
}]

print("Boosted trees\n============")
for model in MlModels:
    print("-----------\n" + model.model_name )
    model.cv_hyperpara_fit(param_grid=param_grid, verbose=0, n_jobs=4)
    mse_var, mse, r2 = model.train_test_error()

store_results()

Boosted trees
-----------
abalone
{'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 750}
MSE/σ2 = 0.47674 | MSE = 0.49844 | R2(score) = 0.52269
-----------
boston
{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 750}
MSE/σ2 = 0.08656 | MSE = 0.08927 | R2(score) = 0.91258
-----------
california
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 750}
MSE/σ2 = 0.15730 | MSE = 0.16070 | R2(score) = 0.84266
-----------
unemploy
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 750}
MSE/σ2 = 0.38156 | MSE = 0.50441 | R2(score) = 0.60936
-----------
infl
{'learning_rate': 0.05, 'max_depth': 1, 'n_estimators': 500}
MSE/σ2 = 1.10178 | MSE = 0.80410 | R2(score) = -0.12801
-----------
wine
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 750}
MSE/σ2 = 0.54424 | MSE = 0.59694 | R2(score) = 0.45520
-----------
fish
{'learning_rate': 0.005, 'max_depth': 4, 'n_estimators': 750}
MSE/σ2 = 0.47057 | MSE = 0.47451 | R2(score) = 0.52683


In [21]:
# Random forest cross-validation and prediction
# The forests use 500 trees; only max_features is tuned. Parallelism is given
# to the forest itself (n_jobs=4), so the grid search runs with n_jobs=None.
for model in MlModels:
    model.select_random_forest(n_estimators=500, random_state=seed, n_jobs=4)

param_grid = [{'max_features': [1.0, 1/2, 1/3, "sqrt"]}]

print("Random forest\n============")
for model in MlModels:
    print("-----------\n" + model.model_name )
    model.cv_hyperpara_fit(param_grid=param_grid, verbose=0, n_jobs=None)
    mse_var, mse, r2 = model.train_test_error()

store_results()

Random forest
-----------
abalone
{'max_features': 0.3333333333333333}
MSE/σ2 = 0.48589 | MSE = 0.50800 | R2(score) = 0.51353
-----------
boston
{'max_features': 0.3333333333333333}
MSE/σ2 = 0.07624 | MSE = 0.07862 | R2(score) = 0.92300
-----------
california
{'max_features': 0.5}
MSE/σ2 = 0.16566 | MSE = 0.16924 | R2(score) = 0.83430
-----------
unemploy
{'max_features': 'sqrt'}
MSE/σ2 = 0.31520 | MSE = 0.41669 | R2(score) = 0.67729
-----------
infl
{'max_features': 'sqrt'}
MSE/σ2 = 1.23430 | MSE = 0.90082 | R2(score) = -0.26368
-----------
wine
{'max_features': 0.3333333333333333}
MSE/σ2 = 0.45221 | MSE = 0.49600 | R2(score) = 0.54733
-----------
fish
{'max_features': 0.3333333333333333}
MSE/σ2 = 0.46222 | MSE = 0.46610 | R2(score) = 0.53522


In [22]:
# Store Random Forest best features.
# Must run right after the random forest cell: the top features are taken
# from whatever estimator is currently fitted in each model.
for model in (california, unemploy, infl):
    model.compute_top_features()

RF_california_top_names, RF_california_top_values = (
    california.get_feature_name_top(), california.get_feature_importance_top())

RF_unemploy_top_names, RF_unemploy_top_values = (
    unemploy.get_feature_name_top(), unemploy.get_feature_importance_top())

RF_infl_top_names, RF_infl_top_values = (
    infl.get_feature_name_top(), infl.get_feature_importance_top())

# Alternating names / values rows, in the order california, unemploy, infl
RF_var_import = [
    RF_california_top_names, RF_california_top_values,
    RF_unemploy_top_names,   RF_unemploy_top_values,
    RF_infl_top_names,       RF_infl_top_values,
]

In [23]:
# Neural network (5:100) cross-validation and prediction
# Five hidden layers of 100 units with early stopping; the grid tunes the
# early-stopping patience and the initial learning rate.
layers = (100, 100, 100, 100, 100)
for model in MlModels:
    model.select_MLP_regressor(hidden_layer_sizes=layers, random_state=seed,
                               early_stopping=True)

param_grid = [{
    'n_iter_no_change': [10, 20, 50, 100],
    'learning_rate_init': [0.1, 0.05, 0.01, 0.001],
}]

print("Neural network (5:100)\n============")
for model in MlModels:
    print("-----------\n" + model.model_name )
    model.cv_hyperpara_fit(param_grid=param_grid, verbose=0, n_jobs=3)
    mse_var, mse, r2 = model.train_test_error()

store_results()

Neural network (5:100)
-----------
abalone
{'learning_rate_init': 0.01, 'n_iter_no_change': 50}
MSE/σ2 = 0.44820 | MSE = 0.46860 | R2(score) = 0.55126
-----------
boston
{'learning_rate_init': 0.001, 'n_iter_no_change': 20}
MSE/σ2 = 0.08638 | MSE = 0.08908 | R2(score) = 0.91276
-----------
california
{'learning_rate_init': 0.001, 'n_iter_no_change': 20}
MSE/σ2 = 0.19171 | MSE = 0.19585 | R2(score) = 0.80825
-----------
unemploy
{'learning_rate_init': 0.01, 'n_iter_no_change': 10}
MSE/σ2 = 0.24945 | MSE = 0.32977 | R2(score) = 0.74461
-----------
infl
{'learning_rate_init': 0.001, 'n_iter_no_change': 50}
MSE/σ2 = 0.94119 | MSE = 0.68690 | R2(score) = 0.03640
-----------
wine
{'learning_rate_init': 0.001, 'n_iter_no_change': 10}
MSE/σ2 = 0.62520 | MSE = 0.68573 | R2(score) = 0.37417
-----------
fish
{'learning_rate_init': 0.01, 'n_iter_no_change': 10}
MSE/σ2 = 0.45896 | MSE = 0.46280 | R2(score) = 0.53851


In [24]:
# Neural network (2:5) cross-validation and prediction
# Two hidden layers of 5 units with early stopping and max_iter=1000; the
# grid tunes the early-stopping patience and the initial learning rate.
layers = (5, 5)
for model in MlModels:
    model.select_MLP_regressor(hidden_layer_sizes=layers, random_state=seed,
                               early_stopping=True, max_iter=1000)

param_grid = [{
    'n_iter_no_change': [10, 20, 50, 100],
    'learning_rate_init': [0.1, 0.05, 0.01, 0.001],
}]

print("Neural network (2:5)\n============")
for model in MlModels:
    print("-----------\n" + model.model_name )
    model.cv_hyperpara_fit(param_grid=param_grid, verbose=0, n_jobs=3)
    mse_var, mse, r2 = model.train_test_error()

store_results()

Neural network (2:5)
-----------
abalone
{'learning_rate_init': 0.1, 'n_iter_no_change': 100}
MSE/σ2 = 0.44296 | MSE = 0.46312 | R2(score) = 0.55651
-----------
boston
{'learning_rate_init': 0.01, 'n_iter_no_change': 50}
MSE/σ2 = 0.14165 | MSE = 0.14607 | R2(score) = 0.85695
-----------
california
{'learning_rate_init': 0.01, 'n_iter_no_change': 100}
MSE/σ2 = 0.23282 | MSE = 0.23785 | R2(score) = 0.76712
-----------
unemploy
{'learning_rate_init': 0.001, 'n_iter_no_change': 20}
MSE/σ2 = 0.57572 | MSE = 0.76109 | R2(score) = 0.41057
-----------
infl
{'learning_rate_init': 0.01, 'n_iter_no_change': 10}
MSE/σ2 = 2.75116 | MSE = 2.00785 | R2(score) = -1.81666
-----------
wine
{'learning_rate_init': 0.001, 'n_iter_no_change': 50}
MSE/σ2 = 0.67287 | MSE = 0.73802 | R2(score) = 0.32644
-----------
fish
{'learning_rate_init': 0.05, 'n_iter_no_change': 10}
MSE/σ2 = 0.45061 | MSE = 0.45438 | R2(score) = 0.54690


In [15]:
# Bagging lasso cross-validation and prediction
# The grid is kept under its own name only, so that the `alphas` base grid
# used by the Lasso and Elastic Net cells is not overwritten: re-running
# those cells after this one must give the same grids as before.
bagging_alpha = np.logspace(-6, 1, 280).tolist() + np.logspace(0.1, 1, 20).tolist()

print("Bagging lasso\n============")
for n in MlModels:
    print("-----------\n" + n.model_name )
    n.bagging_lasso(frac=0.85, replace=True, random_state=seed,
                    alphas=bagging_alpha, max_iter=50000, n_jobs=3)

    # Keep the alphas retained by the bagging procedure for this dataset
    list_bag_alphas.append(n.get_bagging_lasso_alphas())

store_results()

Bagging lasso
-----------
abalone
MSE/σ2 = 0.45696 | MSE = 0.47775 | R2(score) = 0.54250
-----------
boston
MSE/σ2 = 0.26489 | MSE = 0.27316 | R2(score) = 0.73249
-----------
california
MSE/σ2 = 0.34694 | MSE = 0.35444 | R2(score) = 0.65298
-----------
unemploy
MSE/σ2 = 0.40394 | MSE = 0.53400 | R2(score) = 0.58644
-----------
infl
MSE/σ2 = 1.32397 | MSE = 0.96626 | R2(score) = -0.35549
-----------
wine
MSE/σ2 = 0.72125 | MSE = 0.79108 | R2(score) = 0.27802
-----------
fish
MSE/σ2 = 0.45349 | MSE = 0.45729 | R2(score) = 0.54401


## Store and save results for analysis

In [26]:
sep  = ","
path = paths[1]

# Bagging Lasso alphas
# Layout: header row = dataset names, then one row per stored alpha with
# one column per dataset.
file_name = "bag_lambda.txt"
fname     = path + file_name

with open(fname, 'w',  encoding="utf-8") as f:
    f.write(sep.join("%s" % n for n in MlModel_names) + "\n")

    # zip(*...) turns the per-dataset alpha lists into rows; unlike indexing
    # through list_bag_alphas[0] it cannot raise on empty or uneven lists.
    for row in zip(*list_bag_alphas):
        f.write(sep.join("%s" % value for value in row) + "\n")
# No explicit f.close(): the with-block has already closed the file.

# MSE/σ2
# Layout: header row = method names (first cell left empty), then one row
# per dataset holding its name followed by its stored MSE/σ2 values.
file_name = "mse_var.txt"

fname = path + file_name

with open(fname, 'w',  encoding="utf-8") as f:
    header_cells = [""] + ["%s" % method for method in methods]
    f.write(sep.join(header_cells) + "\n")

    for row_idx, scores in enumerate(list_mse_var):
        row_cells = [MlModel_names[row_idx]] + ["%s" % score for score in scores]
        f.write(sep.join(row_cells) + "\n")
        
# Hyperparameter CV
# Layout: header row = hyperparameter labels (first cell left empty), then
# one row per dataset holding its name followed by the selected values.
file_name = "hyperpara.txt"    
fname = path + file_name

hyper_para = ["L_lambda ", "R_lambda ", "EN_lambda ", "EN_alpha ", "RT_depth ", 
              "BT_learn_rate ", "BT_depth ", "BT_#tree ", "RF_mtry ", "NN_5:100_learn_rate ", 
              "NN_5:100_early_stop ", "NN_2:5_learn_rate ", "NN_2:5_early_stop "]

# For each column of hyper_para (same order): position of the method in a
# dataset's hyperparameter list and the key to read from that method's dict.
hyper_para_lookup = [
    (0, 'alpha'),
    (1, 'alpha'),
    (2, 'alpha'), (2, 'l1_ratio'),
    (3, 'max_depth'),
    (4, 'learning_rate'), (4, 'max_depth'), (4, 'n_estimators'),
    (5, 'max_features'),
    (6, 'learning_rate_init'), (6, 'n_iter_no_change'),
    (7, 'learning_rate_init'), (7, 'n_iter_no_change'),
]

with open(fname, 'w',  encoding="utf-8") as f:
    header_cells = [""] + ["%s" % label for label in hyper_para]
    f.write(sep.join(header_cells) + "\n")

    for row_idx, item in enumerate(list_hyperpara):
        row_cells = [MlModel_names[row_idx]]
        row_cells += ["%s" % item[pos].get(key) for pos, key in hyper_para_lookup]
        f.write(sep.join(row_cells) + "\n")

# Features importances
# Two files with the same layout: header row = var1..var10 (first cell left
# empty), then alternating "names" / "values" rows for california, unemploy
# and infl. One file holds the Lasso results, the other the Random Forest ones.
cols_name = ["var" + str(rank) for rank in range(1, 11)]
rows_name = ["California_names", "California_values", "unemploy_names", "unemploy_values", 
             "infl_names", "infl_values"]

file_name   = "Lasso_coefficients.txt"    
fname_lasso = path + file_name
file_name   = "RF_features_importances.txt" 
fname_RF    = path + file_name 

with open(fname_lasso, 'w',  encoding="utf-8") as a, \
     open(fname_RF, 'w',  encoding="utf-8") as b:

    header = sep.join([""] + ["%s" % col for col in cols_name])
    a.write(header + "\n")
    b.write(header + "\n")

    for i in range(len(Lasso_var_import)):
        row_label = rows_name[i]

        lasso_cells = [row_label] + ["%s" % n for n in Lasso_var_import[i]]
        a.write(sep.join(lasso_cells) + "\n")

        rf_cells = [row_label] + ["%s" % n for n in RF_var_import[i]]
        b.write(sep.join(rf_cells) + "\n")