# Machine Learning Exercise 2 - Regression and AutoML

In [1]:
import time
import math
import random
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Perceptron

import warnings

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 999)

def supress_warnings(suppress):
    if suppress:
        warnings.filterwarnings('ignore') # hide all warnings
    else:
        warnings.filterwarnings('default') # default warnings settings

_print_hill_step = False

In [2]:
def all_lower(val, list):
    """
    returns False if there is an element in list greater val, True otherwise (including list=[])
    """
    return(all(x < val for x in list))

def fit_elastic(params, X_train, y_train):
    elastic_model = ElasticNet(**params)
    elastic_model.fit(X_train, y_train)
    return elastic_model

def fit_tree(params, X_train, y_train):
    tree_model = DecisionTreeRegressor(**params)
    tree_model.fit(X_train, y_train)
    return tree_model

def fit_mlp(params, X_train, y_train):
    mlp_model = MLPRegressor(**params)
    mlp_model.fit(X_train, y_train)
    return mlp_model

def build_generic(fit_fun, params, X_train, X_test, y_train, y_test):
    timing = None
    score = None

    start = time.time()
    model = eval(f'{fit_fun}(params, X_train, y_train.values.ravel())')
    timing = time.time() - start
            
    r2 = model.score(X_test, y_test.values.ravel())
    
    return timing, r2

def build_generic_with_cv(fit_fun, params, X, y, print_cv_res=False):
    timing = None
    score = None

    start = time.time()
    model = eval(f'{fit_fun}(params, X, y.values.ravel())')
    timing = time.time() - start
    
    cv_res = cross_val_score(model, X, y.values.ravel(), cv=5, scoring='r2')
    r2 = cv_res.mean()
    
    if print_cv_res:
        print(cv_res)
    
    return timing, r2


def step(stepped_params, params_dict, active_param, previous_params):
    """
    Step the stepped_params according to the params_dict, which is formatted like {'param_name': [start, stop, step_size], ...} (e.g. [1,20,1])
    
    step_size can be negative, in this case start should be greater than stop (e.g. [20,1,-1])
    """  
        
    if not active_param:
        # new param has to be chosen at random from ones not used yet
        
        if not previous_params:
            # choose the first parameter to start stepping
            active_param = random.choice(list(stepped_params.keys()))
        else:
            for p in  stepped_params:
                if p in previous_params:
                    # found a yet unused param
                    continue
                active_param = p

    if not active_param:
        # no param was left to choose, reset previous and active params to start again
        return stepped_params, params_dict, None, []
   
    else:
        mult = random.sample([1,1,1,1,2], k=1)
        step_size = params_dict[active_param][2]
        step_size = step_size*2
        if math.copysign(1, step_size) < 0:
            # descending step
            if (stepped_params[active_param] + step_size) <= params_dict[active_param][1]:
                stepped_params[active_param] = params_dict[active_param][1]
                previous_params.append(active_param)
                active_param = None
            else:
                stepped_params[active_param] = stepped_params[active_param] + step_size
        else:
            # ascending step
            if (stepped_params[active_param] + step_size) >= params_dict[active_param][1]:
                stepped_params[active_param] = params_dict[active_param][1]
                previous_params.append(active_param)
                active_param = None
            else:
                stepped_params[active_param] = stepped_params[active_param] + step_size
    
        
        return stepped_params, params_dict, active_param, previous_params
    



def climb_generic(n_iter, fit_fun, params_dict, X_train, X_test, y_train, y_test):
    """
    Generic function to hillclimb tune *one* algorithm's parameters for *one* dataset
    
    fit_fun should be in (fit_elastic, fit_tree, fit_mlp)
    params_dict is a dict formatted like {'param_name': [start, stop, step_size], ...} (e.g. [1,20,1]), also works for negative descent which needs a negative step_size
    """
    # get starting values
    res = {}
    curr_params = {}
    active_param = ''
    previous_params = []
    
    for p in params_dict:
        curr_params[p] = params_dict[p][0]
    
    ret_timing, score = build_generic(fit_fun, curr_params, X_train, X_test, y_train, y_test)
    
    for i in range(n_iter):
        stepped_params, params_dict, active_param, previous_params = step(curr_params, params_dict, active_param, previous_params)
        ret_timing, ret_score = build_generic(fit_fun, stepped_params, X_train, X_test, y_train, y_test)
        
        if ret_score > score:
            curr_params, score = stepped_params, ret_score
            for p in curr_params:
                res[p] = curr_params[p]
            if _print_hill_step:
                print((f'iter {i} {res}: {score}'))
        else:
            previous_params.append(active_param)
            active_param = None
            
    
    return res, score

def climb_generic_with_cv(n_iter, fit_fun, params_dict, X, y, print_cv_res=False):
    """
    Generic function to hillclimb tune *one* algorithm's parameters for *one* dataset, using 5-fold cross validation with stratified splitting
    
    fit_fun should be in (fit_elastic, fit_tree, fit_mlp)
    params_dict is a dict formatted like {'param_name': [start, stop, step_size], ...} (e.g. [1,20,1]), also works for negative descent which needs a negative step_size
    """
    
    return climb_generic(n_iter, fit_fun, params_dict, X, y, print_cv_res)
    # get starting values
    res = {}
    curr_params = {}
    active_param = ''
    previous_params = []
    
    for p in params_dict:
        curr_params[p] = params_dict[p][0]
    
    ret_timing, score = build_generic_with_cv(fit_fun, curr_params, X_train, X_test, y_train, y_test, print_cv_res)
    
    for i in range(n_iter):
        stepped_params, params_dict, active_param, previous_params = step(curr_params, params_dict, active_param, previous_params)
        ret_timing, ret_score = build_generic_with_cv(fit_fun, stepped_params, X_train, X_test, y_train, y_test, print_cv_res)
        
        if ret_score > score:
            curr_params, score = stepped_params, ret_score
            for p in curr_params:
                res[p] = curr_params[p]
            if _print_hill_step:
                print((f'iter {i} {res}: {score}'))
        else:
            previous_params.append(active_param)
            active_param = None
            
    
    return res, score

In [3]:
def auto_ml(n_iter, alg_list, X, y):
    r2_to_beat = 0
    results = []
    winner = None
    
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    for alg, params_alg in alg_list:
        print(f'>>>>> tuning {alg} with params {params_alg} <<<<<')
        
        res_params, score = climb_generic(n_iter, alg, params_alg, X_train, X_test, y_train, y_test)
        print(f'{alg} {res_params}: {score}')
            
        res_dict = {
            'alg': alg,
            'res_params': res_params,
            'score': score,
        }
        results.append(res_dict)
        
        if score > r2_to_beat:
            winner = res_dict
        
    return winner, results

def auto_ml_with_cv(n_iter, alg_list, X, y):
    r2_to_beat = 0
    results = []
    winner = None
    
    for alg, params_alg in alg_list:
        print(f'>>>>> tuning {alg} with params {params_alg} <<<<<')
        
        res_params, score = climb_generic_with_cv(n_iter, alg, params_alg, X, y)
        print(f'{alg} {res_params}: {score}')
            
        res_dict = {
            'alg': alg,
            'res_params': res_params,
            'score': score,
        }
        results.append(res_dict)
        
        if score > r2_to_beat:
            winner = res_dict
        
    return winner, results

In [4]:
"""
parameter spaces to search
"""

params_elastic = {
    'alpha': [0.1, 10, 0.1],
    'l1_ratio': [0,1,0.1],
    'max_iter': [1000,100000,1000],
}

params_tree = {
    'max_depth': [4,32,4],
    'min_samples_split': [2,32,2],
    'min_samples_leaf': [4,64,4],
}

params_mlp = {
    'alpha': [0.001,0.01,0.001],
    'n_iter_no_change': [16,8,-1],
    'max_iter': [100,400,20]
}

alg_list = []
alg_list.append(['fit_elastic', params_elastic])
alg_list.append(['fit_tree', params_tree])
alg_list.append(['fit_mlp', params_mlp])

In [5]:
supress_warnings(True)
_print_hill_step = False
notebook_time = time.time()

## Breast Cancer Data

https://www.kaggle.com/c/184702-tu-ml-ws-21-breast-cancer/data#

In [6]:
breastcancer = pd.read_csv('preprocessed_data/breast-cancer-diagnostic.shuf.lrn_processed.csv', index_col=0)

breastcancer = breastcancer.apply(LabelEncoder().fit_transform)
display(breastcancer)

breastcancer_X = breastcancer.drop('target', axis=1)
breastcancer_y = breastcancer['target']

auto_ml(100, alg_list, breastcancer_X, breastcancer_y)

Unnamed: 0,target,0,1,2,3,4
0,0,115,24,67,216,99
1,0,57,94,121,97,51
2,1,180,59,119,151,192
3,0,29,260,283,6,279
4,0,222,278,169,284,6
...,...,...,...,...,...,...
280,1,280,8,6,181,222
281,1,210,217,37,78,215
282,0,56,48,100,224,58
283,1,238,19,153,193,139


>>>>> tuning fit_elastic with params {'alpha': [0.1, 10, 0.1], 'l1_ratio': [0, 1, 0.1], 'max_iter': [1000, 100000, 1000]} <<<<<
fit_elastic {'alpha': 10, 'l1_ratio': 0, 'max_iter': 1000}: 0.6448203193863158
>>>>> tuning fit_tree with params {'max_depth': [4, 32, 4], 'min_samples_split': [2, 32, 2], 'min_samples_leaf': [4, 64, 4]} <<<<<
fit_tree {}: 0.7488779064311066
>>>>> tuning fit_mlp with params {'alpha': [0.001, 0.01, 0.001], 'n_iter_no_change': [16, 8, -1], 'max_iter': [100, 400, 20]} <<<<<
fit_mlp {'alpha': 0.01, 'n_iter_no_change': 8, 'max_iter': 400}: -3.9285244961599313


({'alg': 'fit_tree', 'res_params': {}, 'score': 0.7488779064311066},
 [{'alg': 'fit_elastic',
   'res_params': {'alpha': 10, 'l1_ratio': 0, 'max_iter': 1000},
   'score': 0.6448203193863158},
  {'alg': 'fit_tree', 'res_params': {}, 'score': 0.7488779064311066},
  {'alg': 'fit_mlp',
   'res_params': {'alpha': 0.01, 'n_iter_no_change': 8, 'max_iter': 400},
   'score': -3.9285244961599313}])

## Concrete Data  

https://www.kaggle.com/prathamtripathi/regression-with-neural-networking

In [7]:
concrete = pd.read_csv('preprocessed_data/concrete_data_processed.csv', index_col=0)
display(concrete)

# preprocessing...

concrete_X = concrete.drop('Strength', axis=1)
concrete_y = pd.DataFrame(concrete['Strength'])

concrete_winner, concrete_results = auto_ml(100, alg_list, concrete_X, concrete_y)
concrete_winner

Unnamed: 0,Strength,0,1,2,3,4
0,79.99,0.214751,-0.396354,1.179814,0.125379,1.791528
1,61.89,0.217092,-0.405260,1.316810,0.112153,1.772801
2,40.27,5.606084,-0.435139,-0.407134,0.606248,0.280933
3,41.05,7.384075,0.295037,-0.399784,0.442201,0.292490
4,44.30,6.291732,2.459109,-0.156774,-0.423362,-0.869309
...,...,...,...,...,...,...
1025,44.28,-0.021691,0.374138,-0.863457,1.032798,0.113360
1026,31.18,-0.071329,0.483104,-1.429798,0.329372,0.130083
1027,23.70,0.080248,0.148299,-0.785831,1.135922,-0.881579
1028,32.77,-0.127364,0.365954,-0.014275,0.949949,-0.477926


>>>>> tuning fit_elastic with params {'alpha': [0.1, 10, 0.1], 'l1_ratio': [0, 1, 0.1], 'max_iter': [1000, 100000, 1000]} <<<<<
fit_elastic {'alpha': 0.1, 'l1_ratio': 0, 'max_iter': 3000}: 0.5084868095956344
>>>>> tuning fit_tree with params {'max_depth': [4, 32, 4], 'min_samples_split': [2, 32, 2], 'min_samples_leaf': [4, 64, 4]} <<<<<
fit_tree {'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 12}: 0.6404796395063501
>>>>> tuning fit_mlp with params {'alpha': [0.001, 0.01, 0.001], 'n_iter_no_change': [16, 8, -1], 'max_iter': [100, 400, 20]} <<<<<
fit_mlp {'alpha': 0.01, 'n_iter_no_change': 8, 'max_iter': 400}: 0.7167436605393384


{'alg': 'fit_mlp',
 'res_params': {'alpha': 0.01, 'n_iter_no_change': 8, 'max_iter': 400},
 'score': 0.7167436605393384}

## Seoul Bike Data  

https://archive-beta.ics.uci.edu/ml/datasets/seoul+bike+sharing+demand

In [8]:
seoulbike = pd.read_csv('preprocessed_data/SeoulBikeData_processed.csv', index_col=0) # index_col=0 since there is an index column
display(seoulbike)

seoulbike_X = seoulbike.drop('Rented Bike Count', axis=1)
seoulbike_y = seoulbike['Rented Bike Count']

auto_ml(100, alg_list, seoulbike_X, seoulbike_y)

Unnamed: 0,Rented Bike Count,0,1,2,3,4,5,6,7
0,254,-1.143428,1.720607,0.576782,0.171896,-0.362174,-0.509898,1.001663,-0.057253
1,204,-0.932272,1.855252,0.185010,0.350632,-0.943400,-0.811276,0.434694,-0.070336
2,173,-0.963323,1.843890,0.251450,0.322785,-0.806922,-0.760837,0.440454,-0.052111
3,107,-0.943335,1.849751,0.227829,0.331231,-0.802215,-0.779956,0.339291,-0.040505
4,78,-1.215630,1.710387,0.688144,0.203102,-0.143203,-0.506954,0.759400,-0.028478
...,...,...,...,...,...,...,...,...,...
8755,1003,-0.599769,0.239326,0.233114,1.551041,0.900681,0.632778,-0.111454,-0.099430
8756,764,-0.546395,0.269275,0.142696,1.615954,0.829516,0.544198,-0.251335,-0.094189
8757,694,-0.234729,0.479374,-0.427990,1.848822,-0.024300,0.137934,-1.038696,-0.107280
8758,712,-0.308480,0.428752,-0.239147,1.691474,0.331681,0.350278,-0.878422,-0.058397


>>>>> tuning fit_elastic with params {'alpha': [0.1, 10, 0.1], 'l1_ratio': [0, 1, 0.1], 'max_iter': [1000, 100000, 1000]} <<<<<
fit_elastic {'alpha': 0.30000000000000004, 'l1_ratio': 1, 'max_iter': 3000}: 0.4696670117932351
>>>>> tuning fit_tree with params {'max_depth': [4, 32, 4], 'min_samples_split': [2, 32, 2], 'min_samples_leaf': [4, 64, 4]} <<<<<
fit_tree {'max_depth': 28, 'min_samples_split': 6, 'min_samples_leaf': 28}: 0.6129739203400288
>>>>> tuning fit_mlp with params {'alpha': [0.001, 0.01, 0.001], 'n_iter_no_change': [16, 8, -1], 'max_iter': [100, 400, 20]} <<<<<
fit_mlp {'alpha': 0.01, 'n_iter_no_change': 8, 'max_iter': 400}: 0.6045875957905931


({'alg': 'fit_mlp',
  'res_params': {'alpha': 0.01, 'n_iter_no_change': 8, 'max_iter': 400},
  'score': 0.6045875957905931},
 [{'alg': 'fit_elastic',
   'res_params': {'alpha': 0.30000000000000004,
    'l1_ratio': 1,
    'max_iter': 3000},
   'score': 0.4696670117932351},
  {'alg': 'fit_tree',
   'res_params': {'max_depth': 28,
    'min_samples_split': 6,
    'min_samples_leaf': 28},
   'score': 0.6129739203400288},
  {'alg': 'fit_mlp',
   'res_params': {'alpha': 0.01, 'n_iter_no_change': 8, 'max_iter': 400},
   'score': 0.6045875957905931}])

In [9]:
print(f'notebook took this long in seconds: {time.time()-notebook_time}')

notebook took this long in seconds: 1029.4496788978577
