# Machine Learning Exercise 2 - Regression and AutoML

In [1]:
import time
import math
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Perceptron

import warnings

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 999)

def supress_warnings(suppress):
    if suppress:
        warnings.filterwarnings('ignore') # hide all warnings
    else:
        warnings.filterwarnings('default') # default warnings settings

_print_hill_step = False
def show_print_hill_step(show):
    if show:
        _print_hill_step = True
    else:
        _print_hill_step = False

In [2]:
def all_lower(val, list):
    """
    returns False if there is an element in list greater val, True otherwise (including list=[])
    """
    return(all(x < val for x in list))

def fit_elastic(params, X_train, y_train):
    elastic_model = ElasticNet(**params)
    elastic_model.fit(X_train, y_train)
    return elastic_model

def fit_tree(params, X_train, y_train):
    tree_model = RandomForestRegressor(**params)
    tree_model.fit(X_train, y_train)
    return tree_model

def fit_mlp(params, X_train, y_train):
    mlp_model = MLPRegressor(**params)
    mlp_model.fit(X_train, y_train)
    return mlp_model

def build_generic(fit_fun, params, X_train, X_test, y_train, y_test):
    timing = None
    score = None

    start = time.time()
    model = eval(f'{fit_fun}(params, X_train, y_train.values.ravel())')
    timing = time.time() - start
            
    r2 = model.score(X_test, y_test.values.ravel())
    
    return timing, r2

def build_generic_with_cv(fit_fun, params, X, y, print_cv_res=False):
    timing = None
    score = None

    start = time.time()
    model = eval(f'{fit_fun}(params, X, y.values.ravel())')
    timing = time.time() - start
    
    cv_res = cross_val_score(model, X, y.values.ravel(), cv=10, scoring='r2')
    r2 = cv_res.mean()
    
    if print_cv_res:
        print(cv_res)
    
    return timing, r2

def step(stepped_params, params_dict):
    """
    Step the stepped_params according to the params_dict, which is formatted like {'param_name': [start, stop, step_size], ...} (e.g. [1,20,1])
    
    step_size can be negative, in this case start should be greater than stop (e.g. [20,1,-1])
    """
    
    # TODO: change only one param at a time until no improvement
    # TODO: add randomness
    
    for p in params_dict:
        if params_dict[p][0] >  params_dict[p][1]:
            # descending
            if (stepped_params[p] + params_dict[p][2]) < params_dict[p][1]:
                stepped_params[p] = params_dict[p][1]
            else:
                stepped_params[p] = stepped_params[p] + params_dict[p][2]
        else:
            # ascending
            if (stepped_params[p] + params_dict[p][2]) > params_dict[p][1]:
                stepped_params[p] = params_dict[p][1]
            else:
                stepped_params[p] = stepped_params[p] + params_dict[p][2]
    return stepped_params



def climb_generic(n_iter, fit_fun, params_dict, X_train, X_test, y_train, y_test):
    """
    Generic function to hillclimb tune *one* algorithm's parameters for *one* dataset
    
    fit_fun should be in (fit_elastic, fit_tree, fit_mlp)
    params_dict is a dict formatted like {'param_name': [start, stop, step_size], ...} (e.g. [1,20,1]), also works for negative descent which needs a negative step_size
    """
    # get starting values
    res = {}
    curr_params = {}
    for p in params_dict:
        curr_params[p] = params_dict[p][0]
    
    ret_timing, score = build_generic(fit_fun, curr_params, X_train, X_test, y_train, y_test)

    for i in range(n_iter):
        stepped_params = step(curr_params, params_dict)
        ret_timing, ret_score = build_generic(fit_fun, stepped_params, X_train, X_test, y_train, y_test)

        if ret_score > score:
            curr_params, score = stepped_params, ret_score
            for p in curr_params:
                res[p] = curr_params[p]
            if _print_hill_step:
                print((f'iter {i} {res}: {score}'))
            
    
    return res, score

def climb_generic_with_cv(n_iter, fit_fun, params_dict, X, y, print_cv_res=False):
    """
    Generic function to hillclimb tune *one* algorithm's parameters for *one* dataset, using 5-fold cross validation with stratified splitting
    
    fit_fun should be in (fit_elastic, fit_tree, fit_mlp)
    params_dict is a dict formatted like {'param_name': [start, stop, step_size], ...} (e.g. [1,20,1]), also works for negative descent which needs a negative step_size
    """
    # get starting values
    res = {}
    curr_params = {}
    for p in params_dict:
        res[p] = params_dict[p][0]
        curr_params[p] = params_dict[p][0]
    
    ret_timing, score = build_generic_with_cv(fit_fun, curr_params, X, y, print_cv_res)

    for i in range(n_iter):
        stepped_params = step(curr_params, params_dict)
        ret_timing, ret_score = build_generic_with_cv(fit_fun, stepped_params, X, y, print_cv_res)

        if ret_score > score:
            curr_params, score = stepped_params, ret_score
            for p in curr_params:
                res[p] = curr_params[p]
            if _print_hill_step:
                print((f'iter {i} {res}: {score}'))
            
    
    return res, score

In [3]:
def auto_ml(n_iter, alg_list, X, y):
    r2_to_beat = 0
    results = []
    winner = None
    
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    for alg, params_alg in alg_list:
        print(f'>>>>> tuning {alg} with params {params_alg} <<<<<')
        
        res_params, score = climb_generic(n_iter, alg, params_alg, X_train, X_test, y_train, y_test)
        print(f'{alg} {res_params}: {score}')
            
        res_dict = {
            'alg': alg,
            'res_params': res_params,
            'score': score,
        }
        results.append(res_dict)
        
        if score > r2_to_beat:
            winner = res_dict
        
    return winner, results

def auto_ml_with_cv(n_iter, alg_list, X, y):
    r2_to_beat = 0
    results = []
    winner = None
    
    for alg, params_alg in alg_list:
        print(f'>>>>> tuning {alg} with params {params_alg} <<<<<')
        
        res_params, score = climb_generic_with_cv(n_iter, alg, params_alg, X, y)
        print(f'{alg} {res_params}: {score}')
            
        res_dict = {
            'alg': alg,
            'res_params': res_params,
            'score': score,
        }
        results.append(res_dict)
        
        if score > r2_to_beat:
            winner = res_dict
        
    return winner, results

In [4]:
"""
parameter spaces to search
"""

params_elastic = {
    'l1_ratio': [1,0,-0.01],
    'max_iter': [1000,100000,1000],
    'tol': [0.00001, 0.0001, 0.000001]
}

params_tree = {
    'max_depth': [1,20,1],
    'min_samples_leaf': [1,10,1],
}

params_mlp = {
    'alpha': [0.0001,0.001,0.00001],
    'n_iter_no_change': [14,6,-1],
    'max_iter': [200,400,10]
}

alg_list = []
alg_list.append(['fit_elastic', params_elastic])
alg_list.append(['fit_tree', params_tree])
alg_list.append(['fit_mlp', params_mlp])

In [5]:
supress_warnings(True)
show_print_hill_step(False)
notebook_time = time.time()

## Concrete Data  

https://www.kaggle.com/prathamtripathi/regression-with-neural-networking

In [6]:
concrete = pd.read_csv('concrete/concrete_data.csv')
display(concrete)

# preprocessing...

concrete_X = concrete.drop('Strength', axis=1)
concrete_y = pd.DataFrame(concrete['Strength'])

concrete_winner, concrete_results = auto_ml_with_cv(10, alg_list, concrete_X, concrete_y)
concrete_winner

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


>>>>> tuning fit_elastic with params {'l1_ratio': [1, 0, -0.01], 'max_iter': [1000, 100000, 1000], 'tol': [1e-05, 0.0001, 1e-06]} <<<<<
fit_elastic {'l1_ratio': 0.8999999999999999, 'max_iter': 11000, 'tol': 2.0000000000000005e-05}: 0.27761698041396965
>>>>> tuning fit_tree with params {'max_depth': [1, 20, 1], 'min_samples_leaf': [1, 10, 1]} <<<<<
fit_tree {'max_depth': 8, 'min_samples_leaf': 8}: 0.6735911052562801
>>>>> tuning fit_mlp with params {'alpha': [0.0001, 0.001, 1e-05], 'n_iter_no_change': [14, 6, -1], 'max_iter': [200, 400, 10]} <<<<<
fit_mlp {'alpha': 0.0001, 'n_iter_no_change': 14, 'max_iter': 200}: 0.6771521596352904


{'alg': 'fit_mlp',
 'res_params': {'alpha': 0.0001, 'n_iter_no_change': 14, 'max_iter': 200},
 'score': 0.6771521596352904}

In [7]:
# show_print_hill_step(True)

auto_ml(20, alg_list, concrete_X, concrete_y)
#climb_generic_with_cv(20, 'fit_tree', params_tree, concrete_X, concrete_y, print_cv_res=True)

>>>>> tuning fit_elastic with params {'l1_ratio': [1, 0, -0.01], 'max_iter': [1000, 100000, 1000], 'tol': [1e-05, 0.0001, 1e-06]} <<<<<
fit_elastic {}: 0.5674634080606947
>>>>> tuning fit_tree with params {'max_depth': [1, 20, 1], 'min_samples_leaf': [1, 10, 1]} <<<<<
fit_tree {'max_depth': 8, 'min_samples_leaf': 8}: 0.8744487175296968
>>>>> tuning fit_mlp with params {'alpha': [0.0001, 0.001, 1e-05], 'n_iter_no_change': [14, 6, -1], 'max_iter': [200, 400, 10]} <<<<<
fit_mlp {}: 0.857696792366163


({'alg': 'fit_mlp', 'res_params': {}, 'score': 0.857696792366163},
 [{'alg': 'fit_elastic', 'res_params': {}, 'score': 0.5674634080606947},
  {'alg': 'fit_tree',
   'res_params': {'max_depth': 8, 'min_samples_leaf': 8},
   'score': 0.8744487175296968},
  {'alg': 'fit_mlp', 'res_params': {}, 'score': 0.857696792366163}])

## Seoul Bike Data  

https://archive-beta.ics.uci.edu/ml/datasets/seoul+bike+sharing+demand

In [10]:
seoulbike = pd.read_csv('seoulbike/SeoulBikeData.csv')

seoulbike = seoulbike.apply(LabelEncoder().fit_transform)
seoulbike = seoulbike.drop('Date', axis=1)
display(seoulbike)



seoulbike_X = seoulbike.drop('Rented Bike Count', axis=1)
seoulbike_y = seoulbike['Rented Bike Count']

auto_ml(10, alg_list, seoulbike_X, seoulbike_y)

Unnamed: 0,Rented Bike Count,Hour,Temperature(degC),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(degC),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,253,0,111,28,22,1788,114,0,0,0,3,1,1
1,203,1,108,29,8,1788,114,0,0,0,3,1,1
2,172,2,103,30,10,1788,113,0,0,0,3,1,1
3,106,3,101,31,9,1788,114,0,0,0,3,1,1
4,77,4,103,27,23,1788,104,0,0,0,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,991,19,205,25,26,1682,187,0,0,0,0,1,1
8756,755,20,197,28,23,1788,191,0,0,0,0,1,1
8757,686,21,189,30,3,1756,191,0,0,0,0,1,1
8758,704,22,184,32,10,1648,192,0,0,0,0,1,1


>>>>> tuning fit_elastic with params {'l1_ratio': [1, 0, -0.01], 'max_iter': [1000, 100000, 1000], 'tol': [1e-05, 0.0001, 1e-06]} <<<<<
fit_elastic {}: 0.5921322797165363
>>>>> tuning fit_tree with params {'max_depth': [1, 20, 1], 'min_samples_leaf': [1, 10, 1]} <<<<<
fit_tree {'max_depth': 11, 'min_samples_leaf': 10}: 0.8678702365738358
>>>>> tuning fit_mlp with params {'alpha': [0.0001, 0.001, 1e-05], 'n_iter_no_change': [14, 6, -1], 'max_iter': [200, 400, 10]} <<<<<
fit_mlp {'alpha': 0.00015000000000000001, 'n_iter_no_change': 9, 'max_iter': 250}: 0.6345587182953808


({'alg': 'fit_mlp',
  'res_params': {'alpha': 0.00015000000000000001,
   'n_iter_no_change': 9,
   'max_iter': 250},
  'score': 0.6345587182953808},
 [{'alg': 'fit_elastic', 'res_params': {}, 'score': 0.5921322797165363},
  {'alg': 'fit_tree',
   'res_params': {'max_depth': 11, 'min_samples_leaf': 10},
   'score': 0.8678702365738358},
  {'alg': 'fit_mlp',
   'res_params': {'alpha': 0.00015000000000000001,
    'n_iter_no_change': 9,
    'max_iter': 250},
   'score': 0.6345587182953808}])

## Breast Cancer Data

In [None]:
# https://www.kaggle.com/c/184702-tu-ml-ws-21-breast-cancer/data#
breastcancer = pd.read_csv('./breastcancer/breast-cancer-diagnostic.shuf.lrn.csv')

breastcancer = breastcancer.apply(LabelEncoder().fit_transform)
display(breastcancer)


breastcancer_X = breastcancer.drop('class', axis=1)
breastcancer_y = breastcancer['class']

auto_ml_with_cv(10, alg_list, breastcancer_X, breastcancer_y)

In [None]:
print(f'notebook took this long in seconds: {time.time()-notebook_time}')