In [3]:
import numpy as np

import pandas as pd
from pandas.plotting import scatter_matrix
from sqlalchemy import create_engine
cnx = create_engine('sqlite:///:memory:')

from scipy.stats import randint
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer 

from sklearn.preprocessing import OrdinalEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import LassoLars

from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsTransformer

from sklearn.feature_selection import SelectFromModel 

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import BaggingClassifier

from sklearn.svm import SVR
from sklearn.svm import LinearSVR

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score         #not supporting multiclass
from sklearn.metrics import average_precision_score #not supporting multiclass

pd.set_option('display.max_column',None)
pd.set_option('display.max_row',1000)

pd.options.mode.use_inf_as_na = True
np.random.seed(42)

In [4]:
#num_attribs = pd.read_csv("s3://dadadata/iMa/num_attribs.csv")
#cat_attribs = pd.read_csv("s3://dadadata/iMa/cat_attribs.csv")
#runs_labels = pd.read_csv("s3://dadadata/iMa/runs_labels.csv")

num_attribs = pd.read_csv("/Users/mk2/Desktop/iMa/Model/num_attribs.csv")
cat_attribs = pd.read_csv("/Users/mk2/Desktop/iMa/Model/cat_attribs.csv")
runs_labels = pd.read_csv("/Users/mk2/Desktop/iMa/Model/runs_labels.csv")

num_attribs = num_attribs.drop(columns=["Unnamed: 0"])
runs_labels = runs_labels.drop(columns=["Unnamed: 0"])
cat_attribs = cat_attribs.drop(columns=["Unnamed: 0"])


Unnamed: 0,horse_country,horse_type,venue,config,going,horse_ratings,date,horse_gear,raced
0,AUS,Gelding,ST,A,GOOD TO FIRM,40-15,1997-06-02,--,0
1,NZ,Gelding,ST,A,GOOD TO FIRM,40-15,1997-06-02,--,0
2,NZ,Gelding,ST,A,GOOD TO FIRM,40-15,1997-06-02,--,0
3,SAF,Gelding,ST,A,GOOD TO FIRM,40-15,1997-06-02,--,0
4,GB,Gelding,ST,A,GOOD TO FIRM,40-15,1997-06-02,--,0
5,NZ,Gelding,ST,A,GOOD TO FIRM,40-15,1997-06-02,--,0
6,NZ,Gelding,ST,A,GOOD TO FIRM,40-15,1997-06-02,--,0
7,AUS,Gelding,ST,A,GOOD TO FIRM,40-15,1997-06-02,--,0
8,NZ,Gelding,ST,A,GOOD TO FIRM,40-15,1997-06-02,--,0
9,AUS,Mare,ST,A,GOOD TO FIRM,40-15,1997-06-02,--,0


# Regressions on numerical attributes ZONE !

#Linear Regression on num_attribs

In [4]:
linear_regression = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ("standard_scaler", StandardScaler(with_mean = True, with_std = True)),
    ("linear_reg", LinearRegression(fit_intercept= True, normalize=False)),
])

In [5]:
linear_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(linear_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error",n_jobs=-1, cv=10,verbose=3)
pd.Series(np.sqrt(-scores)).describe()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    2.0s remaining:    4.6s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    2.1s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.2s finished


count    10.000000
mean      3.206531
std       0.034833
min       3.165202
25%       3.175629
50%       3.207522
75%       3.225078
max       3.259848
dtype: float64

In [6]:
linear_regression_paragrid = {
    
    'imputer__strategy':['mean','median','most_frequent'],
    'standard_scaler__with_mean':[True, False],
    'standard_scaler__with_std':[True, False],
    'linear_reg__fit_intercept':[True, False],
    'linear_reg__normalize':[True, False]
}



In [None]:
linear_regression_search = RandomizedSearchCV(linear_regression, param_distributions=linear_regression_paragrid,
                                n_iter=1000, cv=5, scoring='neg_mean_squared_error',n_jobs=-1, random_state=42, verbose=3)
linear_regression_search.fit(num_attribs, runs_labels)

In [6]:
linear_regression_search.best_params_

{'standard_scaler__with_std': True,
 'standard_scaler__with_mean': True,
 'linear_reg__normalize': False,
 'linear_reg__fit_intercept': True,
 'imputer__strategy': 'mean'}



KNN Regression /Multiclass Classification/ Transformer

In [7]:
KNN_regression = Pipeline([
    
    ('imputer', SimpleImputer(strategy='median')),
    ("standard_scaler", StandardScaler()),
    ("KNN", KNeighborsRegressor(n_neighbors = 15)),
    
])



In [8]:
KNN_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(KNN_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error", cv=10, verbose=3,n_jobs=-1)
pd.Series(np.sqrt(-scores)).describe()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   57.5s remaining:  2.2min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   58.3s remaining:   25.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.5min finished


count    10.000000
mean      3.344829
std       0.033940
min       3.300218
25%       3.318486
50%       3.345625
75%       3.361793
max       3.404980
dtype: float64

In [12]:
KNN_classification = Pipeline([
    
    ('imputer', SimpleImputer()),
    ("standard_scaler", StandardScaler()),
    ("KNN", KNeighborsClassifier(n_neighbors = 15)),
    
])



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   56.5s remaining:  2.2min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   57.2s remaining:   24.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.5min finished


count    10.000000
mean      4.420086
std       0.052766
min       4.347985
25%       4.385676
50%       4.422192
75%       4.445493
max       4.509779
dtype: float64

In [None]:
KNN_classification.fit(num_attribs, np.ravel(runs_labels))
scores = cross_val_score(KNN_classification, num_attribs, runs_labels,scoring="neg_mean_squared_error", cv=10, verbose=3,n_jobs=-1)
pd.Series(np.sqrt(-scores)).describe()

# Stochastic Gradient Descent Regression

In [7]:
stochastic_regression = Pipeline([
    
    ('imputer', SimpleImputer()),
    ("standard_scaler", StandardScaler()),
    ("sgd_reg", SGDRegressor()),
    
])



  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   27.3s finished


count    1000.000000
mean        3.204247
std         0.260180
min         2.454320
25%         3.023265
50%         3.199291
75%         3.368492
max         4.097109
dtype: float64

In [None]:
stochastic_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(stochastic_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error", cv=1000, verbose=3,n_jobs=-1)
pd.Series(np.sqrt(-scores)).describe()

In [75]:
SGDRegressor.get_params(SGDRegressor).keys()

dict_keys(['alpha', 'average', 'early_stopping', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter', 'n_iter_no_change', 'penalty', 'power_t', 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [13]:
stochastic_regression_paragrid = {
    
    'imputer__strategy':['mean','median','most_frequent'],
    'standard_scaler__with_mean':[True, False],
    'standard_scaler__with_std':[True, False],
    'sdg_reg__alpha':np.linspace(1.6180e-8,1.6180e-3, num=16),
    'sdg_reg__early_stopping':[True, False],
    #'sdg_reg__epsilon':randint(low=0.01, high=1),
    'sdg_reg__eta0':np.linspace(1e-4,1e-1, num=10),
    'sdg_reg__fit_intercept':[True, False],
    #'sdg_reg__l1_ratio':randint(low=0, high=1),
    'sdg_reg__learning_rate':['constant','optimal','invscaling','adaptive'],
    'sdg_reg__loss':['squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive'],
    'sdg_reg__max_iter':randint(low=500, high=5000),
    'sdg_reg__n_iter':randint(low=1000, high=10000), 
    'sdg_reg__penalty':['l2','l1’','elasticnet'],
    'sdg_reg__power_t':np.linspace(1e-4,1e-1, num=10),
    'sdg_reg__shuffle':[True, False],    
    'sdg_reg__tol':[None,np.linspace(1.6180e-4,1.6180e-1, num=8)],
    'sdg_reg__validation_fraction':np.linspace(1e-3,1e-1, num=10), 
    
}




In [14]:
stochastic_regression_paragrid = RandomizedSearchCV(stochastic_regression, param_distributions=stochastic_regression_paragrid,
                                n_iter=100, cv=5, scoring='neg_mean_squared_error', random_state=42, verbose=3)
stochastic_regression_paragrid.fit(num_attribs, runs_labels)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] imputer__strategy=most_frequent, sdg_reg__alpha=0.00032361294400000005, sdg_reg__early_stopping=True, sdg_reg__eta0=0.07780000000000001, sdg_reg__fit_intercept=True, sdg_reg__learning_rate=constant, sdg_reg__loss=epsilon_insensitive, sdg_reg__max_iter=966, sdg_reg__n_iter=5426, sdg_reg__penalty=elasticnet, sdg_reg__power_t=0.07780000000000001, sdg_reg__shuffle=True, sdg_reg__tol=[0.0001618  0.02325297 0.04634414 0.06943531 0.09252649 0.11561766
 0.13870883 0.1618    ], sdg_reg__validation_fraction=0.07800000000000001, standard_scaler__with_mean=False, standard_scaler__with_std=True 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ValueError: Invalid parameter sdg_reg for estimator Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('standard_scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('sgd_reg', SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0....m_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [10]:
from sklearn.neural_network import MLPRegressor

mlp_regression = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    #("poly_features", PolynomialFeatures(degree=2,include_bias = False)),
    ("standard_scaler", StandardScaler()),
    ("mlp", MLPRegressor(max_iter=1000)),
])



  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  4.0min remaining:  9.3min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  4.9min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  5.7min finished


count    10.000000
mean      3.413219
std       0.171327
min       3.223821
25%       3.297021
50%       3.360226
75%       3.462463
max       3.751493
dtype: float64

In [None]:
mlp_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(mlp_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error", n_jobs=-1, cv=10, verbose =3)
pd.Series(np.sqrt(-scores)).describe()

#Polynomial Regression on num_attribs

In [4]:
polynomial_regression = Pipeline([
    
    ('imputer', SimpleImputer(strategy="median")),
    ("poly_features", PolynomialFeatures(degree=2)),
    ("lin_reg_poly", LinearRegression()),
    
])



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 13.8min finished


count    1000.000000
mean        3.148229
std         0.323867
min         2.400870
25%         2.936714
50%         3.143052
75%         3.329512
max         6.652358
dtype: float64

In [None]:
polynomial_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(polynomial_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error", cv=1000, n_jobs = -1, verbose =3)
pd.Series(np.sqrt(-scores)).describe()

In [7]:
poly_regression_paragrid = {
    
    'imputer__strategy':['mean','median','most_frequent'],
    'lin_reg_poly__fit_intercept':[True, False],
    'lin_reg_poly__normalize':[True, False],
    'poly_features__interaction_only':[True, False],
    'poly_features__include_bias':[True, False],
    #'poly_features__order':['C', 'F'],
    'poly_features__degree':randint(low=2, high=4),
    
    
}

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
exception calling callback for <Future at 0x7f7ed521a128 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py", line 309, in __call__
    self.parallel.dispatch_next()
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py", line 731, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py", line 759, in dispatch_one_batch
    self._dispatch(tasks)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-package

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {SIGKILL(-9)}

In [None]:
poly_regression_search = RandomizedSearchCV(polynomial_regression, param_distributions=poly_regression_paragrid,
                                n_iter=96, cv=5, scoring='neg_mean_squared_error',n_jobs= 4, random_state=42, verbose=3)
poly_regression_search.fit(num_attribs, runs_labels)

In [41]:
poly_regression_search.best_params_

NameError: name 'poly_regression_search' is not defined

In [19]:
polynomial_regression_ridge = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ("poly_features", PolynomialFeatures(degree=2,include_bias = False)),
    ("ridge", Ridge()),
])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  1.7min remaining:  4.0min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  1.9min remaining:   49.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.1min finished


count    10.000000
mean      3.145366
std       0.045608
min       3.069413
25%       3.123518
50%       3.138429
75%       3.167354
max       3.221380
dtype: float64

In [None]:
polynomial_regression_ridge.fit(num_attribs, runs_labels)
scores = cross_val_score(polynomial_regression_ridge, num_attribs, runs_labels,scoring="neg_mean_squared_error",n_jobs=-1, cv=10, verbose =3)
pd.Series(np.sqrt(-scores)).describe()

In [19]:
Ridge().get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'random_state', 'solver', 'tol'])

In [24]:
poly_regression_ridge_paragrid = {
    
    'imputer__strategy':['mean','median','most_frequent'],
    'poly_features__interaction_only':[True, False],
    'poly_features__include_bias':[True, False],
    #'poly_features__degree':randint(low=2, high=4),
    'standard_scaler__with_mean':[True, False],
    'standard_scaler__with_std':[True, False],
    'ridge__alpha': np.linspace(1, 0.01, num=8),
    'ridge__fit_intercept':[True, False],
    'ridge__normalize':[True, False],
    'ridge__copy_X':[True, False],
    'ridge__max_iter':[None, np.linspace(500, 5000, num=8)],
    'ridge__tol': np.linspace(1e-3, 0.1, num=8),
    #'ridge__random_state':[42],
    'ridge__solver':['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],   
    
}


In [None]:

poly_regression_ridge_search = RandomizedSearchCV(polynomial_regression_ridge, param_distributions=poly_regression_ridge_paragrid,
                                n_iter=480, cv=8, scoring='neg_mean_squared_error',n_jobs= -1, random_state=42, verbose=3)
poly_regression_ridge_search.fit(num_attribs, runs_labels)

In [16]:
polynomial_regression_ridge.get_params().keys()

dict_keys(['memory', 'steps', 'imputer', 'poly_features', 'standard_scaler', 'ridge', 'imputer__copy', 'imputer__fill_value', 'imputer__missing_values', 'imputer__strategy', 'imputer__verbose', 'poly_features__degree', 'poly_features__include_bias', 'poly_features__interaction_only', 'standard_scaler__copy', 'standard_scaler__with_mean', 'standard_scaler__with_std', 'ridge__alpha', 'ridge__copy_X', 'ridge__fit_intercept', 'ridge__max_iter', 'ridge__normalize', 'ridge__random_state', 'ridge__solver', 'ridge__tol'])

# Logistic Regression 

In [17]:
logistic_regression = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    #("standard_scaler", StandardScaler()),
    ("log_reg",LogisticRegression(warm_start=True, random_state = 42)),
])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    7.6s remaining:   17.7s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    7.6s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.9s finished


count    10.000000
mean      5.545937
std       0.859523
min       3.907662
25%       5.290387
50%       5.828451
75%       6.156004
max       6.323112
dtype: float64

In [None]:
logistic_regression.fit(num_attribs, np.ravel(runs_labels))
scores = cross_val_score(logistic_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error",n_jobs=-1,cv=10, verbose =3)
pd.Series(np.sqrt(-scores)).describe()

In [101]:
logistic_regression_paragrid = {
    
    'imputer__strategy':['mean','median','most_frequent'],
    'standard_scaler__with_mean':[True, False],
    'standard_scaler__with_std':[True, False],
    'log_reg__penalty':['l2','l1'],
    'log_reg__solver':['newton-cg','lbfgs','sag','saga'],
    'log_reg__multi_class':['auto','ovr','multinomial'],
    'log_reg__max_iter':randint(low=100, high=5000),
    'log_reg__class_weight':[None,'balanced'],
    'log_reg__tol':randint(low=0.0001, high=0.1),
    'log_reg__dual':[True, False],
    'log_reg__C':[10,1,0.1,0.01],
    'log_reg__fit_intercept':[True, False],
    'log_reg__l1_ratio':linspace(low=0, high=1),
 
}


In [1]:
logistic_regression_search = RandomizedSearchCV(logistic_regression, param_distributions=logistic_regression_paragrid,
                                n_iter=64, cv=8, scoring='neg_mean_squared_error',n_jobs=-1, random_state=42, verbose=3)
logistic_regression_search.fit(num_attribs, runs_labels)

NameError: name 'RandomizedSearchCV' is not defined

# Forest Regression

In [4]:
forest_regression = Pipeline([
    
    ("imputer", SimpleImputer(strategy="median")),
    #("standard_scaler", StandardScaler()),
    ("forest_reg",RandomForestRegressor()),
    
])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 51.1min finished


count    100.000000
mean       3.137347
std        0.106515
min        2.907236
25%        3.062905
50%        3.141245
75%        3.204257
max        3.368740
dtype: float64

In [None]:
forest_regression.fit(num_attribs, np.ravel(runs_labels))
scores = cross_val_score(forest_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error", n_jobs=-1, cv=100, verbose = 3)
pd.Series(np.sqrt(-scores)).describe()

In [17]:
forest_regression_paragrid = {
    
    'imputer__strategy':['mean','median','most_frequent'],
    'forest_reg__bootstrap': [True, False],
    'forest_reg__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'forest_reg__max_features': ['auto', 'sqrt'],
    'forest_reg__min_samples_leaf': [1, 2, 4],
    'forest_reg__min_samples_split': [2, 5, 10],
    'forest_reg__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
}
   


In [20]:
forest_regression_search = RandomizedSearchCV(forest_regression, param_distributions=forest_regression_paragrid,
                                n_iter=96, cv=8, scoring='neg_mean_squared_error', random_state=42, verbose=3, n_jobs=-1)
forest_regression_search.fit(num_attribs, np.ravel(runs_labels))

Fitting 8 folds for each of 96 candidates, totalling 768 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


KeyboardInterrupt: 

# Support Vector Regression 

In [9]:
support_vector_regression = Pipeline([
    
    ("imputer", SimpleImputer()),
    ("standard_scaler", StandardScaler()),
    ("svr_reg",SVR()),
])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  8.3min remaining: 19.4min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  8.3min remaining:  3.6min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 12.4min finished


count    10.000000
mean      3.185845
std       0.038250
min       3.119619
25%       3.165919
50%       3.192995
75%       3.215816
max       3.227525
dtype: float64

In [None]:
support_vector_regression.fit(num_attribs, np.ravel(runs_labels))
scores = cross_val_score(support_vector_regression, num_attribs, runs_labels, scoring="neg_mean_squared_error", cv=10, verbose = 3,n_jobs =-1)
pd.Series(np.sqrt(-scores)).describe()

In [15]:
lin_support_vector_regression = Pipeline([
    
    ("imputer", SimpleImputer()),
    ("standard_scaler", StandardScaler()),
    ("lin_svr_reg",LinearSVR()),
    
])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   58.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  6.5min finished


count    100.000000
mean       3.228912
std        0.110988
min        2.998395
25%        3.149579
50%        3.224428
75%        3.309062
max        3.501682
dtype: float64

In [None]:
lin_support_vector_regression.fit(num_attribs, np.ravel(runs_labels))
scores = cross_val_score(lin_support_vector_regression, num_attribs, runs_labels, scoring="neg_mean_squared_error", cv=100, verbose = 3,n_jobs =-1)
pd.Series(np.sqrt(-scores)).describe()

In [18]:
poly_support_vector_regression = Pipeline([
    
    ("imputer", SimpleImputer()),
    ("poly_features", PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)),
    ("standard_scaler", StandardScaler()),
    ("lin_svr_reg",LinearSVR()),
    
])

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed: 17.7min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 66.9min finished


count    100.000000
mean       3.267111
std        0.329290
min        2.930449
25%        3.119198
50%        3.212109
75%        3.312653
max        5.780656
dtype: float64

In [None]:
poly_support_vector_regression.fit(num_attribs, np.ravel(runs_labels))
scores = cross_val_score(poly_support_vector_regression, num_attribs, runs_labels, scoring="neg_mean_squared_error", cv=10, verbose = 3,n_jobs =4)
pd.Series(np.sqrt(-scores)).describe()

In [None]:
poly_kernel_support_vector_regression = Pipeline([
    
    ("imputer", SimpleImputer()),
    ("standard_scaler", StandardScaler()),
    ("lin_svr_reg",SVR(kernel="poly", degree=2)),
    
])

In [None]:
poly_kernel_support_vector_regression.fit(num_attribs, np.ravel(runs_labels))
scores = cross_val_score(poly_kernel_support_vector_regression, num_attribs, runs_labels, scoring="neg_mean_squared_error", cv=10, verbose = 3,n_jobs =4)
pd.Series(np.sqrt(-scores)).describe()

# Lasso LARS 

In [23]:
linear_lasso_lars_regression = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ("standard_scaler", StandardScaler(with_std = True, with_mean = False)),
    ("lasso_lars", LassoLars(alpha=0.001618, fit_intercept= False,fit_path= False, max_iter= 1869, positive = False, 
                            precompute= True, normalize = True,eps = 0.001618)),
])

In [None]:
linear_lasso_lars_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(linear_lasso_lars_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error",n_jobs=-1, cv=60000, verbose=3)
pd.Series(np.sqrt(-scores)).describe()

In [None]:
lasso_lars_regression_paragrid = {
    
    'imputer__strategy':['mean','median','most_frequent'],
    'standard_scaler__with_mean':[True, False],
    'standard_scaler__with_std':[True, False],
    'lasso_lars__fit_intercept':[True, False],
    'lasso_lars__max_iter':randint(low=1000, high=10000),
    'lasso_lars__normalize':[True, False],
    'lasso_lars__precompute':[True, False],
    'lasso_lars__eps':np.linspace(1.6180e-16,1.6180e-8, num=100),
    'lasso_lars__alpha':np.linspace(1.6180e-6,1.6180e-3, num=100),
    'lasso_lars__fit_path':[True, False],
    'lasso_lars__positive':[True, False],
    #'lasso_lars__jitter':[None,np.linspace(1.6180e-6,1.6180e-3, num=100)],
}

In [None]:
lasso_lars_regression_search = RandomizedSearchCV(linear_lasso_lars_regression, param_distributions=lasso_lars_regression_paragrid,
                                n_iter=1000, cv=10, scoring='neg_mean_squared_error',n_jobs=16, random_state=42, verbose=3)
lasso_lars_regression_search.fit(num_attribs, runs_labels)
lasso_lars_regression_search.best_params_

In [10]:
polynomial_regression_lasso_lars = Pipeline([
    
    ('imputer', SimpleImputer(strategy="median")),
    #('features_selection',SelectFromModel(estimator=(LinearRegression()))),
    
    ("poly_features", PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)),
    ("standard_scaler", StandardScaler(with_mean=False, with_std=True)),
    ("lasso_lars", LassoLars(alpha=0.001618, fit_intercept= False,fit_path= False, max_iter= 1869, positive = False, 
                            precompute= True, normalize = True,eps = 0.001618)),
])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   16.4s remaining:   38.3s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   20.8s remaining:    8.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   21.8s finished


count    10.000000
mean      3.124574
std       0.037218
min       3.065764
25%       3.096118
50%       3.127850
75%       3.145401
max       3.184504
dtype: float64

In [None]:
polynomial_regression_lasso_lars.fit(num_attribs, runs_labels)
scores = cross_val_score(polynomial_regression_lasso_lars, num_attribs, runs_labels,scoring="neg_mean_squared_error", cv=10,n_jobs=-1 ,verbose =3)
pd.Series(np.sqrt(-scores)).describe()

In [None]:
lasso_lars_poly_search.best_params_

In [None]:
lasso_lars_poly_paragrid = {
    'imputer__strategy':['mean','median','most_frequent'],
    'poly_features__interaction_only':[True, False],
    'poly_features__include_bias':[True, False],
    'poly_features__degree':randint(low=2, high=4),
    'standard_scaler__with_mean':[True, False],
    'standard_scaler__with_std':[True, False],
    'lasso_lars__fit_intercept':[True, False],
    'lasso_lars__max_iter':randint(low=500, high=2000),
    'lasso_lars__normalize':[True, False],
    'lasso_lars__precompute':[True, False],
    'lasso_lars__eps':c,
    'lasso_lars__alpha':np.linspace(1.6180e-8,1.6180e-3, num=16),
    'lasso_lars__fit_path':[True, False],
    'lasso_lars__positive':[True, False],
    #'lasso_lars__jitter':randint(low=0.00001, high=0.1),

}

In [None]:

lasso_lars_poly_search = RandomizedSearchCV(polynomial_regression_lasso_lars, param_distributions=lasso_lars_poly_paragrid,
                                n_iter=512, cv=8, scoring='neg_mean_squared_error', random_state=42, verbose=1, n_jobs=-1)
lasso_lars_poly_search.fit(num_attribs, runs_labels)
lasso_lars_poly_search.best_params_

# Voting Regressor & Classifier

In [22]:
voting_reg = VotingRegressor(
    estimators = [('lasso',linear_lasso_lars_regression),
                  ('svr',support_vector_regression),
                  ('svr_lin',lin_support_vector_regression),
                  ('svr_poly',poly_kernel_support_vector_regression),
                  ('forest',forest_regression),
                  ('stoca',stochastic_regression),
                  ('knn',KNN_regression),
                  ('cnn',mlp_regression),
                  ('poly',polynomial_regression),
                  ('poly_ridge',polynomial_regression_ridge) 
                 ],
    voting = 'hard')



NameError: name 'linear_lasso_lars_regression' is not defined

# Bagging Regressor & Classifier

In [None]:
BaggingRegressor(base_estimator=None, 
                 n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, 
                 bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, 
                 random_state=None, verbose=0)