In [2]:
import numpy as np

import pandas as pd
from pandas.plotting import scatter_matrix
from sqlalchemy import create_engine
cnx = create_engine('sqlite:///:memory:')

from scipy.stats import randint
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer 

from sklearn.preprocessing import OrdinalEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVR
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score         #not supporting multiclass
from sklearn.metrics import average_precision_score #not supporting multiclass

pd.set_option('display.max_column',None)
pd.set_option('display.max_row',1000)

pd.options.mode.use_inf_as_na = True
np.random.seed(42)

In [3]:
num_attribs = pd.read_csv("s3://dadadata/iMa/num_attribs.csv")
cat_attribs = pd.read_csv("s3://dadadata/iMa/cat_attribs.csv")
runs_labels = pd.read_csv("s3://dadadata/iMa/runs_labels.csv")

#num_attribs = pd.read_csv("/Users/mk2/Desktop/iMa/Model/num_attribs.csv")
#cat_attribs = pd.read_csv("/Users/mk2/Desktop/iMa/Model/cat_attribs.csv")
#runs_labels = pd.read_csv("/Users/mk2/Desktop/iMa/Model/runs_labels.csv")

num_attribs = num_attribs.drop(columns=["Unnamed: 0"])
runs_labels = runs_labels.drop(columns=["Unnamed: 0"])
cat_attribs = cat_attribs.drop(columns=["Unnamed: 0"])


# Regressions on numerical attributes ZONE !

#Linear Regression on num_attribs

In [31]:
linear_regression = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ("standard_scaler", StandardScaler(with_mean = True, with_std = True)),
    ("linear_reg", LinearRegression(fit_intercept= True, normalize=False)),
])

linear_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(linear_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error",n_jobs=-1, cv=60000,verbose=3)
pd.Series(np.sqrt(-scores)).describe()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 440 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 728 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 1080 tasks      | elapsed:   25.6s
[Parallel(n_jobs=-1)]: Done 1496 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done 1976 tasks      | elapsed:   46.0s
[Parallel(n_jobs=-1)]: Done 2520 tasks      | elapsed:   58.5s
[Parallel(n_jobs=-1)]: Done 3128 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 3800 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 4536 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 5336 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 6200 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 7128 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 8120 tasks      |

count    60000.000000
mean         2.690641
std          1.746462
min          0.000009
25%          1.255395
50%          2.518851
75%          3.877458
max         12.098136
dtype: float64

In [9]:
linear_regression_paragrid = {
    
    'imputer__strategy':['mean','median','most_frequent'],
    'standard_scaler__with_mean':[True, False],
    'standard_scaler__with_std':[True, False],
    'linear_reg__fit_intercept':[True, False],
    'linear_reg__normalize':[True, False]
}

linear_regression_search = RandomizedSearchCV(linear_regression, param_distributions=linear_regression_paragrid,
                                n_iter=24, cv=5, scoring='neg_mean_squared_error',n_jobs=-1, random_state=42, verbose=3)
linear_regression_search.fit(num_attribs, runs_labels)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    8.3s finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('imputer', SimpleImputer()),
                                             ('standard_scaler',
                                              StandardScaler(with_mean=False)),
                                             ('linear_reg',
                                              LinearRegression())]),
                   n_iter=24, n_jobs=-1,
                   param_distributions={'imputer__strategy': ['mean', 'median',
                                                              'most_frequent'],
                                        'linear_reg__fit_intercept': [True,
                                                                      False],
                                        'linear_reg__normalize': [True, False],
                                        'standard_scaler__with_mean': [True,
                                                                       False],
                                   

In [10]:
linear_regression_search.best_params_

{'standard_scaler__with_std': True,
 'standard_scaler__with_mean': True,
 'linear_reg__normalize': False,
 'linear_reg__fit_intercept': True,
 'imputer__strategy': 'mean'}

#Stochastic Gradient Descent Regression

In [30]:
stochastic_regression = Pipeline([
    
    ('imputer', SimpleImputer()),
    ("standard_scaler", StandardScaler()),
    ("sgd_reg", SGDRegressor()),
    
])

stochastic_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(stochastic_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error", cv=100, verbose=3,n_jobs=6)
pd.Series(np.sqrt(-scores)).describe()

  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    1.8s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    5.4s finished


count    100.000000
mean       3.214726
std        0.101341
min        3.019398
25%        3.128934
50%        3.210059
75%        3.284520
max        3.460790
dtype: float64

In [75]:
SGDRegressor.get_params(SGDRegressor).keys()

dict_keys(['alpha', 'average', 'early_stopping', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter', 'n_iter_no_change', 'penalty', 'power_t', 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [37]:
stochastic_regression_paragrid = {
    
    'imputer__strategy':['mean','median','most_frequent'],
    'standard_scaler__with_mean':[True, False],
    'standard_scaler__with_std':[True, False],
    'sdg_reg__alpha':np.linspace(1.6180e-8,1.6180e-3, num=16),
    'sdg_reg__early_stopping':[True, False],
    'sdg_reg__epsilon':randint(low=0.01, high=1),
    'sdg_reg__eta0':np.linspace(1e-4,1e-1, num=10),
    'sdg_reg__fit_intercept':[True, False],
    'sdg_reg__l1_ratio':randint(low=0, high=1),
    'sdg_reg__learning_rate':['constant','optimal','invscaling','adaptive'],
    'sdg_reg__loss':['squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive'],
    'sdg_reg__max_iter':randint(low=500, high=5000),
    'sdg_reg__n_iter':randint(low=1000, high=10000), 
    'sdg_reg__penalty':['l2','l1’','elasticnet'],
    'sdg_reg__power_t':np.linspace(1e-4,1e-1, num=10),
    'sdg_reg__shuffle':[True, False],    
    'sdg_reg__tol':[None,np.linspace(1.6180e-4,1.6180e-1, num=8)],
    'sdg_reg__validation_fraction':np.linspace(1e-3,1e-1, num=10), 
    
}




In [38]:
stochastic_regression_paragrid = RandomizedSearchCV(stochastic_regression, param_distributions=stochastic_regression_paragrid,
                                n_iter=100, cv=5, scoring='neg_mean_squared_error', random_state=42, verbose=3)
stochastic_regression_paragrid.fit(num_attribs, runs_labels)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] imputer__strategy=most_frequent, sdg_reg__alpha=0.00032361294400000005, sdg_reg__early_stopping=True, sdg_reg__epsilon=0, sdg_reg__eta0=0.07780000000000001, sdg_reg__fit_intercept=True, sdg_reg__l1_ratio=0, sdg_reg__learning_rate=constant, sdg_reg__loss=epsilon_insensitive, sdg_reg__max_iter=966, sdg_reg__n_iter=5426, sdg_reg__penalty=elasticnet, sdg_reg__power_t=0.07780000000000001, sdg_reg__shuffle=True, sdg_reg__tol=[0.0001618  0.02325297 0.04634414 0.06943531 0.09252649 0.11561766
 0.13870883 0.1618    ], sdg_reg__validation_fraction=0.07800000000000001, standard_scaler__with_mean=False, standard_scaler__with_std=True 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ValueError: Invalid parameter sdg_reg for estimator Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('standard_scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('sgd_reg', SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0....m_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
from sklearn.neural_network import MLPRegressor

mlp_regression = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    #("poly_features", PolynomialFeatures(degree=2,include_bias = False)),
    ("standard_scaler", StandardScaler()),
    ("mlp", MLPRegressor()),
])

mlp_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(mlp_regression, num_attribs, runs_labels.ravel(),scoring="neg_mean_squared_error", n_jobs=-1, cv=10, verbose =3)
pd.Series(np.sqrt(-scores)).describe()

#Polynomial Regression on num_attribs

In [27]:
polynomial_regression = Pipeline([
    
    ('imputer', SimpleImputer(strategy="median")),
    ("poly_features", PolynomialFeatures(degree=2)),
    ("lin_reg_poly", LinearRegression()),
])

polynomial_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(polynomial_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error", cv=1000, n_jobs = 18, verbose =3)
pd.Series(np.sqrt(-scores)).describe()

[Parallel(n_jobs=18)]: Using backend LokyBackend with 18 concurrent workers.
[Parallel(n_jobs=18)]: Done  92 tasks      | elapsed:  1.0min
[Parallel(n_jobs=18)]: Done 252 tasks      | elapsed:  2.5min
[Parallel(n_jobs=18)]: Done 476 tasks      | elapsed:  4.7min
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:  7.5min
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:  9.7min finished


count    1000.000000
mean        3.148229
std         0.323867
min         2.400870
25%         2.936714
50%         3.143052
75%         3.329512
max         6.652358
dtype: float64

In [None]:
poly_regression_paragrid = {
    
    'imputer__strategy':['mean','median','most_frequent'],
    'lin_reg_poly__fit_intercept':[True, False],
    'lin_reg_poly__normalize':[True, False],
    'poly_features__interaction_only':[True, False],
    'poly_features__include_bias':[True, False],
    #'poly_features__order':['C', 'F'],
    'poly_features__degree':randint(low=2, high=10),
    
    
}

poly_regression_search = RandomizedSearchCV(polynomial_regression, param_distributions=poly_regression_paragrid,
                                n_iter=96, cv=5, scoring='neg_mean_squared_error',n_jobs= 18, random_state=42, verbose=3)
poly_regression_search.fit(num_attribs, runs_labels)

In [41]:
poly_regression_search.best_params_

NameError: name 'poly_regression_search' is not defined

In [45]:
polynomial_regression_ridge = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ("poly_features", PolynomialFeatures(degree=2,include_bias = False)),
    ("standard_scaler", StandardScaler()),
    ("ridge", Ridge()),
])
polynomial_regression_ridge.fit(num_attribs, runs_labels)
scores = cross_val_score(polynomial_regression_ridge, num_attribs, runs_labels,scoring="neg_mean_squared_error",n_jobs=-1, cv=100, verbose =3)
pd.Series(np.sqrt(-scores)).describe()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   49.3s finished


count    100.000000
mean       3.169218
std        0.310701
min        2.872626
25%        3.042888
50%        3.123169
75%        3.206157
max        5.666430
dtype: float64

In [None]:
polynomial_regression_lasso = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ("poly_features", PolynomialFeatures(degree=2,include_bias = False)),
    ("standard_scaler", StandardScaler()),
    ("lasso_lars", LassoLars(warm_start = True)),
])
polynomial_regression_lasso.fit(num_attribs, runs_labels)
scores = cross_val_score(polynomial_regression_lasso, num_attribs, runs_labels,scoring="neg_mean_squared_error", cv=12, verbose =3)
pd.Series(np.sqrt(-scores)).describe()

#Logistic Regression on num_attribs

In [78]:
logistic_regression = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("standard_scaler", StandardScaler()),
    ("log_reg",LogisticRegression(warm_start=True, random_state = 42)),
])

logistic_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(logistic_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error",n_jobs=-1,cv=10, verbose =3)
pd.Series(np.sqrt(-scores)).describe()

  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   27.9s remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   33.9s remaining:   14.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   41.2s finished


count    10.000000
mean      4.241997
std       0.069683
min       4.132688
25%       4.216736
50%       4.247908
75%       4.281634
max       4.342551
dtype: float64

In [101]:
logistic_regression_paragrid = {
    
    'imputer__strategy':['mean','median','most_frequent'],
    'standard_scaler__with_mean':[True, False],
    'standard_scaler__with_std':[True, False],
    #'log_reg__penalty':['l2','l1'],
    'log_reg__solver':['newton-cg','lbfgs','sag','saga'],
    'log_reg__multi_class':['auto','ovr','multinomial'],
    'log_reg__max_iter':randint(low=100, high=5000),
    'log_reg__class_weight':[None,'balanced'],
    #'log_reg__tol':randint(low=0.0001, high=0.1),
    #'log_reg__dual':[True, False],
    'log_reg__C':[10,1,0.1,0.01],
    'log_reg__fit_intercept':[True, False],
    #'log_reg__l1_ratio':randint(low=0, high=1),
 
}


In [102]:
logistic_regression_search = RandomizedSearchCV(logistic_regression, param_distributions=logistic_regression_paragrid,
                                n_iter=100, cv=5, scoring='neg_mean_squared_error',n_jobs=-1, random_state=42, verbose=3)
logistic_regression_search.fit(num_attribs, runs_labels)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


KeyboardInterrupt: 

#Forest Regression on num_attribs

In [106]:
forest_regression = Pipeline([
    
    ("imputer", SimpleImputer(strategy="median")),
    ("standard_scaler", StandardScaler()),
    ("forest_reg",RandomForestRegressor()),
    
])

forest_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(forest_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error", n_jobs=-1, cv=100, verbose = 3)
pd.Series(np.sqrt(-scores)).describe()

  self._final_estimator.fit(Xt, y, **fit_params)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.4min finished


count    100.000000
mean       3.293989
std        0.105041
min        3.013481
25%        3.227219
50%        3.292358
75%        3.359795
max        3.542300
dtype: float64

In [108]:
forest_regression_paragrid = {
    
    'imputer__strategy':['mean','median','most_frequent'],
    'forest_reg__bootstrap': [True, False],
    'forest_reg__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'forest_reg__max_features': ['auto', 'sqrt'],
    'forest_reg__min_samples_leaf': [1, 2, 4],
    'forest_reg__min_samples_split': [2, 5, 10],
    'forest_reg__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
}
   


In [109]:
forest_regression_search = RandomizedSearchCV(forest_regression, param_distributions=forest_regression_paragrid,
                                n_iter=96, cv=8, scoring='neg_mean_squared_error', random_state=42, verbose=3)
forest_regression_search.fit(num_attribs, runs_labels)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


KeyboardInterrupt: 

#Support Vector Regression & Classification on num_attribs

In [None]:
support_vector_regression = Pipeline([
    
    ("imputer", SimpleImputer(strategy="median")),
    ("standard_scaler", StandardScaler()),
    ("svr_reg",SVR(kernel="poly", gamma = 'scale')),
])

support_vector_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(support_vector_regression, num_attribs, runs_labels, scoring="neg_mean_squared_error", cv=10, verbose = 3)
pd.Series(np.sqrt(-scores)).describe()

  y = column_or_1d(y, warn=True)


In [None]:
support_vector_classifier = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("standard_scaler", StandardScaler()),
    ("svr_reg",LinearSVC()),
])

support_vector_classifier.fit(num_attribs, runs_labels)
scores = cross_val_score(support_vector_regression, num_attribs, runs_labels, scoring="neg_mean_squared_error", cv=10, verbose = 3)
pd.Series(np.sqrt(-scores)).describe()

#Lasso LARS discarded

In [None]:
from sklearn.linear_model import LassoLars

linear_lasso_lars_regression = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ("standard_scaler", StandardScaler(with_std = True, with_mean = False)),
    ("lasso_lars", LassoLars(alpha=0.001618, fit_intercept= False,fit_path= False, max_iter= 1869, positive = False, 
                            precompute= True, normalize = True,eps = 0.001618)),
])

linear_lasso_lars_regression.fit(num_attribs, runs_labels)
scores = cross_val_score(linear_lasso_lars_regression, num_attribs, runs_labels,scoring="neg_mean_squared_error",n_jobs=-1, cv=100, verbose=3)
pd.Series(np.sqrt(-scores)).describe()

In [None]:
lasso_lars_regression_paragrid = {
    
    'imputer__strategy':['mean','median','most_frequent'],
    'standard_scaler__with_mean':[True, False],
    'standard_scaler__with_std':[True, False],
    'lasso_lars__fit_intercept':[True, False],
    'lasso_lars__max_iter':randint(low=500, high=2000),
    'lasso_lars__normalize':[True, False],
    'lasso_lars__precompute':[True, False],
    'lasso_lars__eps':np.linspace(1.6180e-16,1.6180e-8, num=8),
    'lasso_lars__alpha':np.linspace(1.6180e-6,1.6180e-3, num=8),
    'lasso_lars__fit_path':[True, False],
    'lasso_lars__positive':[True, False],
    #'lasso_lars__jitter':[None,1.6180e-8,1.6180e-6, 1.6180e-3],
}

lasso_lars_regression_search = RandomizedSearchCV(linear_lasso_lars_regression, param_distributions=lasso_lars_regression_paragrid,
                                n_iter=400, cv=8, scoring='neg_mean_squared_error',n_jobs=18, random_state=42, verbose=3)
lasso_lars_regression_search.fit(num_attribs, runs_labels)
lasso_lars_regression_search.best_params_

In [None]:
from sklearn.feature_selection import SelectFromModel 

polynomial_regression_lasso_lars = Pipeline([
    
    ('imputer', SimpleImputer(strategy="median")),
    #('features_selection',SelectFromModel(estimator=(LinearRegression()))),
    
    ("poly_features", PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)),
    ("standard_scaler", StandardScaler(with_mean=False, with_std=False)),
    ("lasso_lars", LassoLars(alpha=0.000215747356, eps=4.109206350413429e-07, fit_intercept= True, fit_path= True,
                            max_iter=1399, normalize=False, positive=False, precompute=False)),
])

polynomial_regression_lasso_lars.fit(num_attribs, runs_labels)
scores = cross_val_score(polynomial_regression_lasso_lars, num_attribs, runs_labels,scoring="neg_mean_squared_error", cv=1000,n_jobs=-1 ,verbose =3)
pd.Series(np.sqrt(-scores)).describe()

In [None]:
lasso_lars_poly_search.best_params_

In [None]:
lasso_lars_poly_paragrid = {
    'imputer__strategy':['mean','median','most_frequent'],
    'poly_features__interaction_only':[True, False],
    'poly_features__include_bias':[True, False],
    'poly_features__degree':randint(low=2, high=4),
    'standard_scaler__with_mean':[True, False],
    'standard_scaler__with_std':[True, False],
    'lasso_lars__fit_intercept':[True, False],
    'lasso_lars__max_iter':randint(low=500, high=2000),
    'lasso_lars__normalize':[True, False],
    'lasso_lars__precompute':[True, False],
    'lasso_lars__eps':c,
    'lasso_lars__alpha':np.linspace(1.6180e-8,1.6180e-3, num=16),
    'lasso_lars__fit_path':[True, False],
    'lasso_lars__positive':[True, False],
    #'lasso_lars__jitter':randint(low=0.00001, high=0.1),

}

lasso_lars_poly_search = RandomizedSearchCV(polynomial_regression_lasso_lars, param_distributions=lasso_lars_poly_paragrid,
                                n_iter=512, cv=8, scoring='neg_mean_squared_error', random_state=42, verbose=1, n_jobs=-1)
lasso_lars_poly_search.fit(num_attribs, runs_labels)
lasso_lars_poly_search.best_params_

# Regressions on num_attribs & cat_attribs ZONE !

#Linear Regression on all_attribs

In [68]:
lin_reg = LinearRegression()
lin_reg.fit(runs_prepared, runs_labels)
runs_lin_predictions = lin_reg.predict(runs_prepared)
runs_lin_predictions

array([ 8.33411759,  6.77675034,  6.17999478, ...,  7.99975496,
       11.45489248, 11.01854323])

In [69]:
np.sqrt(mean_squared_error(runs_labels,runs_lin_predictions)), mean_absolute_error(runs_labels,runs_lin_predictions)

(3.2205873943869556, 2.676721668910449)

In [70]:
scores = cross_val_score(lin_reg, runs_prepared, runs_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()

count    10.000000
mean      3.294553
std       0.019571
min       3.253311
25%       3.287701
50%       3.293112
75%       3.303017
max       3.326688
dtype: float64

#Forest Regression on all_attribs

In [72]:
forest_reg = RandomForestRegressor(n_estimators = 100, n_jobs = -1, verbose = 1, warm_start=True)
forest_reg.fit(runs_prepared, runs_labels)
runs_forest_predictions = forest_reg.predict(runs_prepared)
runs_forest_predictions

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 10.5min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.5s finished


array([ 8.12,  8.49,  3.71, ...,  7.3 ,  8.99, 10.58])

In [73]:
np.sqrt(mean_squared_error(runs_labels,runs_forest_predictions)), mean_absolute_error(runs_labels,runs_forest_predictions)

(1.195921977714223, 0.9717486665512847)

In [74]:
scores = cross_val_score(forest_reg, runs_prepared, runs_labels, scoring="neg_mean_squared_error", cv=5, n_jobs = -1, verbose=1)
pd.Series(np.sqrt(-scores)).describe()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 25.6min remaining: 38.3min


KeyboardInterrupt: 

# Parameters optimisation

In [439]:
sgd_reg.get_params()

NameError: name 'penalty' is not defined

In [99]:
param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=10),
    }

rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=100, cv=5, scoring='neg_mean_squared_error', random_state=42, n_jobs =-1,verbose=2)
rnd_search.fit(runs_prepared, runs_labels)
grid_search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 24.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 38.6min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done 189 out of 189 | elapsed:  1.9min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=1, warm_start=True),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a1d97e5c0>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a1d97e080>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=2)

In [None]:
param_distribs       = {'bootstrap': [True, False],
                       'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                       'max_features': ['auto', 'sqrt'],
                       'min_samples_leaf': [1, 2, 4],
                       'min_samples_split': [2, 5, 10],
                       'n_estimators': randint(low=1, high=2000),
                }

rnd_search.fit(runs_prepared, runs_labels)

In [105]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

3.1874521437031924 {'max_features': 7, 'n_estimators': 180}
3.183008754418416 {'max_features': 8, 'n_estimators': 189}
3.201024847920603 {'max_features': 5, 'n_estimators': 103}
3.2284242727622257 {'max_features': 3, 'n_estimators': 75}
3.189927209476519 {'max_features': 8, 'n_estimators': 117}
3.209651237694833 {'max_features': 4, 'n_estimators': 104}
3.190409360145367 {'max_features': 8, 'n_estimators': 131}
3.2114818927824778 {'max_features': 6, 'n_estimators': 53}
3.240449408321639 {'max_features': 2, 'n_estimators': 88}
3.1932028336165876 {'max_features': 6, 'n_estimators': 130}


In [106]:
feature_importances = rnd_search.best_estimator_.feature_importances_
feature_importances

array([4.80494612e-02, 5.44871463e-02, 2.03991352e-02, 6.22302658e-02,
       7.05603464e-02, 5.81942507e-02, 1.96340444e-01, 1.75648352e-01,
       6.37230778e-03, 2.46847326e-04, 8.89058407e-05, 4.64263993e-03,
       2.31059548e-04, 8.55341215e-05, 7.39341863e-03, 2.74474893e-04,
       2.03793348e-04, 8.00584997e-02, 2.87675058e-04, 4.77646672e-04,
       1.13073099e-02, 4.82721341e-05, 3.10153501e-04, 2.28462761e-03,
       6.43754722e-03, 9.63615901e-04, 9.87722841e-05, 7.94683315e-03,
       2.19183245e-04, 2.59242539e-04, 1.13522481e-02, 1.63391620e-03,
       4.25750718e-05, 3.48893971e-03, 4.30966019e-05, 1.06786631e-05,
       2.92380520e-03, 7.92973225e-04, 1.40278705e-04, 4.10135910e-03,
       4.83959406e-05, 1.71457343e-03, 5.52029977e-04, 3.96413436e-04,
       1.08105132e-04, 8.54359190e-06, 5.65213988e-03, 5.41398218e-03,
       1.14382281e-02, 6.26630346e-03, 6.33288064e-03, 5.96061491e-03,
       9.21199380e-03, 9.19154091e-03, 1.70799118e-03, 1.15630292e-02,
      

In [109]:
attributes = cat_attribs + num_attribs
sorted(zip(feature_importances, attributes), reverse=False)

[(4.827213405745772e-05, 'trainer_wins'),
 (8.553412153547558e-05, 'jockey_id'),
 (8.890584067709902e-05, 'horse_wins'),
 (0.00020379334847067635, 'place_odds'),
 (0.00023105954805481354, 'jockey_exp'),
 (0.0002468473259631991, 'horse_type'),
 (0.0002744748934750977, 'jockey_wins_runs'),
 (0.0002876750582285553, 'total_exp'),
 (0.00031015350064023226, 'trainer_wins_runs'),
 (0.0004776466724151761, 'total_weight'),
 (0.0022846276069661245, 'venue'),
 (0.004642639925453949, 'horse_wins_runs'),
 (0.006372307780387764, 'horse_ratings'),
 (0.00643754721639016, 'win_odds'),
 (0.007393418628967594, 'jockey_wins'),
 (0.011307309906009807, 'trainer_exp'),
 (0.020399135172456853, 'draw'),
 (0.048049461154387865, 'actual_weight'),
 (0.05448714631967126, 'config'),
 (0.05819425067584558, 'horse_country'),
 (0.0622302658295924, 'going'),
 (0.0705603464499315, 'horse_age'),
 (0.0800584996685134, 'race_no'),
 (0.1756483521850255, 'horse_no'),
 (0.19634044436457238, 'horse_exp')]