In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os, glob, inspect, sys

import xgboost as xgb

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.inspection import permutation_importance

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
import epri_mc_lib_2 as mc
from importlib import reload
reload(mc)

<module 'epri_mc_lib_2' from '/home/marie-anne/code/Oct20_EPRI/Task2/NB/epri_mc_lib_2.py'>

In [3]:
#Import dfs
data = mc.load_data(os.path.join(os.path.dirname(os.getcwd()),'../Data/Merged_data/CopulaGAN_simulated_data.csv'))

In [4]:
data.head()

Unnamed: 0_level_0,KJIC,log_beta_avg,TEP_average,log_MS_Avg,PC_IF_2.25MHz,PC_IF_3.5MHz,PC_BS
type_cw,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A286-80,0.170309,0.390684,0.387439,0.020544,0.4289459,0.174966,0.060829
A286-80,0.076883,0.211567,0.299858,0.059858,0.5438486,0.624207,0.046113
A286-0,0.722217,0.418086,0.555533,0.068867,1.141398e-15,0.088545,0.692367
304-80,0.225209,0.632568,0.702281,0.955842,0.09454126,0.158487,0.615641
304-40,0.409809,0.239343,0.572501,0.726706,0.04748123,0.458641,0.330674


### Train test split

In [5]:
X = data.iloc[:, 1:]
y= data.iloc[:,0]

X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42
                                                 )
# get feature names
feature_names=list(X_train)

#check shape
print(X.shape)
X_train.shape

(1000, 6)


(800, 6)

### Dummy regressor on all steel

In [6]:
# Evaluate naive

naive = DummyRegressor(strategy='median')
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(naive, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('Baseline: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

Baseline: -0.175 (0.021)


### GridSearch CV

In [7]:
# Create model_dict

model_GSCV = dict()

model_GSCV['Elastic'] = ElasticNet()
model_GSCV['Tree'] = DecisionTreeRegressor()
model_GSCV['KNN'] = KNeighborsRegressor()
model_GSCV['SVM'] = SVR()
model_GSCV['RF'] = RandomForestRegressor()
model_GSCV['XGB'] = xgb.XGBRegressor(objective= 'reg:squarederror',
                        eval_metric = 'rmse',
                        learning_rate = 0.01, 
                        nthread=4,
                        seed=42)


In [8]:
# Define best_model:
def best_model(data, steel, name, model):
    '''run standard scaler and gridsearch CV pipeline on models
    Args:
        -model: initiated model 
        -name : name of model as str
    return list of best estimator and table of results
    '''
    X = data[data.index.str.contains(steel)].iloc[:, 1:]
    y= data[data.index.str.contains(steel)].iloc[:,0]

    X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=42
                                                     )
    # get feature names
    feature_names=list(X_train)

    #check shape
    print(X.shape)
    X_train.shape
    
    best_model_stack = list()
    results_cv = dict()
    def grid_csv(params):
        
        GSCV = GridSearchCV(model, param_grid = params, scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2', 'neg_root_mean_squared_error'], 
                            refit='neg_root_mean_squared_error', 
                            cv = 10, n_jobs=-1, verbose=True)
        best_clf = GSCV.fit(X_train, y_train)
        best_hyperparams = best_clf.best_params_
        best_score = best_clf.best_score_
        estimator = best_clf.best_estimator_
        print(best_score, best_hyperparams, estimator)
        table = best_clf.cv_results_
        results_cv[name] = table
        return estimator
    
    if name == 'Elastic':
        params = {'l1_ratio' : [0, 0.25, 0.5, 1], 
                  'alpha' : [0, 0.5, 1, 2]} 
        best_model_stack.append(grid_csv(params))
    
    if name == 'Tree':
        params = {'max_features' : ['auto', 'sqrt', 'log2'],
                  'criterion' : ['mse', 'friedman_mse', 'mae'],
                  'max_depth' : np.arange(5, 15, 1)}
        best_model_stack.append(grid_csv(params))
        
    if name == 'KNN':
        params = {'n_neighbors' : np.arange(5, 25, 5),
                 'weights' : ['uniform', 'distance'],
                 'algorithm' : ['ball_tree', 'kd_tree', 'brute', 'auto']} 
        best_model_stack.append(grid_csv(params))
    
    if name == 'SVM':
        params = {'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
                 'C' : [0.2, 0.5, 1]} 
        best_model_stack.append(grid_csv(params))


    if name == 'RF': 
        params = {'n_estimators' : np.arange(100, 200, 50),
                  'max_features' : ['auto', 'sqrt', 'log2'],
                  'criterion' : ['mse', 'mae'],
                  'max_depth' : np.arange(5, 15, 1),
                 } 
        best_model_stack.append(grid_csv(params))
    
    if name == 'XGB':
        params = {'n_estimators' : np.arange(500, 2000, 250),
                  'gamma': np.arange(0.1, 1, 0.5),
                  'reg_lambda':[1e-8,  1e-4],
                  'max_depth' : np.arange(5, 15, 2),
                 } 
        best_model_stack.append(grid_csv(params))

        
    return best_model_stack, results_cv

In [9]:
all_results=dict()
for steel in set(X.index.str.split('-').str[0]):
    print(steel)
    results_best_model = list()
    scoring = dict()
    for name, model in model_GSCV.items():

        scores = best_model(data, steel, name, model)
        results_best_model.append(scores[0][0])
        scoring[name] = pd.DataFrame(scores[1][name])

    #save params
    with open(os.path.join(os.getcwd(), 'Results_CV/' + steel + '_result_CV.txt'), 'w') as file:
        file.write(str(results_best_model))

    with pd.ExcelWriter(os.path.join(os.getcwd(), 'Results_CV/' + steel + '_result_CV.xlsx')) as writer:
        for df_name, df in scoring.items():
            df.to_excel(writer, sheet_name=df_name) 
    all_results[steel] = results_best_model

304
(267, 6)
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.4s finished
  self.best_estimator_.fit(X, y, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.07973049184419759 {'alpha': 0, 'l1_ratio': 0} ElasticNet(alpha=0, l1_ratio=0)
(267, 6)
Fitting 10 folds for each of 90 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 848 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.08503580984748546 {'criterion': 'friedman_mse', 'max_depth': 5, 'max_features': 'log2'} DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,
                      max_features='log2')
(267, 6)
Fitting 10 folds for each of 32 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.07223108863647813 {'algorithm': 'ball_tree', 'n_neighbors': 20, 'weights': 'uniform'} KNeighborsRegressor(algorithm='ball_tree', n_neighbors=20)
(267, 6)
Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.07487829996468323 {'C': 0.5, 'kernel': 'rbf'} SVR(C=0.5)
(267, 6)
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  1.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.07392073521142135 {'criterion': 'mse', 'max_depth': 5, 'max_features': 'log2', 'n_estimators': 150} RandomForestRegressor(max_depth=5, max_features='log2', n_estimators=150)
(267, 6)
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 15.0min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed: 25.7min finished


-0.08066497809473835 {'gamma': 0.1, 'max_depth': 5, 'n_estimators': 500, 'reg_lambda': 1e-08} XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',
             gamma=0.1, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=4, nthread=4,
             num_parallel_tree=1, random_state=42, reg_alpha=0,
             reg_lambda=1e-08, scale_pos_weight=1, seed=42, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
347
(257, 6)
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.3s finished
  self.best_estimator_.fit(X, y, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


-0.0896263604895046 {'alpha': 0, 'l1_ratio': 0} ElasticNet(alpha=0, l1_ratio=0)
(257, 6)
Fitting 10 folds for each of 90 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done 848 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.07010278424400937 {'criterion': 'mae', 'max_depth': 6, 'max_features': 'sqrt'} DecisionTreeRegressor(criterion='mae', max_depth=6, max_features='sqrt')
(257, 6)
Fitting 10 folds for each of 32 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


-0.06075575437241395 {'algorithm': 'ball_tree', 'n_neighbors': 20, 'weights': 'distance'} KNeighborsRegressor(algorithm='ball_tree', n_neighbors=20, weights='distance')
(257, 6)
Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.06906438172923558 {'C': 0.5, 'kernel': 'rbf'} SVR(C=0.5)
(257, 6)
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   40.0s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  1.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.06010475003147238 {'criterion': 'mse', 'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 100} RandomForestRegressor(max_depth=7, max_features='sqrt')
(257, 6)
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed: 22.9min finished


-0.0670481414301956 {'gamma': 0.1, 'max_depth': 5, 'n_estimators': 500, 'reg_lambda': 0.0001} XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',
             gamma=0.1, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=4, nthread=4,
             num_parallel_tree=1, random_state=42, reg_alpha=0,
             reg_lambda=0.0001, scale_pos_weight=1, seed=42, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
A286
(227, 6)
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.2s finished
  self.best_estimator_.fit(X, y, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


-0.0815615293154635 {'alpha': 0, 'l1_ratio': 0} ElasticNet(alpha=0, l1_ratio=0)
(227, 6)
Fitting 10 folds for each of 90 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done 848 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.0705281868013791 {'criterion': 'mae', 'max_depth': 7, 'max_features': 'sqrt'} DecisionTreeRegressor(criterion='mae', max_depth=7, max_features='sqrt')
(227, 6)
Fitting 10 folds for each of 32 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


-0.05689043604504949 {'algorithm': 'ball_tree', 'n_neighbors': 15, 'weights': 'uniform'} KNeighborsRegressor(algorithm='ball_tree', n_neighbors=15)
(227, 6)
Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done 105 out of 120 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.06287603071772671 {'C': 0.2, 'kernel': 'rbf'} SVR(C=0.2)
(227, 6)
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   38.5s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  1.1min finished


-0.0614452793535093 {'criterion': 'mae', 'max_depth': 5, 'max_features': 'log2', 'n_estimators': 150} RandomForestRegressor(criterion='mae', max_depth=5, max_features='log2',
                      n_estimators=150)
(227, 6)
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed: 22.5min finished


-0.06379062738109365 {'gamma': 0.1, 'max_depth': 5, 'n_estimators': 500, 'reg_lambda': 1e-08} XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',
             gamma=0.1, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=4, nthread=4,
             num_parallel_tree=1, random_state=42, reg_alpha=0,
             reg_lambda=1e-08, scale_pos_weight=1, seed=42, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
316
(249, 6)
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.3s finished
  self.best_estimator_.fit(X, y, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


-0.07087709661658369 {'alpha': 0, 'l1_ratio': 0} ElasticNet(alpha=0, l1_ratio=0)
(249, 6)
Fitting 10 folds for each of 90 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done 848 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.05688191171916849 {'criterion': 'friedman_mse', 'max_depth': 5, 'max_features': 'log2'} DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,
                      max_features='log2')
(249, 6)
Fitting 10 folds for each of 32 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


-0.050676280277032505 {'algorithm': 'ball_tree', 'n_neighbors': 20, 'weights': 'distance'} KNeighborsRegressor(algorithm='ball_tree', n_neighbors=20, weights='distance')
(249, 6)
Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.05767511355157756 {'C': 1, 'kernel': 'rbf'} SVR(C=1)
(249, 6)
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   39.8s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  1.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.05066641748484337 {'criterion': 'mse', 'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 150} RandomForestRegressor(max_depth=5, max_features='sqrt', n_estimators=150)
(249, 6)
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed: 23.9min finished


-0.06151612695826205 {'gamma': 0.1, 'max_depth': 5, 'n_estimators': 1250, 'reg_lambda': 0.0001} XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',
             gamma=0.1, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1250, n_jobs=4, nthread=4,
             num_parallel_tree=1, random_state=42, reg_alpha=0,
             reg_lambda=0.0001, scale_pos_weight=1, seed=42, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)


In [1]:
all_results

NameError: name 'all_results' is not defined

In [163]:
for steel in set(X.index.str.split('-').str[0]):

    with open('Results_CV/' + steel + '_result_CV.txt', 'r') as file:
        print(list(file))

["[ElasticNet(alpha=0, l1_ratio=0), DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n", "                      max_features='sqrt'), KNeighborsRegressor(algorithm='ball_tree', n_neighbors=20, weights='distance'), SVR(C=1), RandomForestRegressor(max_depth=6, max_features='log2')]"]
["[ElasticNet(alpha=0, l1_ratio=0), DecisionTreeRegressor(criterion='mae', max_depth=7, max_features='log2'), KNeighborsRegressor(algorithm='ball_tree', n_neighbors=20, weights='distance'), SVR(C=0.5), RandomForestRegressor(max_depth=6, max_features='log2')]"]
["[ElasticNet(alpha=0, l1_ratio=0), DecisionTreeRegressor(criterion='mae', max_depth=5, max_features='sqrt'), KNeighborsRegressor(algorithm='ball_tree', n_neighbors=20), SVR(C=0.5), RandomForestRegressor(criterion='mae', max_depth=5, max_features='sqrt',\n", '                      n_estimators=150)]']
["[ElasticNet(alpha=0, l1_ratio=0), DecisionTreeRegressor(max_depth=6, max_features='sqrt'), KNeighborsRegressor(algorithm='ball_tree', n_nei

In [11]:
for steel in set(X.index.str.split('-').str[0]):

    with open(os.path.join(os.getcwd(), 'Results_CV/' + steel + '_result_CV.txt'), 'r') as file:
        print(file.read())

[ElasticNet(alpha=0, l1_ratio=0), DecisionTreeRegressor(criterion='mae', max_depth=7, max_features='sqrt'), KNeighborsRegressor(algorithm='ball_tree', n_neighbors=15), SVR(C=0.2), RandomForestRegressor(criterion='mae', max_depth=5, max_features='log2',
                      n_estimators=150), XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',
             gamma=0.1, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=4, nthread=4,
             num_parallel_tree=1, random_state=42, reg_alpha=0,
             reg_lambda=1e-08, scale_pos_weight=1, seed=42, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)]
[ElasticNet(alpha=0, l1_ratio=0), DecisionTreeRegressor(criterion='friedman_ms

In [None]:
# Create model_dict
models = dict()

models['Elastic'] = ElasticNet('alpha': 0, 'l1_ratio': 0)
models['Tree'] = DecisionTreeRegressor()
models['KNN'] = KNeighborsRegressor()
models['SVM'] = SVR()
models['RF'] = RandomForestRegressor()

## Regression

In [14]:
def train_model(data, steel, name, model):
    X = data[data.index.str.contains(steel)].iloc[:, 1:]
    y= data[data.index.str.contains(steel)].iloc[:,0]

    X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=42
                                                     )
   
    if name == 'XGB':
        X_train_df= pd.DataFrame(X_train, columns=feature_names)
        clf = model.fit(X_train_df, y_train)
    else:
        clf = model.fit(X_train, y_train)
    
    return clf

In [15]:
# Fit models
regressors = dict()
for steel in set(X.index.str.split('-').str[0]):
    print(steel)
    for name, model in models.items():
        regressors[name] = train_model(data, steel, name, model)

Parameters: { metrics } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [62]:
for steel in set(X.index.str.split('-').str[0]):
    print(steel)
    results_best_model = list()
    scoring = dict()
    for name, model in model_GSCV.items():

        scores = best_model(data, steel, name, model)
        results_best_model.append(scores[0])
        scoring[name] = pd.DataFrame(scores[1][name])

    #save params
    with open(os.path.join(os.path.dirname(os.getcwd()), '/Results/' + steel + '_result_CV.csv'), 'w') as file:
        file.write(str(results_best_model))

    with pd.ExcelWriter(os.path.join(os.path.dirname(os.getcwd()), '/Results/' + steel + '_result_CV.xlsx')) as writer:
        for df_name, df in scoring.items():
            df.to_excel(writer, sheet_name=df_name) 

316
(249, 6)
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    1.5s finished
  self.best_estimator_.fit(X, y, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.07087709661658369 {'alpha': 0, 'l1_ratio': 0} ElasticNet(alpha=0, l1_ratio=0)
(249, 6)
Fitting 10 folds for each of 90 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 848 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.05958007800562062 {'criterion': 'mse', 'max_depth': 5, 'max_features': 'log2'} DecisionTreeRegressor(max_depth=5, max_features='log2')
(249, 6)
Fitting 10 folds for each of 32 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 105 out of 120 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.050676280277032505 {'algorithm': 'ball_tree', 'n_neighbors': 20, 'weights': 'distance'} KNeighborsRegressor(algorithm='ball_tree', n_neighbors=20, weights='distance')
(249, 6)
Fitting 10 folds for each of 12 candidates, totalling 120 fits
-0.05767511355157756 {'C': 1, 'kernel': 'rbf'} SVR(C=1)
(249, 6)
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  1.0min finished


-0.05038353321190096 {'criterion': 'mse', 'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 100} RandomForestRegressor(max_depth=7, max_features='sqrt')
347
(257, 6)
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.2s finished
  self.best_estimator_.fit(X, y, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


-0.0896263604895046 {'alpha': 0, 'l1_ratio': 0} ElasticNet(alpha=0, l1_ratio=0)
(257, 6)
Fitting 10 folds for each of 90 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done 848 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.07444422585939163 {'criterion': 'mae', 'max_depth': 5, 'max_features': 'sqrt'} DecisionTreeRegressor(criterion='mae', max_depth=5, max_features='sqrt')
(257, 6)
Fitting 10 folds for each of 32 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


-0.06075575437241395 {'algorithm': 'ball_tree', 'n_neighbors': 20, 'weights': 'distance'} KNeighborsRegressor(algorithm='ball_tree', n_neighbors=20, weights='distance')
(257, 6)
Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done 105 out of 120 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.06906438172923558 {'C': 0.5, 'kernel': 'rbf'} SVR(C=0.5)
(257, 6)
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  1.2min finished


-0.05974730414559981 {'criterion': 'mse', 'max_depth': 6, 'max_features': 'log2', 'n_estimators': 150} RandomForestRegressor(max_depth=6, max_features='log2', n_estimators=150)
304
(267, 6)
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.2s finished
  self.best_estimator_.fit(X, y, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


-0.07973049184419759 {'alpha': 0, 'l1_ratio': 0} ElasticNet(alpha=0, l1_ratio=0)
(267, 6)
Fitting 10 folds for each of 90 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done 848 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.08599146120941852 {'criterion': 'mae', 'max_depth': 5, 'max_features': 'auto'} DecisionTreeRegressor(criterion='mae', max_depth=5, max_features='auto')
(267, 6)
Fitting 10 folds for each of 32 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


-0.07223108863647813 {'algorithm': 'ball_tree', 'n_neighbors': 20, 'weights': 'uniform'} KNeighborsRegressor(algorithm='ball_tree', n_neighbors=20)
(267, 6)
Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.07487829996468323 {'C': 0.5, 'kernel': 'rbf'} SVR(C=0.5)
(267, 6)
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  1.2min finished


-0.07400169581263163 {'criterion': 'mse', 'max_depth': 5, 'max_features': 'log2', 'n_estimators': 150} RandomForestRegressor(max_depth=5, max_features='log2', n_estimators=150)
A286
(227, 6)
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.2s finished
  self.best_estimator_.fit(X, y, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


-0.0815615293154635 {'alpha': 0, 'l1_ratio': 0} ElasticNet(alpha=0, l1_ratio=0)
(227, 6)
Fitting 10 folds for each of 90 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done 848 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.06886182371766462 {'criterion': 'mae', 'max_depth': 5, 'max_features': 'log2'} DecisionTreeRegressor(criterion='mae', max_depth=5, max_features='log2')
(227, 6)
Fitting 10 folds for each of 32 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


-0.05689043604504949 {'algorithm': 'ball_tree', 'n_neighbors': 15, 'weights': 'uniform'} KNeighborsRegressor(algorithm='ball_tree', n_neighbors=15)
(227, 6)
Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done 105 out of 120 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


-0.06287603071772671 {'C': 0.2, 'kernel': 'rbf'} SVR(C=0.2)
(227, 6)
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  1.2min finished


-0.06050935743966529 {'criterion': 'mae', 'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 100} RandomForestRegressor(criterion='mae', max_depth=5, max_features='sqrt')
