Run grid search, plot regression and feature importance

on original and simulated data

save results to csv or excel

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os, glob, inspect, sys, pickle, warnings

import xgboost as xgb

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.inspection import permutation_importance

# Locate the directory of the file being executed and its parent directory.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
# Search the parent directory first when importing
# (presumably where epri_mc_lib_2 lives -- TODO confirm).
sys.path.insert(0,parentdir) 
import epri_mc_lib_2 as mc
from importlib import reload
# Re-execute the module so that edits made to it after the first import are picked up.
reload(mc)

In [None]:
# Define best_model:
def best_model(X_train, y_train, steel, name, model):
    '''exhaustive 5-fold grid search of one model on the rows of one steel
    Args:
        -X_train: features, string index containing the steel id
        -y_train: labels, indexed like X_train
        -steel: substring used to select the rows of one steel
        -name : name of model as str, selects the parameter grid
        -model: initiated model
    return list holding the best estimator and dict {name: cv_results_};
           both are empty when no grid is registered for ``name``
    '''
    steel_X = X_train[X_train.index.str.contains(steel)]
    steel_y = y_train[y_train.index.str.contains(steel)]

    # NOTE(review): 'mse'/'mae' criteria and max_features='auto' were
    # deprecated and later removed in recent scikit-learn releases --
    # confirm against the installed version.
    grids = {
        'Ridge': {'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
                  'alpha': np.arange(0.01, 1, 0.05)},
        'Elastic': {'l1_ratio': [0, 0.25, 0.5, 1],
                    'alpha': np.arange(0.01, 1, 0.05)},
        'Tree': {'max_features': ['auto', 'sqrt', 'log2'],
                 'criterion': ['mse', 'friedman_mse', 'mae'],
                 'max_depth': np.arange(3, 15, 1)},
        'KNN': {'n_neighbors': np.arange(5, 50, 5),
                'weights': ['uniform', 'distance'],
                'algorithm': ['ball_tree', 'kd_tree', 'brute', 'auto']},
        'SVM': {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'C': [0.2, 0.5, 1]},
        'RF': {'n_estimators': np.arange(100, 1000, 200),
               'max_features': ['auto', 'sqrt', 'log2'],
               'criterion': ['mse', 'mae'],
               'max_depth': np.arange(3, 15, 1)},
        'XGB': {'n_estimators': np.arange(100, 1000, 200),
                'gamma': np.arange(0.05, 0.8, 0.5),
                'reg_lambda': [1e-8, 1e-4],
                'max_depth': np.arange(3, 10, 2)},
    }

    winners = list()
    cv_tables = dict()
    if name not in grids:
        # No grid registered for this name: nothing is fitted.
        return winners, cv_tables

    search = GridSearchCV(
        model,
        param_grid=grids[name],
        scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2',
                 'neg_root_mean_squared_error'],
        # With several scorers, the one used to pick the best model must be named.
        refit='neg_root_mean_squared_error',
        cv=5,
        n_jobs=-1,
        verbose=True,
    )
    fitted = search.fit(steel_X, steel_y)
    print(fitted.best_score_, fitted.best_params_, fitted.best_estimator_)
    cv_tables[name] = fitted.cv_results_
    winners.append(fitted.best_estimator_)
    return winners, cv_tables

In [None]:
# Define best_model:
def best_model_bayes(X_train, y_train, steel, name, model):
    '''Bayesian hyper-parameter search (10-fold CV) of one model on one steel
    Args:
        -X_train: features, string index containing the steel id
        -y_train: labels, indexed like X_train
        -steel: substring used to select the rows of one steel
        -name : name of model as str, selects the search space
        -model: initiated model
    return list holding the best estimator and dict {name: cv_results_};
           both are empty when no search space is registered for ``name``
    '''
    # Silence this one message, which would otherwise be repeated for every
    # point the optimiser evaluates again.
    warnings.filterwarnings('ignore', message='The objective has been evaluated at this point before.')

    steel_X = X_train[X_train.index.str.contains(steel)]
    steel_y = y_train[y_train.index.str.contains(steel)]

    # NOTE(review): 'mse'/'mae' criteria and max_features='auto' were
    # deprecated and later removed in recent scikit-learn releases --
    # confirm against the installed version.
    spaces = {
        'Ridge': {'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
                  'alpha': np.arange(0.01, 1, 0.05)},
        'Elastic': {'l1_ratio': [0, 0.25, 0.5, 1],
                    'alpha': np.arange(0.01, 1, 0.02)},
        'Tree': {'max_features': ['auto', 'sqrt', 'log2'],
                 'criterion': ['mse', 'friedman_mse', 'mae'],
                 'max_depth': np.arange(3, 15, 1)},
        'KNN': {'n_neighbors': np.arange(5, 100, 5),
                'weights': ['uniform', 'distance'],
                'algorithm': ['ball_tree', 'kd_tree', 'brute', 'auto']},
        'SVM': {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'C': np.arange(0.01, 1, 0.02)},
        'RF': {'n_estimators': np.arange(100, 1000, 50),
               'max_features': ['auto', 'sqrt', 'log2'],
               'criterion': ['mse', 'mae'],
               'max_depth': np.arange(3, 15, 1),
               'min_weight_fraction_leaf': np.arange(0, 0.6, 0.2)},
        'XGB': {'n_estimators': np.arange(100, 1000, 200),
                # 'gamma' used to be listed twice; only the later value
                # (0.1 .. 0.9) ever took effect, so that one is kept.
                'gamma': np.arange(0.1, 1, 0.2),
                'reg_alpha': [1e-8, 1e-6, 1e-4, 1e-2],
                'reg_lambda': [1e-8, 1e-6, 1e-4, 1e-2],
                'max_depth': np.arange(3, 15, 2),
                'subsample': np.arange(0.3, 1, 0.1),
                'colsample_bytree': np.arange(0.3, 1, 0.1),
                'min_child_weight': np.arange(1, 6, 2)},
    }

    best_model_stack = list()
    results_cv = dict()
    if name not in spaces:
        # No search space registered for this name: nothing is fitted.
        return best_model_stack, results_cv

    search = BayesSearchCV(model, search_spaces=spaces[name],
                           scoring='neg_root_mean_squared_error',
                           cv=10, n_jobs=-1, verbose=0)
    best_clf = search.fit(steel_X, steel_y)
    print(best_clf.best_score_, best_clf.best_params_, best_clf.best_estimator_)
    results_cv[name] = best_clf.cv_results_
    best_model_stack.append(best_clf.best_estimator_)
    return best_model_stack, results_cv

In [None]:
def train_model(X_train, y_train, X_test, y_test, steel, name, model):
    '''
    fit a model on the training rows of one steel, score it on the train and
    test rows of that steel and plot predicted vs. true test values
    Args:
    - steel : id number (substring used to select rows by index)
    - X_train, X_test : pandas df of features
    - y_train, y_test : pandas df of label, shape 1
    - name : name of the model as str
    - model: model with params
    return fitted model, one-row pandas df with columns
           RMSE_train, RMSE_test, R2_train, R2_test, model, type
    '''

    X_train = X_train[X_train.index.str.contains(steel)]
    y_train = y_train[y_train.index.str.contains(steel)]
    X_test = X_test[X_test.index.str.contains(steel)]
    y_test = y_test[y_test.index.str.contains(steel)]

    if name == 'XGB':
        # NOTE: relies on the module-level ``feature_names`` list being
        # defined before this function is called.
        X_train_df = pd.DataFrame(X_train, columns=feature_names)
        reg = model.fit(X_train_df, y_train)
    else:
        reg = model.fit(X_train, y_train)

    # Predict each split once and reuse the result for both metrics.
    y_pred = reg.predict(X_test)
    y_pred_train = reg.predict(X_train)

    R2_train = r2_score(y_train, y_pred_train)
    R2_test = r2_score(y_test, y_pred)

    RMSE_training = np.sqrt(mean_squared_error(y_train, y_pred_train))
    RMSE_testing = np.sqrt(mean_squared_error(y_test, y_pred))

    score_sample = pd.DataFrame({'RMSE_train': RMSE_training, 'RMSE_test': RMSE_testing,
                                 'R2_train': R2_train, 'R2_test': R2_test,
                                 'model': name, 'type': steel
                                 }, index=[0])
    print(score_sample)

    # Predicted vs. true values with the identity line as reference.
    sns.scatterplot(x=y_test, y=y_pred)
    plt.plot([0, 250], [0, 250])
    plt.xlim([0, 250])
    plt.ylim([0, 250])
    plt.xlabel("Fracture Toughness")
    plt.ylabel("Predicted Fracture Toughness")
    plt.title(steel + '\n' + str(model))
    plt.show()

    return reg, score_sample

In [None]:
def real_pred(steel, X_val_real, y_val_real, model, name=None):
    '''
    predict real values using train models, print and plot the scores
    Args:
    - steel : id number (substring used to select rows by index)
    - X_val_real : pandas df of features
    - y_val_real : pandas df of label, shape 1
    - model: trained model
    - name : label of the model used in the score table and the plot title.
             When omitted, the module-level variable ``name`` is used if it
             exists (the previous, implicit behaviour), otherwise the class
             name of ``model``.
    return predictions, one-row pandas df with columns R2, RMSE, model, type
    '''
    if name is None:
        # Legacy behaviour: the label used to be read from a global ``name``
        # (the loop variable of the calling cell).
        name = globals().get('name', type(model).__name__)

    X_val_real = X_val_real[X_val_real.index.str.contains(steel)]
    y_val_real = y_val_real[y_val_real.index.str.contains(steel)]

    reg_real = model.predict(X_val_real)

    # Compute each metric once and reuse it for the printout and the table.
    r2 = r2_score(y_val_real, reg_real)
    rmse = np.sqrt(mean_squared_error(y_val_real, reg_real))

    print("r2 score for testing: ", r2)
    print("RMSE score for testing: ", rmse)

    score_real = pd.DataFrame({'R2': r2, 'RMSE': rmse,
                               'model': name, 'type': steel
                               }, index=[0])

    # Predicted vs. true values with the identity line as reference.
    sns.set(style='white')
    sns.scatterplot(x=y_val_real, y=reg_real)
    plt.plot([0, 250], [0, 250])
    plt.xlim([0, 250])
    plt.ylim([0, 250])
    plt.xlabel("Fracture Toughness")
    plt.ylabel("Predicted Fracture Toughness")
    plt.title(steel + ', ' + name)
    plt.show()

    return reg_real, score_real

In [None]:
def load_data_real(path, scaler):
    '''
    read the measured data, average the rows sharing an ID, add the log
    columns and return the features scaled with an already fitted scaler.
    Args:
    - path : path to csv file (first column is used as index)
    - scaler : sklearn scaler, only ``transform`` is called on it
    return X, y (y is the 'KJIC' column, X the remaining regression columns)
    '''
    raw = pd.read_csv(path, index_col=0)

    # NOTE(review): str.rstrip treats '-12345' as a set of characters, so every
    # trailing '-', '1', '2', '3', '4' or '5' is removed, not only one suffix
    # such as '-3' -- confirm this is what the IDs require.
    raw.index = raw.index.str.rstrip('-12345')

    averaged = raw.groupby('ID').mean().assign(
        log_MS_Avg=lambda frame: np.log(frame['MS_Avg']),
        log_beta_avg=lambda frame: np.log(frame['Beta_avg']),
    )

    # Keep only the regression columns and drop incomplete rows.
    complete = averaged[mc.regression_cols].dropna(how='any')
    labels = complete['KJIC']
    features = complete.drop(columns=['KJIC'])

    scaled = pd.DataFrame(
        scaler.transform(features),
        index=features.index,
        columns=features.columns,
    )
    return scaled, labels

In [None]:
# Define basic features importances:
def get_feature_importance(name, model, feature_names):
    '''return classical feature importances
    Args:
        -name: model name as str; 'Ridge' and 'Elastic' read ``model.coef_``,
               'Tree', 'RF' and 'XGB' read ``model.feature_importances_``
        -model: trained model
        -feature_names: labels used as index of the result
    return importance as a one-column df (column = name) sorted ascending.
           Coefficients are returned as they are; tree importances are
           rescaled so that they sum to 100.
           For 'KNN' and 'SVM' no importance is extracted and an empty df
           is returned.
    raise ValueError for any other name
    '''
    if name in ('KNN', 'SVM'):
        # Previously fell through to an unbound variable (UnboundLocalError).
        return pd.DataFrame(columns=[name])

    if name in ('Ridge', 'Elastic'):
        values = model.coef_
    elif name in ('Tree', 'RF', 'XGB'):
        importance = model.feature_importances_
        values = 100.0 * (importance / importance.sum())
    else:
        raise ValueError('unknown model name: ' + repr(name))

    importance_df = pd.DataFrame(values.T, columns=[name], index=feature_names)
    importance_df.sort_values(name, ascending=True, inplace=True)
    return importance_df

### Import df and train test split

In [None]:
# Load the simulated data through the project helper, passing a fresh MinMaxScaler.
# The helper returns the train/test features and labels plus a scaler
# (presumably the fitted one -- it is reused later to transform the real data).
X_train_ori, X_test_ori, y_train_ori, y_test_ori, scaler = mc.load_data(os.path.join(os.path.dirname(os.getcwd()),'../Data/Merged_data/CopulaGAN_simulated_data_up.csv'),
                                               MinMaxScaler())

In [None]:
# get feature names
# (assumes X_train_ori is a DataFrame, whose iteration yields the column labels;
#  this module-level list is read by train_model for 'XGB' and by the importance plots)
feature_names=list(X_train_ori)

### Dummy regressor on all steel

In [None]:
# Baseline: a regressor that always predicts the median of the training
# labels, scored by mean absolute error over 3 x 10-fold cross-validation.
naive = DummyRegressor(strategy='median')
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    naive,
    X_train_ori,
    y_train_ori,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
print('Baseline: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

### GridSearch CV

In [None]:
# Un-tuned estimators to search over, keyed by the short names that the
# search functions use to select a parameter grid. Insertion order matters:
# the saved result lists follow it.
model_GSCV = {
    'Ridge': Ridge(),
    'Elastic': ElasticNet(),
    'Tree': DecisionTreeRegressor(),
    'KNN': KNeighborsRegressor(),
    'SVM': SVR(),
    'RF': RandomForestRegressor(),
    'XGB': xgb.XGBRegressor(
        objective='reg:squarederror',
        eval_metric='rmse',
        learning_rate=0.01,
        nthread=4,
        seed=42,
    ),
}


#### Bayesian search

In [None]:
# Make sure the output folder exists before the (long) search starts, so a
# missing directory cannot abort the run when the results are written.
os.makedirs('Results_CV', exist_ok=True)

all_results_bayes = dict()
# One search per steel; the steel id is the part of the index before the
# first '-'. Sorted so that the run order is the same on every execution.
for steel in sorted(set(X_train_ori.index.str.split('-').str[0])):
    print(steel)
    results_best_model_bayes = list()
    # cv_results_ tables of this steel; kept in memory only in this cell.
    scoring_bayes = dict()
    for name, model in model_GSCV.items():

        # scores = ([best estimator], {name: cv_results_})
        scores = best_model_bayes(X_train_ori, y_train_ori, steel, name, model)
        results_best_model_bayes.append(scores[0][0])
        scoring_bayes[name] = pd.DataFrame(scores[1][name])

    all_results_bayes[steel] = results_best_model_bayes

    #save params
    # Rewritten after every steel so finished searches survive an interruption.
    with open('Results_CV/Bayes_all_results_CV.p', 'wb') as fp:
        pickle.dump(all_results_bayes, fp, protocol=pickle.HIGHEST_PROTOCOL)


#### Classic gridsearch

In [None]:
# Make sure the output folder exists before the (long) search starts, so a
# missing directory cannot abort the run when the results are written.
os.makedirs('Results_CV', exist_ok=True)

all_results = dict()
# One search per steel; the steel id is the part of the index before the
# first '-'. Sorted so that the run order is the same on every execution.
for steel in sorted(set(X_train_ori.index.str.split('-').str[0])):
    print(steel)
    results_best_model = list()
    scoring = dict()
    for name, model in model_GSCV.items():

        # scores = ([best estimator], {name: cv_results_})
        scores = best_model(X_train_ori, y_train_ori, steel, name, model)
        results_best_model.append(scores[0][0])
        scoring[name] = pd.DataFrame(scores[1][name])

    #save params
    # One workbook per steel, one sheet per model with its full CV table.
    with pd.ExcelWriter(os.path.join(os.getcwd(), 'Results_CV/' + steel + '_result_CV.xlsx')) as writer:
        for df_name, df in scoring.items():
            df.to_excel(writer, sheet_name=df_name)
    all_results[steel] = results_best_model
    # Rewritten after every steel so finished searches survive an interruption.
    with open('Results_CV/all_results_CV.p', 'wb') as fp:
        pickle.dump(all_results, fp, protocol=pickle.HIGHEST_PROTOCOL)


If you need to add another regressor:

Create a new dictionary in the for loop (for example an `all_results_ridge` dict) and comment out the other regressors in the `model_GSCV` dict,
then merge the results as shown below and rerun the pickle save:


`for k, v in dict_results.items():
    if k in all_results_ridge.keys():
        all_results_ridge[k] += v
    else:
        all_results_ridge[k] = v`


## Regression

In [None]:
# Reload the dictionary written by the Bayesian search cell
# (steel id -> list of best estimators).
# NOTE: unpickling can execute arbitrary code -- only load files produced by this notebook.
with open('Results_CV/Bayes_all_results_CV.p', 'rb') as fp:
    dict_results = pickle.load(fp)
    print(dict_results)

In [None]:
# Names paired with the pickled estimator lists; the order must match the
# insertion order of model_GSCV used when the lists were built.
model_names = ['Ridge', 'Elastic', 'Tree', 'KNN', 'SVM', 'RF', 'XGB']
all_regressors = dict()
# One-row score frames, concatenated once at the end
# (DataFrame.append no longer exists in pandas 2).
score_frames = []

# Make sure both output folders exist before anything is written.
os.makedirs('Results_CV', exist_ok=True)
os.makedirs('Results_reg', exist_ok=True)

for steel in sorted(set(X_train_ori.index.str.split('-').str[0])):

    # Create model_dict
    list_results = dict_results[steel]
    models = dict(zip(model_names, list_results))

    # Fit models
    regressors = list()
    for name, model in models.items():
        reg, pred = train_model(X_train_ori, y_train_ori, X_test_ori, y_test_ori, steel, name, model)
        regressors.append(reg)
        score_frames.append(pred)
    all_regressors[steel] = regressors

    #save params
    # Rewritten after every steel so fitted models survive an interruption.
    with open('Results_CV/all_regressors.p', 'wb') as fp:
        pickle.dump(all_regressors, fp, protocol=pickle.HIGHEST_PROTOCOL)

# pd.concat refuses an empty list, so fall back to an empty frame.
metrics = pd.concat(score_frames, ignore_index=True) if score_frames else pd.DataFrame()
metrics.to_csv('Results_reg/RMSE_R2_train_test_simulated_data.csv')
    

In [None]:
# Ordered categorical type for the model names; the plotting cells cast the
# 'model' column to it.
model_cat = pd.api.types.CategoricalDtype(categories=['Ridge', 'Elastic', 'Tree', 'KNN', 'SVM', 'RF', 'XGB'],
                            ordered=True)

In [None]:
# Long format: one row per (model, steel, train/test) RMSE value.
RMSE_df = metrics.melt(
    id_vars=['model', 'type'],
    value_vars=['RMSE_train', 'RMSE_test'],
    value_name='RMSE',
)
RMSE_df['model'] = RMSE_df['model'].astype(model_cat)
# The 'A286' rows are left out of the figure.
RMSE_df = RMSE_df.loc[RMSE_df['type'] != 'A286']

sns.set(style='whitegrid')
g = sns.relplot(
    data=RMSE_df,
    x='model',
    y='RMSE',
    hue='variable',
    kind='line',
    col='type',
    col_wrap=2,
    facet_kws={'sharex': False, 'sharey': False},
)
# Flip every panel so that lower RMSE is drawn higher up.
for yaxes in g.fig.get_axes():
    yaxes.invert_yaxis()

In [None]:
# Long format: one row per (model, steel, train/test) R2 value.
# (The frame keeps the name RMSE_df although it now holds R2 scores.)
RMSE_df = metrics.melt(
    id_vars=['model', 'type'],
    value_vars=['R2_train', 'R2_test'],
    value_name='R2 score',
)
RMSE_df['model'] = RMSE_df['model'].astype(model_cat)

# The 'A286' rows are left out of the figure.
RMSE_df = RMSE_df.loc[RMSE_df['type'] != 'A286']

sns.set(style='whitegrid')
g = sns.relplot(
    data=RMSE_df,
    x='model',
    y='R2 score',
    hue='variable',
    kind='line',
    col='type',
    col_wrap=2,
    facet_kws={'sharex': False, 'sharey': False},
)


# Test on real data

In [None]:
# Reload the dictionary written by the regression cell
# (steel id -> list of fitted regressors).
# NOTE: unpickling can execute arbitrary code -- only load files produced by this notebook.
with open('Results_CV/all_regressors.p', 'rb') as fp:
    dict_regressors = pickle.load(fp)
    print(dict_regressors)

In [None]:
# Load the measured data and scale it with the scaler returned by mc.load_data,
# so that it goes through the same transform as the simulated data.
X_val_real, y_val_real = load_data_real(os.path.join(os.path.dirname(os.getcwd()), '../Data/Merged_data/MERGE_FT_TEP_UT_on_ID.csv'),
               scaler) # using the scaler from the load_data

In [None]:
# Names paired with the pickled regressor lists; defined here so the cell
# does not depend on a later cell having been run first.
model_names = ['Ridge', 'Elastic', 'Tree', 'KNN', 'SVM', 'RF', 'XGB']

# Make sure the output folder exists before anything is written.
os.makedirs('Results_reg', exist_ok=True)

# One-row score frames, concatenated once at the end
# (DataFrame.append no longer exists in pandas 2).
real_score_frames = []

for steel in sorted(set(X_train_ori.index.str.split('-').str[0])):

    # Create model_dict
    list_reg = dict_regressors[steel]
    models = dict(zip(model_names, list_reg))

    # The true values are the same for every model, so they are stored once.
    pred = {'True_FT': y_val_real[y_val_real.index.str.contains(steel)]}

    # The loop variable must stay called ``name``: real_pred may read it
    # from module scope for its labels.
    for name, model in models.items():
        reg_real, score_real = real_pred(steel, X_val_real, y_val_real, model)
        pred[name] = reg_real
        real_score_frames.append(score_real)

    pd.DataFrame(pred).to_csv('Results_reg/prediction_original_data_' + steel + '.csv')

#save params
# pd.concat refuses an empty list, so fall back to an empty frame.
real_metrics = pd.concat(real_score_frames, ignore_index=True) if real_score_frames else pd.DataFrame()
real_metrics.to_csv('Results_reg/RMSE_R2_original_data.csv')


In [None]:
# Long format of the scores on the measured data. ``real_metrics`` is the
# frame built by the prediction cell (``all_real_metrics`` was never defined).
# Both metrics are stacked, so melt's default 'variable'/'value' columns are kept.
RMSE_real_df = real_metrics.melt(id_vars=['model', 'type'],
                                 value_vars=['R2', 'RMSE'])
RMSE_real_df.model = RMSE_real_df.model.astype(model_cat)

# The 'A286' rows are left out of the figure.
RMSE_real_df = RMSE_real_df[RMSE_real_df.type != 'A286']

sns.set(style='whitegrid')
# One row of panels per metric, one column per steel.
g = sns.relplot(x='model', y='value', kind='line', data=RMSE_real_df, row='variable',
                col='type', facet_kws={'sharex': False, 'sharey': False})

# Feature importance

In [None]:
# Names paired with the pickled estimator lists of the Bayesian search.
model_names = ['Ridge', 'Elastic', 'Tree', 'KNN', 'SVM', 'RF', 'XGB']
sns.set(style='white')
for steel in set(X_train_ori.index.str.split('-').str[0]):

    # Create model_dict
    list_results = dict_results[steel]
    models = dict(zip(model_names, list_results))

    for name, model in models.items():
        # No importances are extracted for these two models.
        if name in ('KNN', 'SVM'):
            continue

        classic = get_feature_importance(name, model, feature_names)
        bar_colors = [sns.color_palette(palette='PuBu', n_colors=len(feature_names))]
        classic.plot.barh(figsize=(5,5), color=bar_colors,
                          legend=False, title=name + ', ' + steel)

        # Label the axis with the kind of importance that was plotted.
        if name in ('Ridge', 'Elastic', 'SVM'):
            plt.xlabel('coefficients')

        if name in ('RF', 'Tree'):
            plt.xlabel('relative gini importance')

        if name == 'XGB':
            plt.xlabel('relative weight importance')

