Regression on simulated data (Linear, Ridge, ElasticNet, Lasso), no tuning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster import hierarchy
from sklearn.preprocessing import MinMaxScaler
import os, glob, inspect, sys
import re

# Put the parent directory of this file's directory at the front of sys.path
# so that the helper module imported below can be found there.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
import epri_mc_lib_2 as mc
from importlib import reload
# Re-execute the helper module (presumably so that local edits to it are
# picked up without restarting the session).
reload(mc)

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
# Define basic features importances:
def get_feature_importance(name, model, feature_names):
    '''Return the classical feature importances of a trained model.

    Args:
        -name:str, model label. 'Ridge', 'Elastic' and 'Lasso' read the
            coefficients from model.coef_; 'RF', 'Tree' and 'XGB' read
            model.feature_importances_ and rescale them to sum to 100.
        -model: trained model exposing the attribute required by `name`
        -feature_names: labels used as the index of the result, one per feature
    return importance as a df with a single column called `name`,
        sorted in ascending order
    Raises:
        ValueError: if `name` has no classical importance ('KNN', 'SVM') or
            is not one of the labels listed above.
    '''
    linear_models = ('Ridge', 'Elastic', 'Lasso')
    tree_models = ('RF', 'Tree', 'XGB')

    if name in linear_models:
        importance = model.coef_
    elif name in tree_models:
        raw_importance = model.feature_importances_
        # Express every importance as a percentage of the total.
        importance = 100.0 * (raw_importance / raw_importance.sum())
    else:
        # Previously these names fell through every branch and the function
        # failed on an unassigned local; fail with an explicit message instead.
        raise ValueError(
            "No classical feature importance available for model '%s'; "
            "expected one of %s" % (name, linear_models + tree_models)
        )

    importance_df = pd.DataFrame(importance.T, columns=[name], index=feature_names)
    importance_df.sort_values(name, ascending=True, inplace=True)
    return importance_df

### Import data and merge replicates

In [None]:
# Load the simulated dataset; the first CSV column is used as the index.
# NOTE(review): the path starts from the parent of the current working
# directory and then climbs one more level via '../' - confirm this matches
# the directory layout the notebook is run from.
merged_simulated = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), '../Data/Merged_data/CopulaGAN_simulated_data_up.csv'),
                    index_col=0)


## Select columns of interest

In [None]:
# Work on a copy so the loaded frame stays untouched.
mean_df = merged_simulated.copy()

# Trailing digits of every `type_cw` label.
cw_regex = re.compile(r"[0-9]+$")
cold_work = [str(re.search(cw_regex, x).group()) for x in mean_df.type_cw]

# Keep the original index values as the KJIC column before the index is replaced.
mean_df['KJIC'] = mean_df.index

# Leading part of every `type_cw` label (optional 'A' prefix followed by
# digits, e.g. '304' or 'A286') becomes the new index.
cw_regex = re.compile(r"^[A]*[0-9]+")
mean_df.index = [str(re.search(cw_regex, x).group()) for x in mean_df.type_cw]

# Take an explicit copy of the column subset: new columns are assigned to it
# below, and doing that on a bare selection triggers pandas'
# SettingWithCopyWarning.
mean_df = mean_df[["KJIC","MS_Avg","TEP_average","Beta_avg","IF_amp_2.25MHz","IF_amp_3.5MHz","BS_amp"]].copy()
mean_df['log_MS_Avg'] = np.log(mean_df['MS_Avg'])
mean_df['log_beta_avg'] = np.log(mean_df['Beta_avg'])

# Keep the target (raw and log-transformed) aside, then drop it and the
# columns that were replaced by their logarithm from the feature frame.
log_kjic = np.log(mean_df.KJIC)
mean_kjic = mean_df.KJIC
mean_df.drop(columns=['KJIC','MS_Avg','Beta_avg'], inplace=True)


## Scale data

In [None]:
# Scale the feature columns with the project helper and a MinMaxScaler; only
# element [0] of the helper's return value is used
# (presumably the scaled DataFrame - TODO confirm against epri_mc_lib_2).
scaled_df = mc.scale_general(mean_df, MinMaxScaler())[0]
# Restore the steel-type index on the scaled frame.
scaled_df.index = mean_df.index
#scaled_df["cold_work"] = cold_work
# NOTE(review): the target scaled here is `mean_kjic` (raw KJIC), not the
# log-transformed `log_kjic` - confirm which one is intended.
scaled_kjic = mc.scale_general(pd.DataFrame(mean_kjic), MinMaxScaler())[0]
scaled_kjic.index = mean_df.index
# Append the scaled target so features and target live in one frame.
scaled_df['KJIC'] = scaled_kjic


In [None]:
# Heatmap of the pairwise correlations between all columns of scaled_df
# (the scaled features plus the scaled KJIC target).
sns.heatmap(scaled_df.corr())

In [None]:
# Separate the scaled data by steel type, using the index labels.
SS_304 = scaled_df.loc[scaled_df.index == '304']
SS_316 = scaled_df.loc[scaled_df.index == '316']
SS_347 = scaled_df.loc[scaled_df.index == '347']
SS_A286 = scaled_df.loc[scaled_df.index == 'A286']

In [None]:
# Display labels of the four steel types.
# NOTE(review): not referenced by any of the cells visible here.
SS_list=["SS304","SS316","SS347","SSA286"]

## Linear Regression

In [None]:
def lr_sstype(scaled_df,name):
    '''Fit a linear regression on one steel subset and report its fit.

    Args:
        -scaled_df: DataFrame with a 'KJIC' target column; every other
            column is used as a predictor
        -name:str, label appended to the plot title
    Prints the train/test shapes, the intercept, the coefficients, the r2
    score on both splits and MAE/MSE/RMSE on the test split, then draws a
    scatter of actual vs. predicted test values.
    return None
    '''
    predictors = scaled_df.drop(columns='KJIC')
    target = scaled_df['KJIC']

    # Hold out 30% of the rows; the fixed seed keeps the split reproducible.
    train_X, test_X, train_y, test_y = train_test_split(
        predictors, target, test_size=0.3, random_state=50
    )
    print("Training dataset:", train_X.shape)
    print("Test dataset:", test_X.shape)

    model = LinearRegression()
    model.fit(train_X, train_y)
    test_pred = model.predict(test_X)

    print("Intercept:", model.intercept_)
    coeff_df = pd.DataFrame(model.coef_, index=predictors.columns, columns=['Coefficient'])
    print("Coefficients:", coeff_df)

    train_pred = model.predict(train_X)
    print("r2 score for training: ", r2_score(train_y, train_pred))
    print("r2 score for testing: ", r2_score(test_y, test_pred))

    # Error metrics on the held-out rows; RMSE is derived from the same MSE.
    test_mae = metrics.mean_absolute_error(test_y, test_pred)
    test_mse = metrics.mean_squared_error(test_y, test_pred)
    print('MAE:', test_mae)
    print('MSE:', test_mse)
    print('RMSE:', np.sqrt(test_mse))

    plt.scatter(test_y, test_pred)
    plt.xlabel("Fracture Toughness")
    plt.ylabel("Predicted Fracture Toughness")
    plt.title("Linear Regression of "+ name)

In [None]:
# Linear-regression report for the 304 subset.
lr_sstype(SS_304,"SS304")

## Ridge Regression

In [None]:
def rr_sstype(scaled_df,name):
    '''Fit a Ridge regression (alpha=0.01) on one steel subset and report its fit.

    Args:
        -scaled_df: DataFrame with a 'KJIC' target column; every other
            column is used as a predictor
        -name:str, label appended to the plot title
    Prints the train/test shapes and the RMSE and r2 score on both splits,
    then draws actual vs. predicted test values together with a line from
    (0, 0) to (1, 1), both axes limited to [0, 1].
    return None
    '''
    predictors = scaled_df.drop(columns='KJIC')
    target = scaled_df['KJIC']

    # Hold out 30% of the rows; the fixed seed keeps the split reproducible.
    train_X, test_X, train_y, test_y = train_test_split(
        predictors, target, test_size=0.3, random_state=50
    )
    print("Training dataset:", train_X.shape)
    print("Test dataset:", test_X.shape)

    ridge = Ridge(alpha=0.01)
    ridge.fit(train_X, train_y)

    # Same two metrics for each split: training rows first, then test rows.
    reports = (
        ("RMSE train:", "r2 score for training: ", train_X, train_y),
        ("RMSE test:", "r2 score for test: ", test_X, test_y),
    )
    for rmse_label, r2_label, split_X, split_y in reports:
        predicted = ridge.predict(split_X)
        print(rmse_label, np.sqrt(mean_squared_error(split_y, predicted)))
        print(r2_label, r2_score(split_y, predicted))
    # The last loop iteration handled the test split.
    test_pred = predicted

    plt.scatter(test_y, test_pred)
    plt.plot([0, 1], [0, 1])
    plt.ylim([0, 1])
    plt.xlim([0, 1])
    plt.xlabel("Fracture Toughness")
    plt.ylabel("Predicted Fracture Toughness")
    plt.title("Ridge Regression of "+ name)

In [None]:
# Ridge-regression report for the 304 subset.
rr_sstype(SS_304,"SS304")

In [None]:
# Ridge-regression report for the 316 subset.
rr_sstype(SS_316,"SS316")

In [None]:
# Ridge-regression report for the 347 subset.
rr_sstype(SS_347,"SS347")

## Elastic net

In [None]:
def enet_sstype(scaled_df,name):
    '''Fit an ElasticNet regression (alpha=0.01) on one steel subset and report its fit.

    Args:
        -scaled_df: DataFrame with a 'KJIC' target column; every other
            column is used as a predictor
        -name:str, label appended to the plot title
    Prints the train/test shapes and the RMSE and r2 score on both splits,
    then draws actual vs. predicted test values together with a line from
    (0, 0) to (1, 1), both axes limited to [0, 1].
    return None
    '''
    target = scaled_df['KJIC']
    predictors = scaled_df.drop(columns='KJIC')

    # Hold out 30% of the rows; the fixed seed keeps the split reproducible.
    train_X, test_X, train_y, test_y = train_test_split(
        predictors, target, test_size=0.3, random_state=50
    )
    print("Training dataset:", train_X.shape)
    print("Test dataset:", test_X.shape)

    # Elastic net model
    enet = ElasticNet(alpha=0.01)
    enet.fit(train_X, train_y)

    # Fit quality on the rows the model was trained on.
    train_pred = enet.predict(train_X)
    train_rmse = np.sqrt(mean_squared_error(train_y, train_pred))
    print("RMSE train:", train_rmse)
    train_r2 = r2_score(train_y, train_pred)
    print("r2 score for training: ", train_r2)

    # Fit quality on the held-out rows.
    test_pred = enet.predict(test_X)
    test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))
    print("RMSE test:", test_rmse)
    test_r2 = r2_score(test_y, test_pred)
    print("r2 score for test: ", test_r2)

    plt.scatter(test_y, test_pred)
    plt.plot([0, 1], [0, 1])
    plt.ylim([0, 1])
    plt.xlim([0, 1])
    plt.ylabel("Predicted Fracture Toughness")
    plt.xlabel("Fracture Toughness")
    plt.title("Elastic Net Regression of "+ name)

In [None]:
# Elastic-net report for the 304 subset.
enet_sstype(SS_304,"SS304")

In [None]:
# Elastic-net report for the 316 subset.
enet_sstype(SS_316,"SS316")

In [None]:
# Elastic-net report for the 347 subset.
enet_sstype(SS_347,"SS347")

## Lasso 

In [None]:
def lasso_sstype(scaled_df,name, alpha):
    '''Fit a Lasso regression on one steel subset, report its fit and plot its coefficients.

    Args:
        -scaled_df: DataFrame with a 'KJIC' target column; every other
            column is used as a predictor
        -name:str, label used in the printed header and the plot titles
        -alpha: regularisation strength passed to Lasso
    Prints the train/test shapes and the RMSE and r2 score on both splits.
    Draws two figures: actual vs. predicted test values (annotated with the
    test RMSE and r2), and a horizontal bar chart of the fitted coefficients.
    return None
    '''
    X = scaled_df.drop('KJIC', axis = 1)
    y = scaled_df['KJIC']

    # Hold out 30% of the rows; the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=50)

    print(name+", "+str(alpha))
    print("Training dataset:", X_train.shape)
    print("Test dataset:", X_test.shape)

    # Lasso model
    model_lasso = Lasso(alpha=alpha)
    model_lasso.fit(X_train, y_train)
    pred_train_lasso = model_lasso.predict(X_train)
    print()
    print("RMSE train:", np.sqrt(mean_squared_error(y_train,pred_train_lasso)))
    print("r2 score for train: ", r2_score(y_train, pred_train_lasso))

    # Compute the test metrics once; they are printed and also annotated on the plot.
    pred_test_lasso = model_lasso.predict(X_test)
    rmse_test = np.sqrt(mean_squared_error(y_test,pred_test_lasso))
    r2_test = r2_score(y_test, pred_test_lasso)
    print("RMSE test:", rmse_test)
    print("r2 score for test: ", r2_test)

    # Figure 1: actual vs. predicted on the held-out rows, with a 1:1 line.
    plt.figure()
    plt.scatter(y_test,pred_test_lasso)
    plt.plot([0, 1], [0, 1])
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.title("LASSO Regression of "+ name+", "+str(alpha))
    plt.text(0.1, 0.90, 'RMSE: '+str(round(rmse_test,3)))
    plt.text(0.1, 0.82, '  R^2: '+str(round(r2_test,3)))
    plt.xlabel("Fracture Toughness")
    plt.ylabel("Predicted Fracture Toughness")

    # Figure 2: coefficient bar chart. The axes are created here and handed to
    # pandas; calling plt.figure() and then plotting without `ax` made pandas
    # open a second figure and left the first one empty.
    _, coef_ax = plt.subplots(figsize=(5,5))
    classic = get_feature_importance("Lasso", model_lasso, X_train.columns)
    # The palette is wrapped in a list so that each bar of the single column
    # gets its own colour.
    bar_colors = [sns.color_palette(palette='PuBu', n_colors=len(X_train.columns))]
    classic.plot.barh(ax=coef_ax, color=bar_colors,
        legend=False, title=name+", "+str(alpha) )
    coef_ax.set_xlabel('coefficients')

    return

In [None]:
# Lasso report for the 304 subset at three regularisation strengths.
for alpha in [0.01, 0.002, 0.001]:
    lasso_sstype(SS_304,"SS304", alpha)


In [None]:
# Lasso report for the 316 subset at three regularisation strengths.
for alpha in [0.01, 0.002, 0.001]:
    lasso_sstype(SS_316,"SS316", alpha)


In [None]:
# Lasso report for the 347 subset at three regularisation strengths.
for alpha in [0.01, 0.002, 0.001]:
    lasso_sstype(SS_347,"SS347", alpha)
