Run PCA and plot regression on PC1

Plot feature importance and explained variance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster import hierarchy
from sklearn.preprocessing import MinMaxScaler
import os, glob, inspect, sys
import re
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures

# Put the parent of the working directory on sys.path so the project-local
# helper module `epri_mc_lib_2` can be imported.
# The working directory is used as the anchor (the data-loading cell relies on
# it as well). inspect.getfile(inspect.currentframe()) is not a reliable anchor
# inside an IPython kernel: the frame's "file" is a synthetic cell name or a
# temporary file, not the notebook itself.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
import epri_mc_lib_2 as mc
from importlib import reload
# Pick up edits made to the helper module since it was first imported.
reload(mc)


### Import data and merge replicates

In [None]:
# Load the merged measurement table; the first CSV column becomes the index.
merged = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), '../Data/Merged_data/MERGE_FT_TEP_UT_on_ID.csv'),
                    index_col=0)
# Drop the trailing replicate marker from each ID so replicates share one key.
# NOTE(review): rstrip() removes any trailing run of the characters '-', '1'..'5',
# not a literal "-<n>" suffix -- confirm no ID legitimately ends in one of them.
merged.index = merged.index.str.rstrip('-12345')
# Average the replicates. numeric_only=True keeps the cell working on pandas >= 2.0,
# where mean() raises if the frame contains non-numeric columns.
# Presumably the index is named 'ID' -- verify against the CSV header.
mean_df = merged.groupby('ID').mean(numeric_only=True)

# Split the trailing cold-work digits off the ID into their own column.
cw_regex = re.compile("[0-9]+$") 
cw_matches = [cw_regex.search(sample_id) for sample_id in mean_df.index]
ids_without_level = [sample_id for sample_id, found in zip(mean_df.index, cw_matches) if found is None]
if ids_without_level:
    # Fail with the offending IDs instead of an opaque AttributeError on None.
    raise ValueError("No trailing cold-work digits in ID(s): {}".format(ids_without_level))
mean_df['cold_work'] = [found.group() for found in cw_matches]
# Remove the cold-work digits, then the separator, leaving only the base ID.
# NOTE(review): rstrip('02468') strips every trailing even digit; this is only
# safe when a '-' sits between the base ID and the cold-work value -- confirm.
mean_df.index = mean_df.index.str.rstrip('02468')
mean_df.index = mean_df.index.str.rstrip('-')


## Select columns of interest

In [None]:
# Keep only the measurements of interest, discard incomplete rows and the
# 'A286' entry.
columns_of_interest = ["KJIC","MS_Avg","TEP_average","Beta_avg","IF_amp_2.25MHz","IF_amp_3.5MHz","BS_amp","cold_work"]
selected = mean_df[columns_of_interest].dropna().drop('A286', axis=0)

# Target variable, in original units and log-transformed.
mean_kjic = selected.KJIC
log_kjic = np.log(mean_kjic)

# Swap MS_Avg and Beta_avg for their logarithms and remove the target from
# the feature table.
mean_df = (
    selected
    .assign(
        log_MS_Avg=np.log(selected['MS_Avg']),
        log_beta_avg=np.log(selected['Beta_avg']),
    )
    .drop(columns=['KJIC','MS_Avg','Beta_avg'])
)


## Scale data

In [None]:
# mc.scale_general is a project helper; only the first element of what it
# returns is used here (presumably the scaled frame -- verify in epri_mc_lib_2).
feature_scaling = mc.scale_general(mean_df, MinMaxScaler())
scaled_df = feature_scaling[0]

# The target is scaled separately, as a one-column frame of log KJIC.
target_scaling = mc.scale_general(log_kjic.to_frame(), MinMaxScaler())
scaled_kjic = target_scaling[0]


### PCA

In [None]:
# Scaled cold-work values mapped back to readable percentage labels.
level_labels = {0.00:'0%',0.25:'20%',0.50:'40%',0.75:'60%',1.00:'80%'}

# One 4-component PCA and one biplot (PC1 vs PC2) per distinct index label.
for material in scaled_df.index.unique():
    color_dict = { '0%':'red', '20%':'blue', '40%':'green', '60%':'orange', '80%':'pink' }
    material_rows = scaled_df[scaled_df.index == material]
    # Index the rows by cold-work level (removing it from the features) and relabel.
    subset_dr = material_rows.set_index('cold_work').rename(index=level_labels)
    pca = PCA(n_components=4, svd_solver='full').fit(subset_dr)
    mc.biplot(pca, subset_dr, 0, 1, "PCA biplot "+material)

### Comparing to features

In [None]:
colors=['blue','red','orange']
markers=['.','^','*']

# KJIC in original units against cold-work level, one line per index label.
# The unscaled `mean_kjic` is plotted: exponentiating the min-max-scaled log
# values does not recover KJIC, and the axis label must match what is drawn.
fig, ax = plt.subplots()
for i, ind in enumerate(scaled_df.index.unique()):
    subset_dr = scaled_df[scaled_df.index==ind].copy()
    # Assumes mean_kjic rows are in the same order as scaled_df rows -- TODO confirm.
    subset_kjic = mean_kjic[mean_kjic.index==ind]
    subset_dr.index = subset_dr.cold_work
    # Relabel the scaled cold-work values as percentages for the x axis.
    subset_dr.rename(index={0.00:'0%',0.25:'20%',0.50:'40%',0.75:'60%',1.00:'80%'},inplace=True)
    # Cycle through the style lists so a fourth group does not raise IndexError.
    ax.plot(subset_dr.index, subset_kjic.to_numpy(), label=ind,
            c=colors[i % len(colors)], marker=markers[i % len(markers)])
ax.set_xlabel('Cold Work')
ax.set_ylabel('KJIC')
ax.legend();

In [None]:
colors=['blue','red','orange']
markers=['.','^','*']
level_labels = {0.00:'0%',0.25:'20%',0.50:'40%',0.75:'60%',1.00:'80%'}

# Scaled log KJIC against cold-work level, one line per distinct index label.
for i, material in enumerate(scaled_df.index.unique()):
    feature_rows = scaled_df[scaled_df.index == material]
    subset_kjic = scaled_kjic[scaled_kjic.index == material]
    # Index by cold-work level (column kept) and show the levels as percentages.
    subset_dr = feature_rows.set_index('cold_work', drop=False).rename(index=level_labels)
    plt.plot(subset_dr.index, subset_kjic, label=material, c=colors[i], marker=markers[i])
    plt.xlabel('Cold Work')
    plt.ylabel('log KJIC')
    plt.legend()

In [None]:
colors=['blue','red','orange']
markers=['.','^','*']
level_labels = {0.00:'0%',0.25:'20%',0.50:'40%',0.75:'60%',1.00:'80%'}

# Scaled log KJIC against scaled TEP, one marker style per distinct index label.
for i, material in enumerate(scaled_df.index.unique()):
    feature_rows = scaled_df[scaled_df.index == material]
    subset_kjic = scaled_kjic[scaled_kjic.index == material]
    # Rows are re-indexed by cold-work label as in the neighbouring cells;
    # the scatter itself only reads the TEP_average column.
    subset_dr = feature_rows.set_index('cold_work', drop=False).rename(index=level_labels)
    plt.scatter(subset_dr.TEP_average, subset_kjic, label=material, c=colors[i], marker=markers[i])
    plt.xlabel('TEP')
    plt.ylabel('log KJIC')
    plt.legend()

In [None]:
colors=['blue','red','orange']
markers=['.','^','*']
level_labels = {0.00:'0%',0.25:'20%',0.50:'40%',0.75:'60%',1.00:'80%'}

# Scaled log KJIC against the scaled IF_amp_2.25MHz feature, one marker style
# per distinct index label.
for i, material in enumerate(scaled_df.index.unique()):
    feature_rows = scaled_df[scaled_df.index == material]
    subset_kjic = scaled_kjic[scaled_kjic.index == material]
    # Rows are re-indexed by cold-work label as in the neighbouring cells;
    # the scatter itself only reads the IF_amp_2.25MHz column.
    subset_dr = feature_rows.set_index('cold_work', drop=False).rename(index=level_labels)
    plt.scatter(subset_dr['IF_amp_2.25MHz'], subset_kjic, label=material, c=colors[i], marker=markers[i])
    plt.xlabel('IF_amp_2.25MHz')
    plt.ylabel('log KJIC')
    plt.legend()

### "Regression" models

At this point a formal regression is pointless; it would just be a linear regression anyway. Instead, I'll plot the data so you can eyeball how a fit line would behave. 304 and 316 look linear, with TEP, MS, and IF contributing.

In [None]:
colors=['blue','red','orange']
markers=['.','^','*']

# Project each group's features onto its first principal component and
# scatter that against the scaled log KJIC.
for i, material in enumerate(scaled_df.index.unique()):
    features = scaled_df[scaled_df.index == material].drop(columns=['cold_work'])
    subset_kjic = scaled_kjic[scaled_kjic.index == material]
    true_Y = subset_kjic  # alias only; not read anywhere in this cell
    xs = PCA(n_components=1, svd_solver='full').fit_transform(features)
    plt.scatter(xs, subset_kjic, label=material, c=colors[i], marker=markers[i])
    plt.xlabel('PC1')
    plt.ylabel('log KJIC')
    plt.legend()


In [None]:
# Display the scaled feature table (output of mc.scale_general with MinMaxScaler) for inspection.
scaled_df

In [None]:
# Defined in this cell so it does not depend on variables left over from an
# earlier cell.
colors=['blue','red','orange']
markers=['.','^','*']

# Linear fit of scaled log KJIC on PC1, all groups overlaid on one figure.
for i, ind in enumerate(scaled_df.index.unique()):
    subset_dr = scaled_df[scaled_df.index==ind].drop(columns=['cold_work'])
    subset_kjic = scaled_kjic[scaled_kjic.index==ind]
    pca = PCA(n_components=1, svd_solver='full')
    xs = pca.fit_transform(subset_dr)
    # x and y are passed by keyword (newer seaborn rejects them positionally)
    # and flattened to 1-D, which is what regplot expects.
    sns.regplot(x=xs[:, 0], y=np.ravel(subset_kjic), scatter=True, fit_reg=True,
                color=colors[i % len(colors)], label=ind, marker=markers[i % len(markers)])
plt.xlabel('PC1')
plt.ylabel('log KJIC')
plt.legend();


In [None]:
# Defined in this cell so it does not depend on variables left over from an
# earlier cell.
colors=['blue','red','orange']
markers=['.','^','*']

# Linear fit of KJIC (original units) on PC1, all groups overlaid on one figure.
for i, ind in enumerate(scaled_df.index.unique()):
    subset_dr = scaled_df[scaled_df.index==ind].drop(columns=['cold_work'])
    subset_kjic = mean_kjic[mean_kjic.index==ind]
    pca = PCA(n_components=1, svd_solver='full')
    xs = pca.fit_transform(subset_dr)
    # x and y are passed by keyword (newer seaborn rejects them positionally)
    # and flattened to 1-D, which is what regplot expects.
    sns.regplot(x=xs[:, 0], y=np.ravel(subset_kjic), scatter=True, fit_reg=True,
                color=colors[i % len(colors)], label=ind, marker=markers[i % len(markers)])
plt.xlabel('PC1')
plt.ylabel('KJIC')
# Fixed y range so this figure is comparable with the per-group KJIC plots.
plt.ylim(0,225)
plt.legend();


In [None]:
# Defined in this cell so it does not depend on variables left over from an
# earlier cell.
colors=['blue','red','orange']
markers=['.','^','*']

# Linear fit of scaled log KJIC on PC1, one separate figure per group.
for i, ind in enumerate(scaled_df.index.unique()):
    subset_dr = scaled_df[scaled_df.index==ind].drop(columns=['cold_work'])
    subset_kjic = scaled_kjic[scaled_kjic.index==ind]
    pca = PCA(n_components=1, svd_solver='full')
    xs = pca.fit_transform(subset_dr)
    plt.figure()
    # x and y are passed by keyword (newer seaborn rejects them positionally)
    # and flattened to 1-D, which is what regplot expects.
    sns.regplot(x=xs[:, 0], y=np.ravel(subset_kjic), scatter=True, fit_reg=True,
                color=colors[i % len(colors)], label=ind, marker=markers[i % len(markers)])
    # The target was min-max scaled, so the full range is [0, 1].
    plt.ylim(0,1)
    plt.xlabel('PC1')
    plt.ylabel('log KJIC')
    plt.legend()


In [None]:
# Defined in this cell so it does not depend on variables left over from an
# earlier cell.
colors=['blue','red','orange']
markers=['.','^','*']

# Linear fit of KJIC (original units) on PC1, one separate figure per group.
for i, ind in enumerate(scaled_df.index.unique()):
    subset_dr = scaled_df[scaled_df.index==ind].drop(columns=['cold_work'])
    subset_kjic = mean_kjic[mean_kjic.index==ind]
    pca = PCA(n_components=1, svd_solver='full')
    xs = pca.fit_transform(subset_dr)
    plt.figure()
    # x and y are passed by keyword (newer seaborn rejects them positionally)
    # and flattened to 1-D, which is what regplot expects.
    sns.regplot(x=xs[:, 0], y=np.ravel(subset_kjic), scatter=True, fit_reg=True,
                color=colors[i % len(colors)], label=ind, marker=markers[i % len(markers)])
    # Same fixed y range on every figure so the groups can be compared.
    plt.ylim(0,225)
    plt.xlabel('PC1')
    plt.ylabel('KJIC')
    plt.legend()


### Feature importance

In [None]:
component_labels = ["PC1","PC2","PC3","PC4"]

# One bar chart per distinct index label showing the share of variance
# captured by each of the first four principal components.
for material in scaled_df.index.unique():
    features = scaled_df[scaled_df.index == material].drop(columns=['cold_work'])
    pca = PCA(n_components=4, svd_solver='full').fit(features)
    plt.figure(figsize=(5, 5))
    plt.bar(component_labels, pca.explained_variance_ratio_, align='center', alpha=0.5, color="gray")
    plt.ylim(0,1)
    plt.title(material)
    plt.ylabel("Explained variance")

In [None]:
pc_labels = ["PC1","PC2","PC3","PC4"]

# Per group: weight the absolute PCA loadings by each component's explained
# variance ratio and draw them as grouped horizontal bars (one group per PC).
for ind in scaled_df.index.unique():
    subset_dr = scaled_df[scaled_df.index==ind].drop(columns=['cold_work'])
    pca = PCA(n_components=4, svd_solver='full')
    pca.fit(subset_dr)
    # Rows are components, columns are features.
    feature_importance = pd.DataFrame(
        pca.explained_variance_ratio_[:, np.newaxis] * np.abs(pca.components_),
        columns=subset_dr.columns, index=pc_labels)
    # Order the features by their contribution to PC1, largest first.
    plot_feat_imp = feature_importance.sort_values('PC1', axis=1, ascending=False)
    # Draw on an explicit Axes: calling DataFrame.plot() without `ax` after
    # plt.figure() opens a second figure and leaves the first one empty.
    fig, ax = plt.subplots(figsize=(6, 6))
    plot_feat_imp.plot(kind='barh', ax=ax, width=0.6,
                       color=sns.color_palette('PuBu_r', len(plot_feat_imp.columns), desat=0.9))
    ax.set_xlabel('Feature importance (explained variance ratio)', fontsize = 12)
    ax.set_xlim(0,1)
    ax.set_title(ind)
    # The y axis lists the components (the frame's index), not a variance value.
    ax.set_ylabel("Principal component")