## **Description** 

This notebook runs PCA on the calibration window of the standardized waveforms to obtain a 2D visualization. The projection revealed a lack of separation between the unsuccessful and ECD waveforms. The process was repeated for the un-standardized and normalized waveforms, as well as for the other windows, all yielding the same result. 

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import altair as alt

In [None]:
# Read the two windowed time-series tables used throughout the notebook.
# NOTE(review): both paths are absolute and machine-specific; anyone else
# running this notebook has to point them at their own copy of the data.
un_csv = '/Users/saral/OneDrive - UBC/MDS/Capstone/Code + Data/Data/Windowed Time Series/un_cal.csv'
ecd_csv = '/Users/saral/OneDrive - UBC/MDS/Capstone/Code + Data/Data/Windowed Time Series/ecd_cal.csv'
un_ts = pd.read_csv(un_csv)
ecd_ts = pd.read_csv(ecd_csv)

In [None]:
# Keep the TestId column of each table aside; the combined frame built for
# PCA drops it.
pcids, unids = ecd_ts['TestId'], un_ts['TestId']

In [None]:
# Stack the `un_ts` rows on top of the `ecd_ts` rows and drop the TestId
# identifier so it is not treated as a feature by the scaler / PCA.
# ignore_index=True gives the combined frame a clean 0..n-1 index right away;
# without it the row labels of the two inputs are kept and collide.
everything = pd.concat([un_ts, ecd_ts], ignore_index=True).drop(columns='TestId')

In [None]:
# Colour lookup for the two class labels used below
# ("un" = rows that came from un_ts, "pc" = rows that came from ecd_ts).
cmap = {"pc": "red", "un": "blue"}

# One class label per row, in the same order the tables are stacked into
# `everything`: all un_ts rows first, then all ecd_ts rows.
# The Series is built with a default 0..n-1 index so it lines up positionally
# with `everything` and with any frame derived from it.
labs = pd.Series(['un'] * len(un_ts) + ['pc'] * len(ecd_ts))

# Make sure `everything` carries the same 0..n-1 index as the labels.
# Rebinding (instead of inplace=True) keeps this cell safe to re-run.
everything = everything.reset_index(drop=True)

In [None]:
# Standardise every column of `everything`, then fit a two-component PCA on
# the scaled matrix.
scaler = StandardScaler()
x = scaler.fit_transform(everything)
pca = PCA(n_components=2).fit(x)
pca

In [None]:
# Share of the total variance captured by each fitted component.
variance_ratios = pca.explained_variance_ratio_
print(variance_ratios)

In [None]:
# Refit PCA, this time keeping as many components as are needed to explain
# 95% of the variance (a float n_components between 0 and 1 is a variance
# target in scikit-learn). PCA is already imported in the imports cell at the
# top of the notebook, so it is not imported again here.
# Note that this rebinds `pca`, replacing the two-component model fitted above.
pca = PCA(n_components=0.95)

# Standardise every column before projecting, then collect the component
# scores in a frame with one "Component k" column per retained component.
principalComponents = pca.fit_transform(StandardScaler().fit_transform(everything))
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=[f'Component {i + 1}' for i in range(pca.n_components_)],
)

In [None]:
# Attach the class label of each row to its component scores; the labels are
# re-indexed 0..n-1 first so they align with principalDf's rows.
principalDf = principalDf.assign(label=labs.reset_index(drop=True))
principalDf

In [None]:
# Scatter of the first two principal-component scores, one colour per class.
finalDf = principalDf
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

targets = ['un', 'pc']
colors = ['b', 'r']
for target, color in zip(targets, colors):
    # Rows belonging to the current class only.
    class_rows = finalDf[finalDf['label'] == target]
    ax.scatter(
        class_rows['Component 1'],
        class_rows['Component 2'],
        c=color,
        s=50,
        label=target,
    )

# Legend entries come from the per-class `label=` passed to scatter above.
ax.legend()
ax.grid()

In [None]:
# Loadings matrix: one row per input column of `everything`, one column per
# principal component. Each column holds an eigenvector, i.e. the coefficients
# of the linear combination that defines that component.
component_labels = [f"ecd_{i}" for i in range(1, pca.components_.shape[0] + 1)]
rotation = pd.DataFrame(
    pca.components_.T,
    index=everything.columns,
    columns=component_labels,
)

# The 30 input columns with the largest absolute loading on the second component.
rotation[['ecd_2']].sort_values(by='ecd_2', key=abs, ascending=False).head(30)

In [None]:
# Shared x-axis for both loading curves: one point per row of `rotation`,
# starting at -15 and advancing 0.2 per sample (for 60 rows this is the same
# grid as np.arange(-15, -3, 0.2)).
# Deriving the axis from len(rotation) keeps x and y the same length and
# avoids the off-by-one risk of np.arange with a non-integer step.
# NOTE(review): presumably -15 and 0.2 are the start and sampling step of the
# calibration window - confirm the values and units.
window_start, window_step = -15, 0.2
window_axis = window_start + window_step * np.arange(len(rotation))

plt.plot(window_axis, rotation['ecd_1'], label='Eigenvector 1')
plt.plot(window_axis, rotation['ecd_2'], label='Eigenvector 2')
plt.legend()

# Run PCA on the transposed data to get components that capture the majority of the variance at each time point

In [None]:
# Transposed PCA on the ECD table: skip the first column (assumed to be
# TestId - TODO confirm column order) and flip the rest so the original
# columns become the rows fed to PCA.
ecd_transposed = ecd_ts.iloc[:, 1:].T

# Standardise each column of the transposed matrix, then fit a full PCA
# (no limit on the number of components) and keep the scores.
x = StandardScaler().fit_transform(ecd_transposed)
pca = PCA()
PrincipalComponents = pca.fit_transform(x)

In [None]:
# Wrap the component scores in a frame with columns "Component 1", "Component 2", ...
component_names = [f'Component {k}' for k in range(1, pca.n_components_ + 1)]
pcadf = pd.DataFrame(PrincipalComponents, columns=component_names)

In [None]:
# One small panel per component, side by side, leaving out the last component.
# NOTE(review): presumably the last component is skipped because it carries
# essentially no variance - confirm.
n_panels = len(pcadf.columns) - 1
fig, axs = plt.subplots(1, n_panels, figsize=(n_panels * 2, 2))

# plt.subplots returns a bare Axes (not an array) when there is a single
# panel; normalise to a 1-D array so the indexing below always works.
axs = np.atleast_1d(axs)

for i in range(n_panels):
    axs[i].plot(pcadf.iloc[:, i])

In [None]:
# Transposed PCA on the unsuccessful table: skip the first column (assumed to
# be TestId - TODO confirm column order) and flip the rest so the original
# columns become the rows fed to PCA.
x = un_ts.iloc[:, 1:].transpose()

# Standardise before PCA, exactly as is done for the ECD table, so that the
# component panels of the two groups are computed the same way and can be
# compared.
x = StandardScaler().fit_transform(x)

# Full PCA (no limit on the number of components); keep the scores.
pca = PCA()
PrincipalComponents = pca.fit_transform(x)

In [None]:
# Wrap the component scores in a frame with columns "Component 1", "Component 2", ...
component_names = [f'Component {k}' for k in range(1, pca.n_components_ + 1)]
pcadf = pd.DataFrame(PrincipalComponents, columns=component_names)

In [None]:
# One small panel per component, side by side, leaving out the last component.
# NOTE(review): presumably the last component is skipped because it carries
# essentially no variance - confirm.
n_panels = len(pcadf.columns) - 1
fig, axs = plt.subplots(1, n_panels, figsize=(n_panels * 2, 2))

# plt.subplots returns a bare Axes (not an array) when there is a single
# panel; normalise to a 1-D array so the indexing below always works.
axs = np.atleast_1d(axs)

for i in range(n_panels):
    axs[i].plot(pcadf.iloc[:, i])