# PCA tutorial

### Import the required packages

In [None]:
import pandas as pds
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import plotly.express as px

### Load the example dataset using pandas

In [None]:
# Read the example dataset from its CSV file into a pandas DataFrame
lcMSData = pds.read_csv(filepath_or_buffer='./Data/Dementia_RPOS_XCMS.csv')

In [None]:
def _parseFeatureName(featureName):
    """Return (retention time, m/z) as floats parsed from a feature column name.

    The name is split on '_': the first token is converted as the retention
    time; the second token, minus its last three characters, is converted as
    the m/z value.
    """
    tokens = featureName.split('_')
    return float(tokens[0]), float(tokens[1][:-3])


# Columns from position 11 onwards are treated as features; read Rt and m/z from their names
featuresData = pds.DataFrame(
    [_parseFeatureName(name) for name in lcMSData.columns[11:]],
    columns=['Rt', 'mz'],
)
# Rescale the retention time by 1/60 (presumably seconds -> minutes; confirm the source units)
featuresData['Rt'] = featuresData['Rt'].div(60)

# Column-wise median of the feature block: one value per feature
medianSpectrum = np.median(lcMSData.iloc[:, 11:].values, axis=0)

# Use log(median + 1) as the intensity value for the scatterplot
featuresData['Median'] = np.log(1 + medianSpectrum)
# Alternative: use the untransformed median instead
# featuresData['Median'] = medianSpectrum

## PCA

We will start by fitting a PCA model with 4 components to the log-transformed data matrix.

In [None]:
# Feature block of the dataset: every column from position 11 onwards
XDataMatrix = lcMSData.iloc[:, 11:]

# log(x + 1) transform; the +1 offset keeps zero values finite
logXDataMatrix = np.log(XDataMatrix.add(1))

In [None]:
# Two-step pipeline: standardize each feature ('uv'), then project onto 4 principal components ('PCA')
pcaModel = Pipeline(
    steps=[
        ('uv', StandardScaler()),
        ('PCA', PCA(n_components=4)),
    ]
)
# Alternative without the scaling step:
# pcaModel = Pipeline(steps=[('PCA', PCA(n_components=4))])

# Fit the scaler and the PCA on the log-transformed data
pcaModel.fit(logXDataMatrix)

In [None]:
# Loadings: component vectors stored on the fitted PCA step (one row per component)
P_loadings = pcaModel.named_steps['PCA'].components_
# Scores: the data passed through the whole pipeline (scaling, then projection)
T_scores = pcaModel.transform(logXDataMatrix)

# Combine the study variables with one score column per component (PC1, PC2, ...)
pcaResultsDFrame = pds.concat(
    [
        lcMSData[['Subject ID', 'Sample ID', 'Age', 'Gender', 'Run Order', 'Acquisition batch']],
        pds.DataFrame(T_scores, columns=[f'PC{k}' for k in range(1, T_scores.shape[1] + 1)]),
    ],
    axis=1,
)

In [None]:
# Scores plot: PC1 against PC2, one point per row of the results frame, colored by Gender
fig = px.scatter(
    pcaResultsDFrame,
    x="PC1",
    y="PC2",
    color="Gender",
    template='plotly_white',
    render_mode='webgl',
)
fig.show()

### Plot model loadings _(p)_

In [None]:
# One row per feature: its Rt / mz / Median columns followed by its loading on each component.
# P_loadings is transposed so that components become the columns PC1, PC2, ...
LoadingsPlotFrame = pds.concat(
    [
        featuresData,
        pds.DataFrame(P_loadings.T, columns=[f'PC{k}' for k in range(1, P_loadings.shape[0] + 1)]),
    ],
    axis=1,
)

In [None]:
# Loadings map: each feature placed at (Rt, mz) and colored by its PC1 loading.
# The diverging scale is centered on 0 so the sign of the loading is visible.
fig = px.scatter(
    LoadingsPlotFrame,
    x="Rt",
    y="mz",
    color="PC1",
    color_continuous_scale='RdBu',
    color_continuous_midpoint=0,
    labels={"Rt": "Retention time (min)", "mz": "m/z"},
    template='plotly_white',
    render_mode='webgl',
)

fig.show()

### Choosing the number of components

In [None]:
# Same pipeline as before (standardize, then PCA), now keeping 10 components
pcaModel = Pipeline(
    steps=[
        ('uv', StandardScaler()),
        ('PCA', PCA(n_components=10)),
    ]
)

# Refit on the log-transformed data
pcaModel.fit(logXDataMatrix)

A scree plot is a plot of the variance explained by each component. 

For exploratory data analysis, the choice of the number of components is not so critical.

In [None]:
# Assemble the variance explained info in a dataframe
ScreeDataFrame = pds.DataFrame(np.c_[pcaModel['PCA'].explained_variance_ratio_, 
                                     pcaModel['PCA'].explained_variance_ratio_.cumsum(), 
                                     np.arange(1, 11)], columns=['VarianceExplained', 'CumulativeVarianceExplained', 'Number of PCs'])

In [None]:
# Scree plot: variance explained by each individual component
fig = px.bar(
    ScreeDataFrame,
    x='Number of PCs',
    y='VarianceExplained',
    template='plotly_white',
)
fig.show()

It's also common to plot the cumulative variance profile.

In [None]:
# Cumulative profile: total variance explained by the first k components
fig = px.bar(
    ScreeDataFrame,
    x='Number of PCs',
    y='CumulativeVarianceExplained',
    template='plotly_white',
)
fig.show()