# PCA - Principal Component Analysis

In [None]:
# Importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot styling: white grid background, notebook font scaling,
# and a 14x10 default figure size
sns.set(
    context='notebook',
    style='whitegrid',
    rc={'figure.figsize': (14, 10)},
)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Read the Palmer Penguins table; the URL points at a specific commit hash,
# so the file contents are pinned
penguins = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/allisonhorst/palmerpenguins/c19a904462482430170bfe2c718775ddb7dbb885/inst/extdata/penguins.csv"
)

# Preview the raw frame
penguins

In [None]:
# Pairwise scatter plots of the variables, coloured by species.
# The `year` column is left out of the grid.
sns.pairplot(
    data=penguins.drop(columns='year'),
    hue='species',
)

In [None]:
# Discard every row that contains at least one missing value
penguins = penguins.dropna(axis=0, how='any')

# Number of remaining penguins per species
penguins['species'].value_counts()

In [None]:
# Keep only the four numerical measurements as PCA inputs
data = penguins.loc[
    :,
    ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"],
]

# Standardize each feature so that no measurement dominates purely because of its units
scaled_penguin_data = StandardScaler().fit_transform(X=data)

# Shape of the standardized array
scaled_penguin_data.shape

In [None]:
# Build a PCA with default settings and fit it on the standardized penguin data;
# fit() returns the estimator itself, which is displayed as the cell output
pca = PCA().fit(scaled_penguin_data)
pca

In [None]:
# Display the fitted component matrix (its transpose is what the standardized
# data is multiplied by further down to reproduce pca.transform)
pca.components_

In [None]:
# Orthonormality check: multiply the component matrix by its own transpose and
# round the result; an identity matrix is expected for orthonormal components
np.round(np.matmul(pca.components_, pca.components_.T))


In [None]:
# Display the singular values stored on the fitted PCA
pca.singular_values_

In [None]:
# Bar chart of the proportion of variance explained by each component
sns.barplot(
    x=[f'Component {idx}' for idx in range(data.shape[1])],
    y=pca.explained_variance_ratio_,
)
plt.title('Explained variance of the components', fontsize=18)
plt.xlabel('Components', fontsize=15)
plt.ylabel('Proportion of Explained Variance', fontsize=15)

In [None]:
# Project the standardized measurements onto the fitted components
transformed_penguins_data = pca.transform(X=scaled_penguin_data)

# First five projected samples
transformed_penguins_data[0:5]

In [None]:
# Same projection done by hand: multiply the first five standardized rows by the
# transposed component matrix and compare with the pca.transform output above
np.dot(scaled_penguin_data[:5], pca.components_.T)

In [None]:
# Creating a dataframe for plotting: one column per component plus the species label
transformed_penguins_df = pd.DataFrame(
    data=transformed_penguins_data,
    columns=[f'Component_{i}' for i in range(data.shape[1])],
)

# Attach the labels by POSITION, not by index. `penguins` still carries its original
# row labels after dropna(), whereas this frame has a fresh 0..n-1 index; assigning
# the Series directly would align on index and produce shifted / NaN species labels.
transformed_penguins_df['species'] = penguins.species.to_numpy()

# Random peek at the result
transformed_penguins_df.sample(5)

In [None]:
# PCA score plot: every penguin placed in the plane of the first two components

fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    x='Component_0',
    y='Component_1',
    hue='species',
    data=transformed_penguins_df,
    s=200,
    ax=ax,  # draw on the axes created just above
)

# Axis labels carry the percentage of variance explained by each component
ax.set_xlabel(f"PC1 - {pca.explained_variance_ratio_[0]*100:.2f}%")
ax.set_ylabel(f"PC2 - {pca.explained_variance_ratio_[1]*100:.2f}%")
ax.set_title("PCA Score Plot")

In [None]:
# PCA biplot: draw the feature loadings as arrows on the existing score-plot axes

# Each arrow starts at the origin and ends at the feature's (PC1, PC2) loading
loadings = pca.components_
pc1, pc2 = loadings[0], loadings[1]

for feature, load_x, load_y in zip(data.columns, pc1, pc2):
    ax.arrow(
        x=0, y=0, dx=load_x, dy=load_y,
        color='r', capstyle="projecting",
        head_width=0.03, head_length=0.03,
    )
    # Label the arrow tip with the feature name
    ax.text(load_x, load_y, feature)

# Re-display the figure now that the arrows have been added
fig

# Microarray Data

In [None]:
# Read the Khan training CSV, using its first column as the row index
df = pd.read_csv(
    "http://bioinf.ucd.ie/people/aedin/R/full_datasets/khan_train.csv",
    index_col=0,
)

In [None]:
# Transpose so the CSV's columns become rows, then move their labels out of the
# index into an ordinary first column
df = df.transpose().reset_index()
df

In [None]:
# Name the first column "Tumor" and number the remaining columns Gene_1..Gene_k.
# k is read from the frame itself rather than hard-coded, so the assignment cannot
# fail with a length mismatch if the file has a different number of columns.
df.columns = ["Tumor"] + [f"Gene_{i}" for i in range(1, df.shape[1])]

In [None]:
# Keep only the part of each label that precedes the first "."
df["Tumor"] = df["Tumor"].map(lambda label: label.split(".")[0])
df

In [None]:
# Standardize every column except the first one (the "Tumor" label)
scale_df = StandardScaler().fit_transform(df.iloc[:, 1:])
scale_df

In [None]:
# Reduce the standardized expression matrix to its first 50 principal components.
# random_state is fixed because, for a wide matrix with a truncated n_components,
# svd_solver='auto' can select scikit-learn's randomized SVD, whose output would
# otherwise differ slightly from run to run.
pca_microarray = PCA(n_components=50, random_state=0)
transformed_data = pca_microarray.fit_transform(scale_df)

In [None]:
# Bar chart of the proportion of variance explained by each retained component.
# The tick labels are generated from the fitted model's own ratio vector so they
# always match the number of components that were actually kept.
sns.barplot(
    x=[f'{i}' for i in range(len(pca_microarray.explained_variance_ratio_))],
    y=pca_microarray.explained_variance_ratio_,
)
plt.title('Explained variance of the components', size=18)
plt.xlabel('Components', size=15)
plt.ylabel('Proportion of Explained Variance', size=15)

In [None]:
# Dataframe for plotting: one column per retained component plus the tumor label.
# The column count comes from the projected array instead of a hard-coded 50, so
# this cell keeps working if n_components is changed in the PCA cell.
transformed_microarray_df = pd.DataFrame(
    data=transformed_data,
    columns=[f'Component_{i}' for i in range(transformed_data.shape[1])],
)

# Labels are attached by position so the result does not depend on index alignment
transformed_microarray_df['Tumor'] = df['Tumor'].to_numpy()

# Random peek at the result
transformed_microarray_df.sample(5)

In [None]:
# PCA score plot: every sample placed in the plane of the first two components

fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    x='Component_0',
    y='Component_1',
    hue='Tumor',
    data=transformed_microarray_df,
    s=200,
    ax=ax,  # draw on the axes created just above
)

# Axis labels carry the percentage of variance explained by each component
ax.set_xlabel(f"PC1 - {pca_microarray.explained_variance_ratio_[0]*100:.2f}%")
ax.set_ylabel(f"PC2 - {pca_microarray.explained_variance_ratio_[1]*100:.2f}%")
ax.set_title("PCA Score Plot")