In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

In [None]:
# Load the breast cancer dataset via scikit-learn's loader.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
# List the keys available on the loaded object (later cells index it by key).
cancer.keys()


In [None]:
# Print the text stored under the dataset's 'DESCR' key.
print(cancer['DESCR'])


# PCA Visualization

As we've noticed before, it is difficult to visualize high-dimensional data. We can use PCA to find the first two principal components and visualize the data in this new, two-dimensional space with a single scatter plot. Before we do this, though, we need to scale our data so that each feature has unit variance.



In [None]:
# Build the feature table: one column per entry in cancer['feature_names'].
df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
# Class labels, kept separately from the features.
Y = cancer.target
# The preview must be the cell's last expression; anywhere else Jupyter
# evaluates it but never displays the result.
df.head()

In [None]:
from sklearn.preprocessing import StandardScaler


In [None]:
# Rescale all features onto the same scale.
# fit() learns the scaling from df and hands back the fitted scaler.
scaler = StandardScaler().fit(df)
# transform() applies the learned scaling to every value in df.
scaled_data = scaler.transform(df)


In [None]:
# Alternative: scikit-learn's scale() function scales the data in a single call.
# NOTE: this rebinds scaled_data, replacing any value assigned to it earlier.
from sklearn.preprocessing import scale # function-style data scaling
scaled_data = scale(df)


PCA in scikit-learn follows a process very similar to the other preprocessing tools that come with scikit-learn. We instantiate a PCA object, find the principal components using the fit method, then apply the rotation and dimensionality reduction by calling transform().

We can also specify how many components we want to keep when creating the PCA object; here it is 4.

In [None]:
from sklearn.decomposition import PCA
# Keep only the first 4 principal components.
pca = PCA(n_components=4)


In [None]:
# PCA is an unsupervised technique, so only the features are passed -- no labels are needed.
# fit() finds the principal components of the scaled data.
pca.fit(scaled_data)


In [None]:
# Apply the rotation and dimensionality reduction learned by the fitted PCA.
x_pca = pca.transform(scaled_data)
# Let's check the shape of our data before and after applying PCA.

In [None]:
# Shape of the data before PCA.
scaled_data.shape


In [None]:
# Shape of the data after PCA.
x_pca.shape


In [None]:
# Name one column per retained component. Deriving the labels from x_pca's
# width keeps this cell working if n_components is changed; a hard-coded
# list of four names would raise a shape-mismatch error.
score_columns = ['PC{}'.format(i) for i in range(1, x_pca.shape[1] + 1)]
scores_df = pd.DataFrame(x_pca, columns=score_columns)
# Preview the first rows instead of dumping the whole table.
scores_df.head()

In [None]:
# Scatter the samples in the plane of the first two principal components,
# coloured by the dataset's target values.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 10))
scatter_ax.scatter(x_pca[:, 0], x_pca[:, 1], c=cancer['target'], cmap='plasma')
scatter_ax.set_xlabel('First principal component')
scatter_ax.set_ylabel('Second Principal Component')

So far, we have done the following steps:

1- Standard scaling


2- Apply PCA

3- Convert our data from 30 dimensions to 4 dimensions.

Now we can apply a decision tree, k-means, or another algorithm to our new dataset.
What we did is part of data preprocessing.

# Interpreting the components

Unfortunately, this great power of dimensionality reduction comes at a cost: it is no longer easy to understand what these components represent.

The components correspond to combinations of the original features. The components themselves are stored in the `components_` attribute of the fitted PCA object.

In this NumPy array, each row represents a principal component, and each column relates back to one of the original features. We can visualize this relationship with a heatmap:



In [None]:
# Tabulate the fitted components: one row per principal component,
# one column per original feature (labelled with the feature names).
df_comp = pd.DataFrame(pca.components_,columns=cancer['feature_names'])


In [None]:

# Heatmap of the component table: rows are principal components,
# columns are the original features.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df_comp, cmap='plasma', ax=heatmap_ax)

This heatmap and its color bar show the weight of each original feature in each principal component, which loosely reflects how strongly the feature and the component are correlated.



Hopefully this information is useful to you when dealing with high dimensional data!



In [None]:

import plotly.express as px

In [None]:
#let s see the explained variance
#it will tell us what is the contribution of each PC, ie, how much each pc contributes to the overall variance of the data set
# Explained variance ratio: how much each principal component contributes
# to the overall variance of the data set.
explained_variance = pca.explained_variance_ratio_
explained_variance

In [None]:
# Prepend a 0 so the values start from 0.
# Rebuild from the fitted PCA attribute rather than from explained_variance
# itself: re-running this cell then yields the same array, instead of
# inserting yet another leading 0 on every execution.
explained_variance = np.insert(pca.explained_variance_ratio_, 0, 0)


In [None]:
# Cumulative variance at each point of the graph.
# Accumulate first and round once at the end, so that per-term rounding
# errors do not add up in the running total.
cumulative_variance = np.round(np.cumsum(explained_variance), decimals=3)


In [None]:
# One label per entry of explained_variance: a blank for the leading 0,
# then PC1..PCn. Deriving the labels from the array length keeps the three
# frames aligned if the number of components changes.
pc_labels = [''] + ['PC{}'.format(i) for i in range(1, len(explained_variance))]
pc_df = pd.DataFrame(pc_labels, columns=['PC'])
explained_variance_df = pd.DataFrame(explained_variance, columns=['Explained Variance'])
cumulative_variance_df = pd.DataFrame(cumulative_variance, columns=['Cumulative Variance'])

In [None]:
# Place the label, explained-variance and cumulative-variance frames side by side.
variance_frames = [pc_df, explained_variance_df, cumulative_variance_df]
df_explained_variance = pd.concat(variance_frames, axis='columns')
df_explained_variance

In [None]:
# Bar chart of the explained variance per component, with each bar
# annotated by its own value.
fig = px.bar(
    data_frame=df_explained_variance,
    x='PC',
    y='Explained Variance',
    text='Explained Variance',
    width=800,
)

# Show the annotations with three decimals, placed outside the bars.
label_style = dict(texttemplate='%{text:.3f}', textposition='outside')
fig.update_traces(**label_style)
fig.show()