<h2 align='center'>Principal Component Analysis Tutorial</h2>

In [22]:
import pandas as pd

In [23]:
# Read the dataset from a CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer='digits.csv')

In [24]:
from sklearn.preprocessing import StandardScaler

# Build the feature matrix in this cell so that scaling does not depend on a
# later cell having been executed first (on a fresh kernel `X` would otherwise
# be undefined here and raise NameError).
X = df.drop('pixel_7_7', axis=1)

# Standardize the features: StandardScaler is fitted on X and applied to it in
# one step, returning a NumPy array.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[ 0.        , -0.33501649, -0.04308102, ..., -0.36677122,
        -1.14664746, -0.5056698 ],
       [ 0.        , -0.33501649, -1.09493684, ...,  0.84963214,
         0.54856067, -0.5056698 ],
       [ 0.        , -0.33501649, -1.09493684, ..., -0.16403733,
         1.56568555,  1.6951369 ],
       ...,
       [ 0.        , -0.33501649, -0.88456568, ...,  0.24143046,
        -0.12952258, -0.5056698 ],
       [ 0.        , -0.33501649, -0.67419451, ...,  0.84963214,
         0.8876023 , -0.5056698 ],
       [ 0.        , -0.33501649,  1.00877481, ...,  0.44416435,
         0.8876023 , -0.26113572]])

In [25]:
# NOTE(review): `y` is the 'pixel_7_7' column, yet later cells label it 'Digit'.
# Presumably a dedicated digit-label column was intended — confirm against the CSV.
y = df.loc[:, 'pixel_7_7']

# Feature matrix: every column of df except 'pixel_7_7'.
X = df.drop(columns='pixel_7_7')

<h3>Use PCA to reduce dimensions</h3>

In [26]:
# Display the feature matrix (the frame without the 'pixel_7_7' column) as the cell output.
X

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_5,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,12.0,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,6.0,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,11.0,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,14.0,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,16.0,4.0,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,16.0,1.0,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,16.0,0.0,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,16.0,2.0,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0


In [27]:
from sklearn.decomposition import PCA
import plotly.express as px

# Project the standardized features onto the first two principal components.
# random_state is pinned so the projection is reproducible: depending on the
# scikit-learn version, the default svd_solver='auto' may choose a randomized
# solver, whose output otherwise varies between runs.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Collect the two component scores in a DataFrame for plotting.
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
# `y` is attached by index alignment and used only to colour the points.
# NOTE(review): `y` comes from the 'pixel_7_7' column — confirm that this is
# really the digit label the 'Digit' name suggests.
pca_df['Digit'] = y

# Interactive 2D scatter of the projected samples, coloured by `Digit`.
fig = px.scatter(
    pca_df, x='PC1', y='PC2',
    color='Digit',
    title='PCA: First Two Principal Components',
    labels={'PC1': 'First Principal Component', 'PC2': 'Second Principal Component'},
    color_continuous_scale='plasma'
)

fig.show()


<h4>Reduce to three principal components and visualize them in 3D</h4>

In [28]:
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA

# Project the standardized features onto the first three principal components.
# random_state is pinned so the projection is reproducible: depending on the
# scikit-learn version, the default svd_solver='auto' may choose a randomized
# solver, whose output otherwise varies between runs.
pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Collect the three component scores in a DataFrame for plotting.
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2', 'PC3'])
# `y` is attached by index alignment and used only to colour the points.
# NOTE(review): `y` comes from the 'pixel_7_7' column — confirm that this is
# really the digit label the 'Digit' name suggests.
pca_df['Digit'] = y

# Interactive 3D scatter of the projected samples, coloured by `Digit`.
fig = px.scatter_3d(
    pca_df, x='PC1', y='PC2', z='PC3',
    color='Digit',
    title='PCA: First Three Principal Components',
    labels={'PC1': 'First Principal Component', 'PC2': 'Second Principal Component', 'PC3': 'Third Principal Component'},
    color_continuous_scale='plasma'
)

# Render the figure as the cell output.
fig.show()
