In [114]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import warnings
# NOTE(review): with no category/module arguments this suppresses every
# warning for the rest of the kernel session, including deprecation and
# numerical warnings raised by the libraries imported above. Consider
# narrowing it to the specific category that is known to be noisy.
warnings.filterwarnings('ignore')

In [115]:
# Seed NumPy's legacy global generator so the synthetic data is reproducible.
np.random.seed(0)

# Draw n_samples points uniformly from the cube [0, 100)^3.
n_samples = 100
feature_names = ['Feature 1', 'Feature 2', 'Feature 3']
X = 100 * np.random.rand(n_samples, len(feature_names))

# Binary label (0 or 1) drawn at random, without reference to the features.
y = np.random.randint(0, 2, size=n_samples)

# Assemble features and label into a single frame.
df = pd.DataFrame(X, columns=feature_names).assign(Target=y)

In [116]:
# Interactive 3D view of the three raw features, coloured by the Target column.
axis_labels = {'Feature 1': 'X-axis', 'Feature 2': 'Y-axis', 'Feature 3': 'Z-axis'}
fig = px.scatter_3d(
    df,
    x='Feature 1',
    y='Feature 2',
    z='Feature 3',
    color='Target',
    title='3D Data Scatter Plot',
    labels=axis_labels,
)
fig.show()

In [117]:
# Step 1: Standardise the three feature columns with StandardScaler.
feature_cols = ['Feature 1', 'Feature 2', 'Feature 3']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[feature_cols])

# Keep a labelled copy of the scaled array for inspection.
X_scaled_df = pd.DataFrame(X_scaled, columns=feature_cols)

# Sanity check on the scaled columns.
# NOTE: DataFrame.std() defaults to the sample estimate (ddof=1), while
# StandardScaler divides by the population standard deviation (ddof=0).
# The printed values are therefore sqrt(n / (n - 1)) (about 1.005 for
# n = 100) rather than exactly 1; X_scaled_df.std(ddof=0) would give 1.
means_after = X_scaled_df.mean()
stds_after = X_scaled_df.std()

print("Means after scaling:\n", means_after)
print("\nStandard deviations after scaling:\n", stds_after)

Means after scaling:
 Feature 1    3.042011e-16
Feature 2   -3.197442e-16
Feature 3    2.220446e-16
dtype: float64

Standard deviations after scaling:
 Feature 1    1.005038
Feature 2    1.005038
Feature 3    1.005038
dtype: float64


In [118]:
# Step 2: Covariance matrix of the scaled features.
# rowvar=False -> each column is a variable and each row an observation.
# ddof=1 is np.cov's default (n - 1 normalisation), spelled out here because
# it is why the diagonal is n / (n - 1) instead of exactly 1.
cov_matrix = np.cov(X_scaled, rowvar=False, ddof=1)
cov_matrix

array([[ 1.01010101,  0.00847357,  0.15071309],
       [ 0.00847357,  1.01010101, -0.09051442],
       [ 0.15071309, -0.09051442,  1.01010101]])

In [119]:
# Step 3: Eigendecomposition of the symmetric covariance matrix.
# eigh returns the eigenvalues in ascending order; column i of the
# eigenvector matrix belongs to eigenvalue i.
eigen_decomp = np.linalg.eigh(cov_matrix)
eigenvalues, eigenvectors = eigen_decomp[0], eigen_decomp[1]

print("Eigen Values:", eigenvalues)
print("Eigen Vectors:\n", eigenvectors)

Eigen Values: [0.83047325 1.0175772  1.18225258]
Eigen Vectors:
 [[-0.60477096 -0.51389624  0.60841001]
 [ 0.38094623 -0.85755137 -0.34566692]
 [ 0.69937977  0.02272218  0.714389  ]]


In [120]:
# Reorder the eigenpairs so the largest eigenvalue comes first.
# The same index permutation is applied to the eigenvalues and to the
# eigenvector columns so each vector stays paired with its eigenvalue.
sorted_indices = eigenvalues.argsort()[::-1]
sorted_eigenvalues, sorted_eigenvectors = (
    eigenvalues[sorted_indices],
    eigenvectors[:, sorted_indices],
)

print("Sorted Eigen Values:", sorted_eigenvalues)
print("Sorted Eigen Vectors:\n", sorted_eigenvectors)

Sorted Eigen Values: [1.18225258 1.0175772  0.83047325]
Sorted Eigen Vectors:
 [[ 0.60841001 -0.51389624 -0.60477096]
 [-0.34566692 -0.85755137  0.38094623]
 [ 0.714389    0.02272218  0.69937977]]


In [121]:
# Step 4: Keep the leading eigenvectors as the principal components.
# The columns are already ordered by descending eigenvalue, so the first
# n_components columns are the directions with the largest eigenvalues.
n_components = 2
top_components = sorted_eigenvectors[:, :n_components]
top_components

array([[ 0.60841001, -0.51389624],
       [-0.34566692, -0.85755137],
       [ 0.714389  ,  0.02272218]])

In [122]:
# Step 5: Project the scaled samples onto the selected components
# (matrix product of the scaled data with the component matrix).
X_pca = X_scaled @ top_components

In [None]:
# 2D view of the projected data, coloured by the original Target column.
pc_columns = ['Principal Component 1', 'Principal Component 2']
df_pca = pd.DataFrame(X_pca, columns=pc_columns).assign(Target=df['Target'])

pc_labels = {'Principal Component 1': 'PC1', 'Principal Component 2': 'PC2'}
fig_2d = px.scatter(
    df_pca,
    x='Principal Component 1',
    y='Principal Component 2',
    color='Target',
    title='2D Data After PCA',
    labels=pc_labels,
)
fig_2d.show()