In [2]:
import pandas as pd
from google.colab import drive

# Make Google Drive reachable from the Colab runtime under /content/drive.
drive.mount('/content/drive')

# The dataset is expected as 'USArrests.csv' in the Drive's 'My Drive' folder.
data_path = '/content/drive/My Drive/USArrests.csv'
df = pd.read_csv(data_path)

# Preview the first rows to confirm the file was read as expected.
print(df.head())


Mounted at /content/drive
   Unnamed: 0  Murder  Assault  UrbanPop  Rape
0     Alabama    13.2      236        58  21.2
1      Alaska    10.0      263        48  44.5
2     Arizona     8.1      294        80  31.0
3    Arkansas     8.8      190        50  19.5
4  California     9.0      276        91  40.6


(a) Show the first through fourth principal component loading vectors using the `PCA()` function

In [4]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Feature matrix: every column after the first one
# (the first column is assumed not to be a feature).
X = df.iloc[:, 1:]

# Standardize the features before running PCA.
scaler = StandardScaler()
x = scaler.fit_transform(X)

# Fit a PCA that keeps the first 4 principal components and project the data.
pca = PCA(n_components=4)
principalComponents = pca.fit_transform(x)

# Each row of components_ is the loading vector of one principal component.
loading_vectors = pca.components_

# Report the loading vectors for components 1 through 4, one per line.
print("First to Fourth Principal Component Loading Vectors:")
print("\n".join(
    f"Principal Component {i+1}: {loading_vectors[i]}" for i in range(4)
))



First to Fourth Principal Component Loading Vectors:
Principal Component 1: [0.53589947 0.58318363 0.27819087 0.54343209]
Principal Component 2: [-0.41818087 -0.1879856   0.87280619  0.16731864]
Principal Component 3: [-0.34123273 -0.26814843 -0.37801579  0.81777791]
Principal Component 4: [-0.6492278   0.74340748 -0.13387773 -0.08902432]


(b) Use the `np.linalg.eig()` function to find the first through fourth principal component loading vectors

In [5]:
import numpy as np

# Covariance matrix of the standardized data; rowvar=False treats each
# column of x as one variable.
covariance_matrix = np.cov(x, rowvar=False)

# Eigen-decomposition: eigenvectors are returned as the COLUMNS of the matrix.
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)

# np.linalg.eig does not order its results, so sort by eigenvalue, largest
# first, and apply the same ordering to the eigenvector columns.
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues, eigenvectors = (
    eigenvalues[sorted_indices],
    eigenvectors[:, sorted_indices],
)

# Report the loading vectors (eigenvector columns) for components 1 through 4.
print("\nFirst to Fourth Principal Component Loading Vectors (using np.linalg.eig()):")
print("\n".join(
    f"Principal Component {i+1}: {eigenvectors[:, i]}" for i in range(4)
))



First to Fourth Principal Component Loading Vectors (using np.linalg.eig()):
Principal Component 1: [0.53589947 0.58318363 0.27819087 0.54343209]
Principal Component 2: [ 0.41818087  0.1879856  -0.87280619 -0.16731864]
Principal Component 3: [-0.34123273 -0.26814843 -0.37801579  0.81777791]
Principal Component 4: [ 0.6492278  -0.74340748  0.13387773  0.08902432]


(c) Use the `np.linalg.svd()` function to find the first through fourth principal component loading vectors

In [6]:
# Singular value decomposition of the standardized data matrix.
# np.linalg.svd returns the right singular vectors already transposed, so the
# third return value holds one loading vector per ROW.
U, S, loading_vectors_svd = np.linalg.svd(x)

# Keep the original name V available as an alias of the same array.
V = loading_vectors_svd

# Report the loading vectors (rows) for components 1 through 4.
print("\nFirst to Fourth Principal Component Loading Vectors (using np.linalg.svd()):")
print("\n".join(
    f"Principal Component {i+1}: {loading_vectors_svd[i]}" for i in range(4)
))



First to Fourth Principal Component Loading Vectors (using np.linalg.svd()):
Principal Component 1: [-0.53589947 -0.58318363 -0.27819087 -0.54343209]
Principal Component 2: [-0.41818087 -0.1879856   0.87280619  0.16731864]
Principal Component 3: [ 0.34123273  0.26814843  0.37801579 -0.81777791]
Principal Component 4: [ 0.6492278  -0.74340748  0.13387773  0.08902432]


(d) Are those from (a), (b), and (c) exactly the same? Why or why not?

In [8]:
# Part (d): print the question being answered before comparing the loading
# vectors obtained in (a), (b), and (c). The "\n" at both ends of the string
# puts a blank line before and after the heading in the cell output.
print("\nAre the loading vectors from (a), (b), and (c) exactly the same?\n")

def compare_loading_vectors(loading_vectors_a, loading_vectors_b, tolerance=1e-6, ignore_sign=False):
  """Check whether two sets of loading vectors agree, vector by vector.

  Both inputs are read as 2-D arrays with one loading vector per row. All
  rows are compared, however many there are.

  Args:
    loading_vectors_a: Array-like of loading vectors, one per row.
    loading_vectors_b: Array-like of loading vectors to compare against,
      laid out the same way.
    tolerance: Absolute tolerance, forwarded to ``np.allclose`` as ``atol``
      (``np.allclose``'s default relative tolerance still applies).
    ignore_sign: When True, a vector and its negation are treated as equal.
      Each row of ``loading_vectors_b`` is flipped to point the same way as
      the matching row of ``loading_vectors_a`` before comparing. Intended
      for real-valued vectors. Defaults to False, i.e. a strict comparison.

  Returns:
    bool: True if the two sets have the same shape and every pair of
    corresponding vectors is element-wise close; False otherwise.
  """
  a = np.atleast_2d(loading_vectors_a)
  b = np.atleast_2d(loading_vectors_b)

  # Sets with a different number of vectors, or vectors of different length,
  # cannot be the same; report that instead of failing on indexing/broadcast.
  if a.shape != b.shape:
    return False

  if ignore_sign:
    # A negative dot product means the two vectors point in opposite
    # directions along the same axis, so negate that row of b.
    opposite = np.sum(a * b, axis=1, keepdims=True) < 0
    b = np.where(opposite, -b, b)

  return bool(np.allclose(a, b, atol=tolerance))

# (a) vs (b): eigenvectors are stored as columns, so transpose to get one
# vector per row before comparing with the PCA loading vectors.
same_a_b = compare_loading_vectors(loading_vectors, eigenvectors.T[:4])
print(
    "The loading vectors from (a) and (b) are the same."
    if same_a_b
    else "The loading vectors from (a) and (b) are not the same."
)

# (a) vs (c): the SVD result already holds one vector per row.
same_a_c = compare_loading_vectors(loading_vectors, loading_vectors_svd[:4])
print(
    "The loading vectors from (a) and (c) are the same."
    if same_a_c
    else "The loading vectors from (a) and (c) are not the same."
)



Are the loading vectors from (a), (b), and (c) exactly the same?

The loading vectors from (a) and (b) are not the same.
The loading vectors from (a) and (c) are not the same.


Why are the loading vectors from the different methods not always exactly the same?

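* **Sign Ambiguity:** The eigenvectors obtained from `np.linalg.eig()` and `np.linalg.svd()`
can have opposite signs compared to the loading vectors from `PCA()`. This is because the direction
of a principal component is arbitrary: if $v$ is a unit-length loading vector, then $-v$ lies along the
same axis and explains exactly the same variance, so each method may return either one. In the outputs
above, PC2 and PC4 from `np.linalg.eig()` and PC1, PC3, and PC4 from `np.linalg.svd()` are the negatives
of the corresponding `PCA()` vectors, while their absolute values agree to every printed digit. This is
why the strict element-wise comparison reports "not the same".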

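* **Numerical Precision:** The three methods use different algorithms, so their results can also differ by tiny floating-point rounding errors. This is why the comparison uses a small tolerance (`1e-6`) instead of testing for exact equality.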