# PCA using Scikit-Learn on Auto Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

## Step 1: Load and Clean Data
Same cleaning steps as the scratch version:
- Remove rows with missing horsepower (recorded as `'?'` in the CSV)
- Convert horsepower to numeric
- Drop 'name' column

In [2]:
# Load the raw file once and leave it untouched, so the cleaned frame below
# is always derived from the same starting point when this cell is re-run.
auto_raw = pd.read_csv('Auto.csv')

# Build the cleaned frame in a single chain. Every step returns a new
# DataFrame, so nothing is assigned into a filtered slice (the pattern that
# triggers pandas' SettingWithCopyWarning).
Auto = (
    auto_raw
    # Remove rows whose horsepower is the '?' placeholder
    .loc[auto_raw['horsepower'] != '?']
    # Convert horsepower to numeric
    .assign(horsepower=lambda df: pd.to_numeric(df['horsepower']))
    # Drop 'name' column if it is present; do nothing otherwise
    .drop(columns=['name'], errors='ignore')
)

print("Cleaned Data Shape:", Auto.shape)
print(Auto.head())

Cleaned Data Shape: (392, 8)
    mpg  cylinders  displacement  horsepower  weight  acceleration  year  \
0  18.0          8         307.0         130    3504          12.0    70   
1  15.0          8         350.0         165    3693          11.5    70   
2  18.0          8         318.0         150    3436          11.0    70   
3  16.0          8         304.0         150    3433          12.0    70   
4  17.0          8         302.0         140    3449          10.5    70   

   origin  
0       1  
1       1  
2       1  
3       1  
4       1  


## Step 2: Standardize Data
Use `StandardScaler` to rescale every column to mean 0 and variance 1. Note that it divides by the population standard deviation (`ddof=0`).

In [3]:
# Learn per-column mean/std from the cleaned frame, then apply the scaling.
scaler = StandardScaler()
X_scaled = scaler.fit(Auto).transform(Auto)

# Sanity check: column means should be ~0 (floating-point noise) and stds 1.
print("Mean of scaled data:\n", X_scaled.mean(axis=0))
print("Std of scaled data:\n", X_scaled.std(axis=0))

Mean of scaled data:
 [ 1.45008722e-16 -1.08756541e-16 -7.25043608e-17 -1.81260902e-16
 -1.81260902e-17  4.35026165e-16 -1.16006977e-15  1.35945676e-16]
Std of scaled data:
 [1. 1. 1. 1. 1. 1. 1. 1.]


## Step 3: Perform PCA
Fit `sklearn.decomposition.PCA` on the standardized data. Because `n_components` is not set, all 8 components are kept.

In [4]:
# Default PCA() (no n_components given); fit() returns the estimator itself,
# so construction and fitting can be chained.
pca = PCA().fit(X_scaled)

# Bare expression so the notebook displays the fitted estimator's parameters.
pca

0,1,2
,n_components,
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


## Step 4: Results
Comparing these results with the scratch implementation.

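**Note:** Each eigenvector (principal component) is unique only up to a sign flip. If a component has the opposite sign compared to your scratch implementation, it is still the same principal component. Also keep in mind that `pca.components_` stores one eigenvector per **row**, whereas `np.linalg.eig` / `np.linalg.eigh` return eigenvectors as **columns**.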

In [5]:
# Pair each heading with the fitted attribute it describes, then print them
# in order: eigenvalues, eigenvectors (one per row), and variance ratios.
pca_results = (
    ("Eigenvalues (Explained Variance):\n", pca.explained_variance_),
    ("\nEigenvectors (Components):\n", pca.components_),
    ("\nExplained Variance Ratio:\n", pca.explained_variance_ratio_),
)
for heading, values in pca_results:
    print(heading, values)

Eigenvalues (Explained Variance):
 [5.38962134 0.94607672 0.81371946 0.4873993  0.18329415 0.11461431
 0.05368377 0.0320513 ]

Eigenvectors (Components):
 [[-0.38586239  0.4023885   0.41644435  0.40183594  0.40157579 -0.2647309
  -0.21386777 -0.27786815]
 [ 0.07663269  0.13842878  0.12632499 -0.11148007  0.21102     0.41690206
   0.6904632  -0.50150064]
 [-0.29228579 -0.07223935 -0.07423622 -0.23605571  0.00089399  0.63943514
  -0.5871892  -0.30732382]
 [-0.09998251  0.21603551  0.13581398  0.11971643  0.32246785  0.49280794
   0.10601968  0.74328281]
 [ 0.74036644  0.48261485  0.30331627 -0.08426839 -0.13127292  0.09773197
  -0.30134385 -0.04739508]
 [ 0.38735165 -0.53092548 -0.00699705  0.6667096   0.23585961  0.20293343
  -0.11002592 -0.12086663]
 [ 0.19588516 -0.27878265  0.08422855 -0.53504996  0.72202073 -0.22891382
  -0.12501506  0.0345266 ]
 [-0.1151321  -0.41774679  0.82916553 -0.13477548 -0.30991105  0.03518826
   0.0542884   0.07951102]]

Explained Variance Ratio:
 [0.671984