# PCA Analysis
This notebook is a first attempt at applying principal component analysis (PCA) to our images.

In [None]:
import pandas as pd
import numpy as np
import json
import os
import pickle #save features
import shutil #save images
from tqdm import tqdm #progress bar
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from skimage import io
from skimage.transform import resize

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Authenticate.
from google.colab import auth
auth.authenticate_user()

# Install Cloud Storage FUSE.
!echo "deb https://packages.cloud.google.com/apt gcsfuse-`lsb_release -c -s` main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
!apt -qq update && apt -qq install gcsfuse

deb https://packages.cloud.google.com/apt gcsfuse-jammy main
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2659  100  2659    0     0  24569      0 --:--:-- --:--:-- --:--:-- 24394
OK
49 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mhttps://packages.cloud.google.com/apt/dists/gcsfuse-jammy/InRelease: Key is stored in legacy trusted.gpg keyring (/etc/apt/trusted.gpg), see the DEPRECATION section in apt-key(8) for details.[0m
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)[0m
The following NEW packages will be installed:
  gcsfuse
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 11.0 MB of archives.
After this operation, 0 B of additional disk space will be use

In [None]:
# Mount a Cloud Storage bucket or location
# NOTE: despite its name, `mount_path` is the value handed to gcsfuse as the
# bucket to mount; `local_path` is the local directory it is mounted on.
mount_path = "281-project-d5d834b8-2d7c-11ef-91d5-b89a2a9d8518"
local_path = f"/mnt/gs/{mount_path}"

# Create the mount point (-p: no error if it already exists), then mount the
# bucket there. Per the gcsfuse docs, --implicit-dirs makes folders that exist
# only as object-name prefixes show up as directories.
!mkdir -p {local_path}
!gcsfuse --implicit-dirs {mount_path} {local_path}

{"timestamp":{"seconds":1721507426,"nanos":474446082},"severity":"INFO","message":"Start gcsfuse/2.3.2 (Go version go1.22.4) for app \"\" using mount point: /mnt/gs/281-project-d5d834b8-2d7c-11ef-91d5-b89a2a9d8518\n"}
{"timestamp":{"seconds":1721507426,"nanos":474750584},"severity":"INFO","message":"GCSFuse mount command flags: {\"AppName\":\"\",\"Foreground\":false,\"ConfigFile\":\"\",\"MountOptions\":{},\"DirMode\":493,\"FileMode\":420,\"Uid\":-1,\"Gid\":-1,\"ImplicitDirs\":true,\"OnlyDir\":\"\",\"RenameDirLimit\":0,\"IgnoreInterrupts\":true,\"CustomEndpoint\":null,\"BillingProject\":\"\",\"KeyFile\":\"\",\"TokenUrl\":\"\",\"ReuseTokenFromUrl\":true,\"EgressBandwidthLimitBytesPerSecond\":-1,\"OpRateLimitHz\":-1,\"SequentialReadSizeMb\":200,\"AnonymousAccess\":false,\"MaxRetrySleep\":30000000000,\"StatCacheCapacity\":20460,\"StatCacheTTL\":60000000000,\"TypeCacheTTL\":60000000000,\"KernelListCacheTtlSeconds\":0,\"HttpClientTimeout\":0,\"MaxRetryDuration\":-1000000000,\"RetryMultiplier

In [None]:
# Sanity-check the mount: list the top-level entries visible under the mount point.
os.listdir(local_path)

['features', 'preprocessed-data', 'raw-data', 'train_data_preprocessed']

In [None]:
# Both working folders sit directly under the mounted bucket root:
#   input_path  -> images to read
#   output_path -> where extracted features go
input_path, output_path = (
    os.path.join(local_path, subfolder)
    for subfolder in ('train_data_preprocessed', 'features')
)

# Make sure the output folder is present before anything writes to it.
os.makedirs(output_path, exist_ok=True)

## PCA
### NOTE: This section still needs to be run. It was taking too long the first time I tried; I will run it in the morning.

In [None]:
def load_flattened_images(folder):
    """Read every file in `folder` as an image and flatten each to a 1-D vector.

    Files are visited in sorted filename order so that row i of the returned
    matrix maps to the same file on every run (os.listdir / os.scandir give no
    ordering guarantee).

    Parameters
    ----------
    folder : str
        Directory whose regular files are each read with skimage.io.imread.
        Sub-directories are skipped.

    Returns
    -------
    matrix : np.ndarray
        float32 array with one flattened image per row.
    filenames : list of str
        filenames[i] is the file that produced matrix[i].

    Raises
    ------
    ValueError
        If the folder contains no files, or if an image's shape differs from
        the first image's shape (the rows could not form a rectangular matrix).
    """
    with os.scandir(folder) as entries:
        filenames = sorted(entry.name for entry in entries if entry.is_file())
    if not filenames:
        raise ValueError(f"No image files found in {folder!r}")

    vectors = []
    expected_shape = None
    for filename in tqdm(filenames, desc="Loading images"):
        # io.imread raises on unreadable files rather than returning None,
        # so no None check is needed here.
        img = io.imread(os.path.join(folder, filename))
        if expected_shape is None:
            expected_shape = img.shape
        elif img.shape != expected_shape:
            raise ValueError(
                f"{filename} has shape {img.shape}, expected {expected_shape}; "
                "all images must share one shape to be stacked for PCA"
            )
        vectors.append(img.ravel())

    # float32 halves the memory of the float64 matrix StandardScaler would
    # otherwise create from integer pixels; scikit-learn keeps float32 input
    # as float32.
    return np.asarray(vectors, dtype=np.float32), filenames


# load all images into one (n_images, n_pixels) matrix; `image_filenames`
# records which file each row came from
images, image_filenames = load_flattened_images(input_path)

# standardize the data: zero mean and unit variance per pixel position
scaler = StandardScaler()
images_std = scaler.fit_transform(images)

In [None]:
# Fit PCA on the standardized pixels. A fractional n_components asks
# scikit-learn to keep however many components are needed to reach that
# share of the total variance (95% here).
VARIANCE_TO_RETAIN = .95
pca = PCA(n_components=VARIANCE_TO_RETAIN)
images_pca = pca.fit_transform(images_std)

print(f"Reduced dimensions to {images_pca.shape[1]} components.")

In [None]:
# visualize: cumulative explained variance against the number of components kept
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Component counts start at 1. Plotting against the default 0-based index
# would shift the curve left by one and misreport how many components are
# needed to reach a given variance level.
component_counts = np.arange(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(component_counts, cumulative_variance)
ax.set_xlabel('Number of Components')
ax.set_ylabel('Variance Explained')
ax.set_title('PCA Analysis')
ax.grid()
plt.show()