In this notebook we create different PCA models (i.e. models trained on different data and/or with different function parameters), which we save so that they can be re-used later.

In [None]:
from preprocessing import *
import joblib
import matplotlib.pyplot as plt

In [None]:
# Directory (relative path) into which the fitted PCA model and scaler are saved via joblib
save_path = 'models/pca'

In [None]:
# Preprocess data: load the training HSI files as a flattened matrix (masking enabled, no per-image normalization)
X = load_and_flatten_hsi('../Data/HDF5_FILES/train', mask_dir='../Data/MASKS/train', apply_mask=True, individual_normalize=False, mask_method=1)
print(f"Data shape before PCA: {X.shape}")

# Standardize data
# The fitted scaler is saved alongside the PCA model because new data must be
# transformed with this same scaler before calling pca.transform.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
# A float n_components in (0, 1) lets scikit-learn pick the number of components
# needed to reach that fraction of explained variance.
pca = PCA(n_components=0.995)    # Retain 99.5% of variance
X_pca = pca.fit_transform(X_scaled)

# Print results
# Note: this is the *cumulative* explained variance ratio; its last entry is the total retained variance.
explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
print("---------------------------------")
print(f"Number of components chosen: {pca.n_components_}")
print(f"Explained variance ratio: {explained_variance_ratio[-1]:.4f}")
print(f"Data shape after PCA: {X_pca.shape}")

# Save PCA model and scaler
# joblib.dump does not create missing parent directories, so make sure the target
# directory exists; otherwise the cell fails only after the expensive fit above.
os.makedirs(save_path, exist_ok=True)
joblib.dump(pca, os.path.join(save_path, 'first_pca_model.pkl'))    # TODO: Maybe change pkl to joblib
joblib.dump(scaler, os.path.join(save_path, 'first_scaler.pkl'))

In [None]:
# Loadings: each principal axis scaled by the square root of its explained variance
# (rows of `loadings` follow the input features, columns follow the components)
component_scale = np.sqrt(pca.explained_variance_)
loadings = pca.components_.T * component_scale

# Wavelength axis for the plot, read from one of the training files
_, wlens = LoadHSI('../Data/HDF5_FILES/train/FX10_07SEPT2023_1B1.hdf5', return_wlens=True)

# Draw one curve per retained principal component
fig, ax = plt.subplots(figsize=(10, 6))
for pc_number, pc_loading in enumerate(loadings.T, start=1):
    ax.plot(wlens, pc_loading, label=f'PC {pc_number}')

ax.set_xlabel('Wavelength (nm)')
ax.set_ylabel('Loading Value')
ax.set_title('Principal Component Loadings')
ax.legend()
ax.grid()
plt.show()