In [1]:
##### This Jupyter notebook generates PCA CVs. 
##### Please ensure you have properly set up the conda environment with all libraries.
##### In this notebook, the total number of features is 7 and the number of final CVs (principal components) is set to 2.

##### Author: MO (latest update: May 28, 2024)

In [2]:
##### User Inputs ####

# Number of data points in each class
# (*note: each class should have the same number of data points)
nDataPoints = 754

# Number of eigenvectors or CVs to keep (reduced dimensionality)
num_eigenvector = 2

# Names of the feature columns to use
descriptor_list = [
    'res159.439',
    'res245.369',
    'res64.137',
    'res199.471',
    'res78.450',
    'res242.340',
    'res77.293',
]

In [3]:
### STEP 0. Import libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

In [4]:
### STEP 1. Load input data
# Read the input table from the working directory; later cells look up the
# columns named in descriptor_list in this DataFrame.
df = pd.read_csv(filepath_or_buffer='mpso.csv')

In [5]:
### STEP 1b. Zero-mean the data
np.set_printoptions(precision=8)
for column_name in descriptor_list:
    # Compute the column mean once, report it, then subtract it from the column.
    column_mean = df[column_name].mean()
    print('Mean for ', column_name, ': ', column_mean)
    df[column_name] = df[column_name] - column_mean

Mean for  res159.439 :  21.74415728372237
Mean for  res245.369 :  38.09927344891246
Mean for  res64.137 :  16.775558394920427
Mean for  res199.471 :  30.131755684036246
Mean for  res78.450 :  13.889633035910256
Mean for  res242.340 :  26.819251453872678
Mean for  res77.293 :  40.03164760639258


In [6]:
### STEP 2. Separate data and generate labels
# Select the features by name rather than by column position, so X always
# contains exactly the columns listed in descriptor_list (the same ones that
# were zero-meaned), in that order, regardless of the CSV column layout.
X = df[descriptor_list].to_numpy(dtype=np.float64)

# Class labels 0.0, 1.0 and 2.0, each repeated nDataPoints times.
# NOTE(review): assumes the rows of the CSV are ordered class 0, then 1, then 2 - confirm.
y = np.repeat([0.0, 1.0, 2.0], nDataPoints)

# Fail early with a clear message if the labels do not line up with the data.
if len(y) != len(X):
    raise ValueError(
        'Expected {} rows (3 classes x nDataPoints={}), but the data has {} rows'.format(
            len(y), nDataPoints, len(X)))
print(X)

[[-0.76525385  2.08614583  1.03468783 ...  0.23212689  1.08992405
   0.75515021]
 [-1.47115657  2.44363145  0.69819475 ...  2.78658885  1.9426788
   2.18380523]
 [-2.18411414  2.18471138 -0.08038096 ...  1.60488209  1.93621906
   1.18560777]
 ...
 [ 2.30423513 -2.98455501  2.50959053 ... -3.00694806 -2.71507559
  -1.41571489]
 [ 3.33669351 -3.31498275  2.7419514  ... -3.27287224 -3.22628615
  -1.51252031]
 [ 1.98100839 -3.30819255  2.2219749  ... -2.74602217 -3.0444609
  -1.75634772]]


In [7]:
### STEP 3. Perform PCA
# Project the features onto the first num_eigenvector principal components.
pca = PCA(n_components=num_eigenvector)
pca_X = pca.fit_transform(X)
print('Shape before PCA: ', X.shape)
print('Shape after PCA: ', pca_X.shape)

# Build the column names (PC1, PC2, ...) from the actual number of components
# so that changing num_eigenvector does not break the DataFrame construction.
pc_columns = ['PC{}'.format(i) for i in range(1, pca_X.shape[1] + 1)]
pca_df = pd.DataFrame(data=pca_X, columns=pc_columns)
pca_df['class'] = y
# Save the projected data together with the class labels.
pca_df.to_csv('PCA.csv', index=False)

Shape before PCA:  (2262, 7)
Shape after PCA:  (2262, 2)


In [8]:
### STEP 4. Calculate variances (eigenvalues) and CVs (eigenvectors)
# Report each fitted PCA attribute next to its label, one per line.
pca_report = (
    ('Variances:', pca.explained_variance_),
    ('Variance ratios:', pca.explained_variance_ratio_),
    ('CVs:', pca.components_),
)
for label, values in pca_report:
    print(label, values)

Variances: [24.80599747  6.69131804]
Variance ratios: [0.7148015  0.19281483]
CVs: [[-0.22499166  0.46269551 -0.3671284   0.32273212  0.52853859  0.27737808
   0.37424683]
 [ 0.16315402  0.03483583  0.74273121  0.38133827  0.43384173  0.13819161
  -0.26035195]]
