In [1]:
### dimensionality reduction with PCA
## input: array of feature vectors
## output: array of feature vectors

import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler



In [2]:
# Load the pickled array of feature vectors from disk.
# NOTE: pickle.load can execute arbitrary code while deserialising --
# only point feature_path at files you created or otherwise trust.
feature_path = r"/home/tschernn/clustering/features_vgg.pkl"

with open(feature_path, 'rb') as fh:
    data = pickle.load(fh)

# Show the array shape as the cell output (rows are treated as samples by
# the scaler / PCA in the following cells).
data.shape

(155, 4096)

In [3]:
# PCA is sensitive to feature scale, so standardise every feature first
# (zero mean, unit variance).
Clus_dataSet = StandardScaler().fit_transform(data)

# Configure the PCA model. n_components can be given in two ways:
#   * a float in (0, 1): keep as many components as are needed to retain
#     that fraction of the variance (higher = closer to the original data);
#   * an int: reduce to exactly that many dimensions.
#
# Variant used here -- retain a target fraction of the variance:
variance = 0.98

# Alternative -- fix the number of dimensions yourself:
#   dims = 2
#   pca = PCA(dims, random_state=22)
#   print(f'cumulative variance explained by {dims} principal components: '
#         f'{np.sum(pca.explained_variance_ratio_)}')

# Build the model and fit it on the standardised features in one step
# (fit returns the estimator itself).
pca = PCA(n_components=variance, random_state=22).fit(Clus_dataSet)

print(f'number of components after PCA {variance} = {pca.n_components_}')

number of components after PCA 0.98 = 143


In [4]:
# Project the standardised features onto the fitted principal components.
#
# The projection is recomputed from `data` instead of from `Clus_dataSet`:
# this cell overwrites `Clus_dataSet` with the reduced array, so feeding
# `Clus_dataSet` back into pca.transform on a second run would raise a
# feature-count mismatch. Standardising `data` again is deterministic and
# yields the same array the PCA was fitted on, which keeps this cell
# idempotent. (Keep the scaling here in sync with the fitting cell.)
Clus_dataSet = pca.transform(StandardScaler().fit_transform(data))
print(f"Dimensions of our data after PCA  = {Clus_dataSet.shape}")

Dimensions of our data after PCA  = (155, 143)


In [5]:
### export data as pickle file

# Build the output path next to the input file: <input root>_dimred.pkl.
# os.path.splitext strips only the final extension, so dots in directory
# names or elsewhere in the file name no longer truncate the path (the
# previous split('.')[0] cut at the FIRST dot anywhere in the path).
# The result goes into its own variable; feature_path keeps pointing at the
# input file, so re-running this cell does not produce "_dimred_dimred".
dimred_root, _ = os.path.splitext(feature_path)
dimred_path = dimred_root + '_dimred.pkl'

with open(dimred_path, 'wb') as file:
    pickle.dump(Clus_dataSet, file)

# Report only after the file has been closed, i.e. fully written.
print(f'Saved features as {dimred_path}.')

Saved features as /home/tschernn/clustering/features_vgg_dimred.pkl.


In [6]:
# Map the reduced features back to the full-dimensional space, e.g. to
# inspect how much information the PCA kept.
# Note: the PCA was fitted on standardised features, so this reconstruction
# lives in the standardised space, not in the scale of the raw features.
approximation = pca.inverse_transform(Clus_dataSet)
print(f"Dimensions of our data after inverse transforming the PCA  = {approximation.shape}")

Dimensions of our data after inverse transforming the PCA  = (155, 4096)
