In [3]:
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
from ast import literal_eval
import umap

In [41]:
# Load the per-document table; its 'embedding' column is parsed in the following cell.
doc_embeddings = pd.read_csv(filepath_or_buffer="Doc_Embeddings.csv")

In [42]:
# Parse each stringified embedding into a Python list.
# The isinstance guard keeps this cell idempotent: literal_eval raises on values
# that are already lists, so re-running the cell without reloading the CSV would
# otherwise fail.
doc_embeddings['embedding'] = doc_embeddings['embedding'].apply(
    lambda value: literal_eval(value) if isinstance(value, str) else value
)


In [75]:
# Stack the per-document embedding lists into a single 2D matrix (one row per document)
embedding_rows = doc_embeddings['embedding'].to_list()
embeddings_array = np.asarray(embedding_rows)
embeddings_array.shape

(546, 384)

In [55]:
# Project the embeddings onto their leading principal components.
# 10 components are kept here; use 2 instead if the goal is a scatter plot.
n_components = 10

# random_state pins the randomized SVD solver that scikit-learn's 'auto' mode may
# select for an input of this size, so the components are reproducible across runs.
pca = PCA(n_components=n_components, random_state=42)

# Fit PCA and map the data into the low-dimensional space in a single step
embeddings_pca = pca.fit_transform(embeddings_array)

In [57]:
# Sanity check: one row per document, one column per retained component
np.shape(embeddings_pca)

(546, 10)

In [59]:
# Report how much of the original variance each retained component captures,
# followed by the cumulative share as a percentage.
variance_ratios = pca.explained_variance_ratio_
total_variance_pct = round(sum(variance_ratios) * 100, 2)
print("Explained variance ratio of each component:", variance_ratios)
print(f"These {len(variance_ratios)} components account for {total_variance_pct}% of the original variance of the dataset")

Explained variance ratio of each component: [0.09969506 0.06001461 0.03422296 0.03322587 0.02976361 0.02910926
 0.02354175 0.02168496 0.01995102 0.01744422]
These 10 components account for 36.87% of the original variance of the dataset


In [60]:
# Name the columns PC1..PCk from the actual width of the projection, so this cell
# keeps working when n_components is changed instead of failing on a length mismatch.
pca_columns = [f"PC{i}" for i in range(1, embeddings_pca.shape[1] + 1)]
pca_df = pd.DataFrame(embeddings_pca, columns=pca_columns)
pca_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,0.421970,0.060598,-0.005837,0.252857,0.093542,0.258626,0.216162,0.193012,0.233460,-0.060679
1,0.396880,-0.008034,-0.001619,0.242710,0.107599,0.243641,0.241079,0.157295,0.152002,-0.034439
2,0.156880,0.390225,-0.046728,-0.071126,-0.016850,0.120469,-0.005960,-0.098720,-0.174771,-0.185665
3,0.225348,0.371690,-0.167144,-0.053522,-0.027385,0.073489,-0.078679,-0.125000,-0.052496,-0.231188
4,0.197406,0.136551,0.147796,0.054663,0.002721,0.282046,0.023183,0.072061,-0.115902,0.038635
...,...,...,...,...,...,...,...,...,...,...
541,0.069077,-0.017216,-0.058836,-0.210223,0.035152,-0.174377,-0.076611,-0.243222,0.074454,-0.217489
542,-0.101937,0.040628,-0.167586,-0.157418,0.038943,-0.055873,-0.099020,-0.146241,0.050466,-0.133843
543,0.044595,0.233074,-0.012257,-0.201202,-0.076471,-0.214881,-0.105018,-0.201356,-0.054250,-0.062819
544,0.073826,0.193906,0.142891,-0.243016,-0.147900,-0.247993,-0.004138,-0.096134,-0.038255,-0.115761


In [62]:
# Append the principal-component columns to the document table (aligned on the row index)
doc_embeddings_with_pca = pd.concat(
    objs=[doc_embeddings, pca_df],
    axis="columns",
)
doc_embeddings_with_pca.shape

(546, 19)

In [63]:
# Persist the table with the PCA columns.
# index=False: the row index is just a positional counter; writing it would add an
# unnamed column that reappears as 'Unnamed: 0' when the file is read back with
# a plain pd.read_csv.
doc_embeddings_with_pca.to_csv('Doc_Embeddings_with_PCA.csv', index=False)

In [77]:
# Configure the UMAP projection.
# random_state makes the (otherwise stochastic) layout reproducible across runs;
# note that umap-learn runs single-threaded when a seed is set.
reducer = umap.UMAP(n_components=2,  # Adjust n_components as needed
                    n_neighbors=15,  # Experiment with different values
                    min_dist=0.05,  # Experiment with different values
                    metric='euclidean',  # Consider other metrics if appropriate
                    random_state=42)  # Fixed seed for reproducible embeddings


In [81]:
# Fit UMAP on the full-dimensional embeddings and keep the projected coordinates
umap_embeddings = reducer.fit_transform(X=embeddings_array)


In [82]:
# Sanity check: one row per document, one column per UMAP dimension
np.shape(umap_embeddings)

(546, 2)

In [83]:
# Name the columns UMAP1..UMAPk from the actual width of the projection, so this
# cell keeps working if the reducer's n_components is changed from 2.
umap_columns = [f"UMAP{i}" for i in range(1, umap_embeddings.shape[1] + 1)]
umap_df = pd.DataFrame(umap_embeddings, columns=umap_columns)

In [84]:
# Append the UMAP coordinates to the table that already carries the PCA columns
doc_embeddings_with_pca_and_umap = pd.concat(
    objs=[doc_embeddings_with_pca, umap_df],
    axis="columns",
)
doc_embeddings_with_pca_and_umap.shape

(546, 21)

In [85]:
# Persist the table with both the PCA and UMAP columns.
# index=False: the row index is just a positional counter; writing it would add an
# unnamed column that reappears as 'Unnamed: 0' when the file is read back with
# a plain pd.read_csv.
doc_embeddings_with_pca_and_umap.to_csv('Doc_Embeddings_with_PCA_and_UMAP.csv', index=False)

In [89]:
# Summary statistics for the 'dist' column of the combined table.
# NOTE(review): 'dist' is not created in any visible cell — presumably it is one of
# the columns loaded from Doc_Embeddings.csv; confirm it was not added by a
# since-deleted cell, otherwise this fails on Restart & Run All.
doc_embeddings_with_pca_and_umap["dist"].describe()

count    546.000000
mean       1.361697
std        0.103990
min        0.000000
25%        1.322944
50%        1.376098
75%        1.422511
max        1.514929
Name: dist, dtype: float64