In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix

In [2]:
file_path = Path("modified_0002.pkl")

# Load the pickled DataFrame (this is a pickle, not a Parquet file).
# Later cells read its 'emb' column and merge on its 'title' column.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle(file_path)

In [3]:
# Scale every embedding (one row per document) to unit L2 norm.
# norm='l2' and axis=1 are scikit-learn's defaults, spelled out here for the reader.
normalized_embeddings = normalize(df['emb'].tolist(), norm='l2', axis=1)

In [4]:
# Dimensionality reduction using PCA
pca_components = 256
# Seed the estimator so that, if PCA picks its randomized SVD solver, repeated runs
# yield the same projection (random_state has no effect on the deterministic solvers).
pca = PCA(n_components=pca_components, random_state=0)

# NOTE(review): a "type mismatch" was reported for normalized_embeddings at this call.
# Presumably that is a static type-checker warning rather than a runtime error, since
# the recorded output of this cell shows the call completing - confirm.
PCA_reduced_embeddings = pca.fit_transform(normalized_embeddings)

# This is the shape AFTER reduction: (n_rows, pca_components).
print(f"Reduced size of vector {PCA_reduced_embeddings.shape}")


Original size of vector (253344, 256)


In [6]:
num_clusters = 8

# Fit a Gaussian Mixture Model with `num_clusters` components (verbose progress output)
# and obtain the most probable component for every row in the same call.
# fit_predict returns the labels that fit(X).predict(X) would give, without a second
# full pass over the data.
gmm = GaussianMixture(n_components=num_clusters, random_state=0, verbose=1)
cluster_labels = gmm.fit_predict(PCA_reduced_embeddings)

# Add cluster labels to the DataFrame. The labels array is assigned positionally,
# in the same row order the embeddings were taken from df['emb'].
df['cluster'] = cluster_labels

Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
Initialization converged: True


In [7]:
# Display how many rows were assigned to each cluster label.
df['cluster'].value_counts()

6    49633
3    45758
0    38744
4    32854
2    29020
7    24622
5    19534
1    13179
Name: cluster, dtype: int64

In [10]:
# The cells below only read 'title', 'text' and 'cluster', so the embedding column is
# dropped here. Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns='emb', errors='ignore')
print(df.head())

  title  cluster
0     1        7
1     2        0
2     3        3
3     4        5
4     5        3


In [8]:
file_path = Path("modified_text_0002.pkl")

# Load the pickled DataFrame holding the document text (this is a pickle, not a Parquet file)
# and print its first row to show the available columns.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
df_text = pd.read_pickle(file_path)
print(df_text.iloc[0])

title                                                    1
text     If you know the application which can open 000...
Name: 0, dtype: object


In [9]:
# Attach the document text to each clustered row; only titles present in both frames are kept.
merged_df = df.merge(df_text, on='title', how='inner')

In [10]:
# Sanity check: print the first merged row to confirm which columns came through the merge.
print(merged_df.iloc[0])

title                                                      1
emb        [0.055311203, -0.17642975, 0.20465851, -0.0252...
cluster                                                    7
text       If you know the application which can open 000...
Name: 0, dtype: object


In [11]:
# Cluster labels in order of first appearance in merged_df.
unique_clusters = merged_df['cluster'].unique()

# Map each cluster label to its own DataFrame holding just the 'title' and 'text'
# columns of that cluster's rows, re-indexed from 0.
cluster_dataframes = {
    label: merged_df.loc[merged_df['cluster'] == label, ['title', 'text']].reset_index(drop=True)
    for label in unique_clusters
}

In [12]:
# # print(cluster_dataframes[7].head())
# for index, row in cluster_df.iterrows():
#     print(row['title'])
#     print(row['text'])
#     break

In [13]:
# Save each cluster as a TREC file: one <DOC> per row, with the title on the first
# line of <TEXT> followed by the document text.
trec_dir = 'trecFiles'
# Create the output directory up front; open(..., 'w') does not create missing folders.
Path(trec_dir).mkdir(parents=True, exist_ok=True)

for cluster, cluster_df in cluster_dataframes.items():
    trec_filename = f'{trec_dir}/cluster_{cluster}_output.trec'

    with open(trec_filename, 'w', encoding='utf-8') as trec_file:
        # Walk the two columns directly instead of iterrows(), which builds a
        # Series object for every row.
        for title, text in zip(cluster_df['title'], cluster_df['text']):
            # Write TREC format for each document
            trec_file.write(
                "<DOC>\n"
                f"<TEXT>\n{title}\n"
                f"{text}\n</TEXT>\n"
                "</DOC>\n"
            )

    print(f"TREC file for Cluster {cluster} created: {trec_filename}")

TREC file for Cluster 7 created: trecFiles/cluster_7_output.trec
TREC file for Cluster 4 created: trecFiles/cluster_4_output.trec
TREC file for Cluster 3 created: trecFiles/cluster_3_output.trec
TREC file for Cluster 0 created: trecFiles/cluster_0_output.trec
TREC file for Cluster 6 created: trecFiles/cluster_6_output.trec
TREC file for Cluster 5 created: trecFiles/cluster_5_output.trec
TREC file for Cluster 2 created: trecFiles/cluster_2_output.trec
TREC file for Cluster 1 created: trecFiles/cluster_1_output.trec
