In [2]:
import pandas as pd
from pathlib import Path

from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [3]:
file_path = Path("modified_0002.pkl")

# Open the Parquet file
df = pd.read_pickle(file_path)

In [4]:
# Normalize the embeddings
normalized_embeddings = normalize(df['emb'].tolist())

In [5]:
# Dimensionality Reduction using PCA
pca_components = 256
pca = PCA(n_components=pca_components)

print(f"Original size of vector {normalized_embeddings.shape}")

# normalised embeddings are showing a type misamtch. Don't know how to fix it
PCA_reduced_embeddings = pca.fit_transform(normalized_embeddings)

print(f"Original size of vector {PCA_reduced_embeddings.shape}")

Original size of vector (253344, 1024)
Original size of vector (253344, 256)


In [6]:
# K Means Clustring 
num_clusters = 8  # Adjust the number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(PCA_reduced_embeddings)

df['cluster'] = kmeans.labels_

  super()._check_params_vs_input(X, default_n_init=10)


In [7]:
df['cluster'].value_counts()

5    51034
0    46739
1    35165
2    33402
6    25844
7    23905
3    23049
4    14206
Name: cluster, dtype: int64

In [8]:
df.drop('emb', axis=1, inplace=True)
print(df.head())

  title  cluster
0     1        3
1     2        0
2     3        5
3     4        2
4     5        5


In [10]:
file_path = Path("modified_text_0002.pkl")

# Open the Parquet file
df_text = pd.read_pickle(file_path)
print(df_text.iloc[0])

title                                                    1
text     If you know the application which can open 000...
Name: 0, dtype: object


In [11]:
merged_df = pd.merge(df, df_text, on='title')

In [12]:
print(merged_df.iloc[0])

title                                                      1
cluster                                                    3
text       If you know the application which can open 000...
Name: 0, dtype: object


In [13]:
unique_clusters = merged_df['cluster'].unique()

# Create separate DataFrames for each cluster
cluster_dataframes = {}

for cluster in unique_clusters:
    cluster_dataframes[cluster] = merged_df[merged_df['cluster'] == cluster][['title', 'text']].reset_index(drop=True)

In [15]:
# # print(cluster_dataframes[7].head())
# for index, row in cluster_df.iterrows():
#     print(row['title'])
#     print(row['text'])
#     break

In [16]:
#Save new clusters as a trec files.
for cluster, cluster_df in cluster_dataframes.items():
    trec_filename = f'trecFiles/cluster_{cluster}_output.trec'

    with open(trec_filename, 'w', encoding='utf-8') as trec_file:
        for index, row in cluster_df.iterrows():
            # Write TREC format for each document
            trec_file.write(f"<DOC>\n")
            trec_file.write(f"<TEXT>\n{row['title']}\n")
            trec_file.write(f"{row['text']}\n</TEXT>\n")
            trec_file.write(f"</DOC>\n")

    print(f"TREC file for Cluster {cluster} created: {trec_filename}")

TREC file for Cluster 3 created: trecFiles/cluster_3_output.trec
TREC file for Cluster 0 created: trecFiles/cluster_0_output.trec
TREC file for Cluster 5 created: trecFiles/cluster_5_output.trec
TREC file for Cluster 2 created: trecFiles/cluster_2_output.trec
TREC file for Cluster 7 created: trecFiles/cluster_7_output.trec
TREC file for Cluster 1 created: trecFiles/cluster_1_output.trec
TREC file for Cluster 4 created: trecFiles/cluster_4_output.trec
TREC file for Cluster 6 created: trecFiles/cluster_6_output.trec
