# Information Retrieval and Web Search
<p>
Course Project - Clustering documents to compress inverted index<br>
Giovanni Costa - 880892
</p>

In [1]:
import pandas as pd
import numpy as np
import pickle, multiprocessing
from scipy.sparse import save_npz, load_npz
from sklearn.cluster import DBSCAN, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_distances
from utils import parse_data_files, get_tfidf_repr, TSP_solver, random_search_silhouette
from Indexer import Indexer, EXIT_NUMBER_DOCS

input_path="input/"
output_path="output/"
CORE_NUM=multiprocessing.cpu_count()
np.random.seed(42)

## Data parsing and TF-IDF representation

In [None]:
df=parse_data_files()
print("Dataframe info:")
df.info()

In [None]:
sparse_docs, tf_idf_vocab=get_tfidf_repr(df)
print("TF-IDF info:")
print("Shape: ", sparse_docs.shape)
print("Size in MB: {:.3f} ".format(sparse_docs.data.nbytes/ (1024**2)))
save_npz(input_path+"sparse_tf-idf.npz", sparse_docs)
with open(input_path+"tf-idf_vocab.pkl", "wb") as file:
    pickle.dump(tf_idf_vocab, file)
df.to_parquet(input_path+"df.parquet")

## Clustering and hyperparameters tuning

In [2]:
sparse_docs_1=load_npz(input_path+'sparse_tf-idf.npz')
tf_idf_vocab=None
with open(input_path+'tf-idf_vocab.pkl', 'rb') as file:
    tf_idf_vocab=pickle.load(file)
df=pd.read_parquet(input_path+"df.parquet")

In [3]:
test=sparse_docs_1[:EXIT_NUMBER_DOCS,:]
sparse_docs=test
#sparse_doc=sparse_docs_1

In [4]:
""" sr_proj=SparseRandomProjection(eps=0.1, random_state=42)
sparse_docs_approx=sr_proj.fit_transform(sparse_docs)
print("Current shape: ", sparse_docs_approx.shape)
print("Current density ratio:", sparse_docs_approx.count_nonzero()/(sparse_docs_approx.shape[0]*sparse_docs_approx.shape[1]))
print("Previous shape: ", sparse_docs.shape)
print("Previous density ratio:", sparse_docs.count_nonzero()/(sparse_docs.shape[0]*sparse_docs.shape[1])) """;

In [5]:
#sparse_docs=sparse_docs_approx

### MiniBatch K-Means Method

In [6]:
n_iter=15
params_k_means={"n_clusters": [i for i in range(2, 101)]}
k_means_obj=MiniBatchKMeans(batch_size=256*CORE_NUM, n_init="auto") #For faster computations, you can set the batch_size greater than 256 * number of cores to enable parallelism on all cores
best_k_means=random_search_silhouette(k_means_obj, sparse_docs, params_k_means, n_iter)

100%|██████████| 15/15 [01:43<00:00,  6.87s/it]


Best parameters: {'n_clusters': 97}
Best score: 0.08549902


In [7]:
centroid_distances=cosine_distances(best_k_means.cluster_centers_) #kmeans.cluster_centers_[0] = centroid of cluster 0
k_means_tsp=TSP_solver(centroid_distances)

#Get the labels given to the centroid in order to get the best cluster transversal ordering
k_means_cluster_order=k_means_tsp

In [8]:
#s = pd.Series(k_means_cluster_order)
#s[s.duplicated()]

In [9]:
""" indices=np.nonzero(best_k_means.labels_==2)[0]
#doc_ids=df.iloc[indices]["doc_id"]
#dim=doc_ids.shape[0]
dim=indices.shape[0]
distances=cosine_distances(sparse_docs[indices], best_k_means.cluster_centers_[label].reshape(1,-1)).reshape(-1)
tmp_vals=dict(zip(np.argsort(distances), range(starting_val, starting_val+dim))) """;

' indices=np.nonzero(best_k_means.labels_==2)[0]\n#doc_ids=df.iloc[indices]["doc_id"]\n#dim=doc_ids.shape[0]\ndim=indices.shape[0]\ndistances=cosine_distances(sparse_docs[indices], best_k_means.cluster_centers_[label].reshape(1,-1)).reshape(-1)\ntmp_vals=dict(zip(np.argsort(distances), range(starting_val, starting_val+dim))) '

In [10]:
starting_val=0
k_means_docid_remapping={}
for label in k_means_cluster_order:
    indices=np.nonzero(best_k_means.labels_==label)[0]
    dim=indices.shape[0]
    if dim!=0: #some clusters might be empty 
        distances=cosine_distances(sparse_docs[indices], best_k_means.cluster_centers_[label].reshape(1,-1)).reshape(-1)
        tmp_vals=dict(zip(indices[np.argsort(distances)], range(starting_val, starting_val+dim)))
        k_means_docid_remapping.update(tmp_vals)
        starting_val+=dim

In [11]:
with open(input_path+"k_means_remapping.pkl", "wb") as file:
    pickle.dump(k_means_docid_remapping, file)

### DBSCAN method

In [12]:
n_iter=15
params_dbscan={"min_samples": [i for i in range(2, 21)],
                "eps": [i for i in np.arange(0.05, 3.05, 0.05)]}
dbscan_obj=DBSCAN(metric="cosine")
best_dbscan=random_search_silhouette(dbscan_obj, sparse_docs, params_dbscan, n_iter)

  7%|▋         | 1/15 [00:06<01:26,  6.15s/it]



 13%|█▎        | 2/15 [00:14<01:34,  7.26s/it]



 20%|██        | 3/15 [00:20<01:19,  6.66s/it]



 33%|███▎      | 5/15 [00:32<01:04,  6.47s/it]



 40%|████      | 6/15 [00:39<00:56,  6.33s/it]



 47%|████▋     | 7/15 [00:45<00:50,  6.27s/it]



 53%|█████▎    | 8/15 [00:51<00:43,  6.21s/it]



 60%|██████    | 9/15 [00:57<00:37,  6.17s/it]



 67%|██████▋   | 10/15 [01:03<00:30,  6.17s/it]



 73%|███████▎  | 11/15 [01:09<00:24,  6.19s/it]



 80%|████████  | 12/15 [01:16<00:18,  6.21s/it]



 87%|████████▋ | 13/15 [01:22<00:12,  6.22s/it]



 93%|█████████▎| 14/15 [01:28<00:06,  6.18s/it]



100%|██████████| 15/15 [01:34<00:00,  6.32s/it]

Best parameters: {'min_samples': 5, 'eps': 0.5}
Best score: -0.01783173





In [13]:
core_labels=best_dbscan.labels_[best_dbscan.core_sample_indices_]
core_index_list=[]
for label in np.unique(core_labels):
    indices=np.nonzero(core_labels==label)[0]
    label_indices=best_dbscan.core_sample_indices_[indices]
    index=np.random.choice(label_indices)
    core_index_list.append(index)
core_points=sparse_docs[core_index_list] #list of core points (one that represents one cluster)

In [14]:
core_points_distances=cosine_distances(core_points)
dbscan_tsp=TSP_solver(core_points_distances)

#Get the labels given to the core samples (representative elements) in order to get the best cluster transversal ordering
dbscan_cluster_order=dbscan_tsp+[-1] #add to the clusters also the outliers label

In [15]:
starting_val=0
dbscan_docid_remapping={}
for label in dbscan_cluster_order:
    indices=np.nonzero(best_dbscan.labels_==label)[0] #-1 is the noise
    dim=indices.shape[0]
    if dim!=0: #some clusters might be empty 
        distances=cosine_distances(sparse_docs[indices], core_points[label].reshape(1,-1)).reshape(-1)
        tmp_vals=dict(zip(indices[np.argsort(distances)], range(starting_val, starting_val+dim)))
        dbscan_docid_remapping.update(tmp_vals)
        starting_val+=dim

In [16]:
with open(input_path+"dbscan_remapping.pkl", "wb") as file:
    pickle.dump(dbscan_docid_remapping, file)

## Testing

In [17]:
indexer=Indexer()
inverted_index_standard=indexer.get_dict()
k_means_inverted_index=Indexer.remap_index(inverted_index_standard, k_means_docid_remapping)
dbscan_inverted_index=indexer.remap_index(inverted_index_standard, dbscan_docid_remapping)

 50%|█████     | 5/10 [00:02<00:02,  2.02it/s]


Total no. of terms (Voc. size): 35459
Total no. of tokens: 998681
Total no. of documents: 2377
Total no. of postings: 211806


In [18]:
#[np.diff(v[1]) for v in inverted_index_standard.values()]

In [19]:
#[np.diff(v[1]) for v in k_means_inverted_index.values()]

In [20]:
dim_inverted_index_standard = Indexer.get_total_VB_enc_size(inverted_index_standard)
dim_k_means_inverted_index = Indexer.get_total_VB_enc_size(k_means_inverted_index)
dim_dbscan_inverted_index = indexer.get_total_VB_enc_size(dbscan_inverted_index)
print(f"Standard inverted index dimension: {dim_inverted_index_standard} Bytes")

print(f"K Means method inverted index dimension: {dim_k_means_inverted_index} Bytes ~", end=" ")
print(round((dim_inverted_index_standard-dim_k_means_inverted_index)/(dim_inverted_index_standard+dim_k_means_inverted_index), 3)*100, "% reduction")

print(f"DBSCAN method inverted index dimension: {dim_dbscan_inverted_index} Bytes ~", end=" ")
print(round((dim_inverted_index_standard-dim_dbscan_inverted_index)/(dim_inverted_index_standard+dim_dbscan_inverted_index), 3)*100, "% reduction")

Standard inverted index dimension: 7329976.0 Bytes
K Means method inverted index dimension: 7455784.0 Bytes ~ -0.8999999999999999 % reduction
DBSCAN method inverted index dimension: 7489528.0 Bytes ~ -1.0999999999999999 % reduction
