# Information Retrieval and Web Search
<p>
Course Project - Clustering documents to compress the inverted index<br>
Giovanni Costa - 880892
</p>

In [None]:
import pandas as pd
import numpy as np
import pickle, multiprocessing
from scipy.sparse import save_npz, load_npz
from sklearn.cluster import DBSCAN, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_distances
from utils import parse_data_files, get_tfidf_repr, TSP_solver, random_search_silhouette

# Directory prefixes; the trailing slash matters because file paths are
# built below by plain string concatenation.
input_path = "input/"
output_path = "output/"

# CPU count reported by the system, used to size the K-Means mini-batches.
CORE_NUM = multiprocessing.cpu_count()

# Seed NumPy's global random generator so repeated runs are comparable.
np.random.seed(42)

## Data parsing and TF-IDF representation

In [None]:
# Build the documents dataframe with the project helper and show its
# column/dtype/memory summary.
df = parse_data_files()
print("Dataframe info:")
df.info()

In [None]:
# Compute the TF-IDF representation of the documents; the helper returns the
# sparse document matrix together with a second object stored as the vocabulary.
sparse_docs, tf_idf_vocab = get_tfidf_repr(df)

print("TF-IDF info:")
print("Shape: ", sparse_docs.shape)
# Only the buffer of stored values (`.data`) is measured, converted from
# bytes to MiB.
print("Size in MB: {:.3f} ".format(sparse_docs.data.nbytes / 1024 ** 2))

# Persist everything so the clustering cells can restart from disk.
save_npz(input_path + "sparse_tf-idf.npz", sparse_docs)
with open(input_path + "tf-idf_vocab.pkl", "wb") as vocab_file:
    pickle.dump(tf_idf_vocab, vocab_file)
df.to_parquet(input_path + "df.parquet")

## Clustering and hyperparameter tuning

In [None]:
# Reload the artifacts written by the TF-IDF cell so that clustering can be
# run without recomputing the representation.
sparse_docs_1 = load_npz(input_path + "sparse_tf-idf.npz")

tf_idf_vocab = None
with open(input_path + "tf-idf_vocab.pkl", "rb") as vocab_file:
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by this notebook itself.
    tf_idf_vocab = pickle.load(vocab_file)

df = pd.read_parquet(input_path + "df.parquet")

In [None]:
# Work on the first 100 documents only; every cell below clusters this slice.
# NOTE(review): looks like a debugging subset — confirm before a full run.
test = sparse_docs = sparse_docs_1[:100, :]

### MiniBatch K-Means Method

In [None]:
# Random search over the number of clusters, scored by the project's
# silhouette-based helper.
n_iter = 10
params_k_means = {"n_clusters": list(range(2, 36))}

# Batch size follows the scikit-learn hint that batches on the order of
# 256 * number of cores let the computation use all cores.
k_means_obj = MiniBatchKMeans(batch_size=256 * CORE_NUM, n_init="auto")

best_k_means = random_search_silhouette(
    k_means_obj, sparse_docs, params_k_means, n_iter
)

In [None]:
# Represent every cluster by its medoid: the document with the smallest
# cosine distance to the cluster centroid (one row of `distances` per centroid).
distances = cosine_distances(best_k_means.cluster_centers_, sparse_docs)
min_indices = np.argmin(distances, axis=1)
medoids = sparse_docs[min_indices]

# Solve a TSP over the medoids to obtain a traversal in which consecutive
# clusters are close to each other. The result is used as a sequence of row
# indices into `medoids_distances`.
medoids_distances = cosine_distances(medoids)
k_means_tsp = TSP_solver(medoids_distances)

# Row i of `cluster_centers_` is the centroid of label i, so medoid i stands
# for cluster i and the tour positions are already the cluster labels.
# Reading the label back from `labels_` of the medoid document is unsafe:
# the document nearest to a centroid may be assigned to another cluster,
# which would repeat one label and drop another from the ordering.
k_means_tour = np.asarray(k_means_tsp)
# Keep only the first visit of each cluster (guards against a closed tour
# that lists the starting node twice) while preserving the visiting order.
_, k_means_first_pos = np.unique(k_means_tour, return_index=True)
k_means_cluster_order = k_means_tour[np.sort(k_means_first_pos)]

In [None]:
# Assign new consecutive doc ids cluster by cluster, following the cluster
# traversal order, so that documents of the same cluster get adjacent ids.
starting_val = 0
docid_remaping = {}
k_means_labels = best_k_means.labels_

# Visit every cluster exactly once: a repeated label would re-number its
# documents and leave gaps in the id space, a missing label would leave its
# documents without a new id. Labels absent from the ordering are appended.
k_means_visit_order = list(
    dict.fromkeys(np.asarray(k_means_cluster_order).tolist())
)
k_means_visited = set(k_means_visit_order)
k_means_visit_order.extend(
    lbl for lbl in np.unique(k_means_labels).tolist()
    if lbl not in k_means_visited
)

for label in k_means_visit_order:
    # Positional row indices of the documents assigned to this cluster.
    indices = np.flatnonzero(k_means_labels == label)
    doc_ids = df["doc_id"].iloc[indices]
    dim = doc_ids.shape[0]
    # Old doc id -> next free block of `dim` consecutive new ids.
    tmp_vals = dict(zip(doc_ids, range(starting_val, starting_val + dim)))
    docid_remaping.update(tmp_vals)
    starting_val += dim

In [None]:
# Persist the K-Means doc-id remapping for the index compression step.
with open(input_path + "k_means_remapping.pkl", "wb") as remap_file:
    pickle.dump(docid_remaping, remap_file)

### DBSCAN method

In [None]:
# Random search over DBSCAN's neighbourhood size and radius, scored by the
# project's silhouette-based helper.
n_iter = 10
# NOTE(review): cosine distance never exceeds 2, so `eps` values beyond that
# make every pair of points neighbours — confirm the upper bound of this grid.
params_dbscan = {
    "min_samples": list(range(2, 36)),
    "eps": list(np.arange(0.1, 5.1, 0.1)),
}
dbscan_obj = DBSCAN(metric="cosine")
best_dbscan = random_search_silhouette(
    dbscan_obj, sparse_docs, params_dbscan, n_iter
)

In [None]:
# Use DBSCAN's core samples as the representative elements and solve a TSP
# over their pairwise cosine distances. The result is used as a sequence of
# row indices into `core_points_distances`.
core_indices = best_dbscan.core_sample_indices_
core_points = sparse_docs[core_indices]
core_points_distances = cosine_distances(core_points)
dbscan_tsp = TSP_solver(core_points_distances)

# Labels of the core samples in tour order. A cluster normally has many core
# samples, so this sequence repeats each label many times.
dbscan_tour_labels = best_dbscan.labels_[core_indices[dbscan_tsp]]

# Collapse the tour to the first visit of each cluster, preserving the
# visiting order, so the traversal lists every cluster exactly once.
_, dbscan_first_pos = np.unique(dbscan_tour_labels, return_index=True)
dbscan_cluster_order = dbscan_tour_labels[np.sort(dbscan_first_pos)]

In [None]:
# Assign new consecutive doc ids cluster by cluster, following the cluster
# traversal order, so that documents of the same cluster get adjacent ids.
starting_val = 0
docid_remaping = {}
dbscan_labels = best_dbscan.labels_

# Visit every label exactly once: a repeated label would re-number its
# documents and leave gaps in the id space. Labels absent from the ordering
# are appended at the end; this covers the noise label (-1), which never
# appears among core samples, so noise documents receive the trailing ids
# instead of being left out of the remapping.
dbscan_visit_order = list(
    dict.fromkeys(np.asarray(dbscan_cluster_order).tolist())
)
dbscan_visited = set(dbscan_visit_order)
dbscan_visit_order.extend(
    lbl for lbl in np.unique(dbscan_labels).tolist()
    if lbl not in dbscan_visited
)

for label in dbscan_visit_order:
    # Positional row indices of the documents carrying this label.
    indices = np.flatnonzero(dbscan_labels == label)
    doc_ids = df["doc_id"].iloc[indices]
    dim = doc_ids.shape[0]
    # Old doc id -> next free block of `dim` consecutive new ids.
    tmp_vals = dict(zip(doc_ids, range(starting_val, starting_val + dim)))
    docid_remaping.update(tmp_vals)
    starting_val += dim

In [None]:
# Persist the DBSCAN doc-id remapping for the index compression step.
with open(input_path + "dbscan_remapping.pkl", "wb") as remap_file:
    pickle.dump(docid_remaping, remap_file)