# Information Retrieval and Web Search
<p>
Course Project - Clustering documents to compress inverted index<br>
Giovanni Costa - 880892
</p>

In [None]:
import pandas as pd
import numpy as np
import pickle, multiprocessing
from scipy.sparse import save_npz, load_npz
from sklearn.cluster import DBSCAN, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_distances
from utils import parse_data_files, get_tfidf_repr, TSP_solver, random_search_silhouette
from Indexer import Indexer, EXIT_NUMBER_DOCS

# Directories used by the cells below. Paths are built with plain string
# concatenation, so the trailing slash is required.
input_path = "input/"
output_path = "output/"

# Number of CPUs reported by the OS; used to size the MiniBatchKMeans batch.
CORE_NUM = multiprocessing.cpu_count()

# Seed NumPy's global RNG so np.random-based choices are repeatable.
np.random.seed(42)

## Data parsing and TF-IDF representation

In [None]:
# Load the document collection through the project helper and print a summary.
df = parse_data_files()
print("Dataframe info:")
df.info()  # writes its report to stdout

In [None]:
# Build the TF-IDF representation of the first EXIT_NUMBER_DOCS rows only.
docs_subset = df.iloc[:EXIT_NUMBER_DOCS]
sparse_docs, tf_idf_vocab = get_tfidf_repr(docs_subset)

print("TF-IDF info:")
print("Shape: ", sparse_docs.shape)
# Only the stored values (.data) are measured here; the index arrays of the
# sparse format are not included in this figure.
size_mb = sparse_docs.data.nbytes / (1024**2)
print("Size in MB: {:.3f} ".format(size_mb))

# Persist the matrix, the vocabulary and the (whole) dataframe so that the
# clustering cells can be run without recomputing TF-IDF.
save_npz(input_path + "sparse_tf-idf.npz", sparse_docs)
with open(input_path + "tf-idf_vocab.pkl", "wb") as vocab_file:
    pickle.dump(tf_idf_vocab, vocab_file)
df.to_parquet(input_path + "df.parquet")

## Clustering and hyperparameter tuning

In [None]:
# Reload the TF-IDF matrix, vocabulary and dataframe from the input directory.
sparse_docs_1 = load_npz(input_path + "sparse_tf-idf.npz")

tf_idf_vocab = None  # remains None if opening/unpickling below raises
with open(input_path + "tf-idf_vocab.pkl", "rb") as vocab_file:
    # NOTE: pickle must only be used on trusted files; this one is written by
    # the TF-IDF cell of this notebook.
    tf_idf_vocab = pickle.load(vocab_file)

df = pd.read_parquet(input_path + "df.parquet")

In [None]:
# Work on the whole reloaded matrix. To experiment on a prefix of the
# collection, slice here instead, e.g. sparse_docs_1[:EXIT_NUMBER_DOCS, :].
sparse_docs = test = sparse_docs_1

In [None]:
# Dimensionality reduction (LSA) of the TF-IDF matrix before clustering.
trunc_svd = TruncatedSVD(
    n_components=100,  # For LSA, a value of 100 is recommended.
    random_state=42,
)
# NOTE: despite its name, the result of fit_transform is a dense array.
sparse_docs_approx = trunc_svd.fit_transform(sparse_docs)
reduced_shape = sparse_docs_approx.shape
print("Current shape: ", reduced_shape)

In [None]:
# From here on, `sparse_docs` refers to the SVD-reduced representation.
sparse_docs = sparse_docs_approx

### MiniBatch K-Means Method

In [None]:
# Search over the number of clusters with the project helper
# (presumably scored by silhouette, per the helper's name; n_iter is passed
# through to it unchanged).
n_iter = 15
params_k_means = {"n_clusters": list(range(2, 101))}  # candidate k: 2..100

# For faster computations, you can set the batch_size greater than
# 256 * number of cores to enable parallelism on all cores.
k_means_obj = MiniBatchKMeans(n_init="auto", batch_size=256 * CORE_NUM)

best_k_means = random_search_silhouette(
    k_means_obj,
    sparse_docs,
    params_k_means,
    n_iter,
)

In [None]:
# Pairwise cosine distances between centroids
# (cluster_centers_[i] is the centroid of cluster i).
centroid_distances = cosine_distances(best_k_means.cluster_centers_)

# The solver's result is used directly as the sequence of cluster labels,
# i.e. the cluster traversal ordering used when remapping document ids.
k_means_cluster_order = k_means_tsp = TSP_solver(centroid_distances)

In [None]:
#s = pd.Series(k_means_cluster_order)
#s[s.duplicated()]

In [None]:
# Assign new consecutive doc ids cluster by cluster (in traversal order);
# inside a cluster, documents closest to the centroid come first.
next_docid = 0
k_means_docid_remapping = {}
for label in k_means_cluster_order:
    member_ids = np.flatnonzero(best_k_means.labels_ == label)
    cluster_size = member_ids.shape[0]
    if cluster_size == 0:  # some clusters might be empty
        print(f"Cluster {label} is empty")
        continue

    centroid = best_k_means.cluster_centers_[label].reshape(1, -1)
    to_centroid = cosine_distances(sparse_docs[member_ids], centroid).ravel()
    closest_first = member_ids[np.argsort(to_centroid)]

    # old doc id (row index) -> new doc id
    new_ids = range(next_docid, next_docid + cluster_size)
    k_means_docid_remapping.update(zip(closest_first, new_ids))
    next_docid += cluster_size

In [None]:
# Persist the K-Means doc-id remapping (note: stored under input_path).
k_means_remapping_path = input_path + "k_means_remapping.pkl"
with open(k_means_remapping_path, "wb") as remap_file:
    pickle.dump(k_means_docid_remapping, remap_file)

### DBSCAN Method

In [None]:
# Search over DBSCAN's min_samples / eps with the project helper
# (presumably scored by silhouette, per the helper's name).
n_iter = 15
params_dbscan = {
    "min_samples": list(range(2, 21)),  # 2..20
    # NOTE(review): cosine distance never exceeds 2, so eps candidates above 2
    # look redundant; confirm whether the upper bound of 3 is intended.
    "eps": list(np.arange(0.05, 3.05, 0.05)),
}
dbscan_obj = DBSCAN(metric="cosine")
best_dbscan = random_search_silhouette(
    dbscan_obj,
    sparse_docs,
    params_dbscan,
    n_iter,
)

In [None]:
# Pick one random core sample per cluster to act as that cluster's
# representative point (DBSCAN has no centroids).
core_ids = best_dbscan.core_sample_indices_
core_labels = best_dbscan.labels_[core_ids]

# One np.random.choice draw per label, in sorted label order.
core_index_list = [
    np.random.choice(core_ids[core_labels == label])
    for label in np.unique(core_labels)
]

# Row i is the representative of the i-th label returned by np.unique above.
core_points = sparse_docs[core_index_list]

In [None]:
# Pairwise cosine distances between the cluster representatives.
core_points_distances = cosine_distances(core_points)
dbscan_tsp = TSP_solver(core_points_distances)

# The solver's result gives the cluster traversal ordering; the outliers
# label (-1) is appended so noise documents are processed last.
# assumes TSP_solver returns a Python list, so that `+` concatenates - TODO confirm
outliers_label = [-1]
dbscan_cluster_order = dbscan_tsp + outliers_label

In [None]:
# Assign new consecutive doc ids cluster by cluster (in traversal order);
# inside a cluster, documents closest to the representative come first.
next_docid = 0
dbscan_docid_remapping = {}
for label in dbscan_cluster_order:
    member_ids = np.flatnonzero(best_dbscan.labels_ == label)  # -1 is the noise
    cluster_size = member_ids.shape[0]
    if cluster_size == 0:  # some clusters might be empty
        print(f"Cluster {label} is empty")
        continue

    # NOTE(review): for the noise label (-1) this indexes core_points[-1],
    # i.e. the last row of core_points, via negative indexing. Confirm that
    # ordering outliers by distance to that representative is intended.
    representative = core_points[label].reshape(1, -1)
    to_representative = cosine_distances(sparse_docs[member_ids], representative).ravel()
    closest_first = member_ids[np.argsort(to_representative)]

    # old doc id (row index) -> new doc id
    new_ids = range(next_docid, next_docid + cluster_size)
    dbscan_docid_remapping.update(zip(closest_first, new_ids))
    next_docid += cluster_size
        

In [None]:
# Persist the DBSCAN doc-id remapping (note: stored under input_path).
dbscan_remapping_path = input_path + "dbscan_remapping.pkl"
with open(dbscan_remapping_path, "wb") as remap_file:
    pickle.dump(dbscan_docid_remapping, remap_file)

## Testing

In [None]:
# Build the baseline inverted index, then derive one remapped index per
# clustering method using the doc-id remappings computed above.
indexer=Indexer()
inverted_index_standard=indexer.get_dict()
# NOTE(review): remap_index is called once through the class and once through
# the instance; presumably it is a static/class method - confirm in Indexer.
k_means_inverted_index=Indexer.remap_index(inverted_index_standard, k_means_docid_remapping)
dbscan_inverted_index=indexer.remap_index(inverted_index_standard, dbscan_docid_remapping)

In [None]:
#[np.diff(v[1]) for v in inverted_index_standard.values()]

In [None]:
#[np.diff(v[1]) for v in k_means_inverted_index.values()]

In [None]:
# Encoded size of each index variant, as reported by Indexer
# (presumably variable-byte encoding, per the method name).
dim_inverted_index_standard = Indexer.get_total_VB_enc_size(inverted_index_standard)
dim_k_means_inverted_index = Indexer.get_total_VB_enc_size(k_means_inverted_index)
dim_dbscan_inverted_index = indexer.get_total_VB_enc_size(dbscan_inverted_index)
print(f"Standard inverted index dimension: {dim_inverted_index_standard} Bytes")

# Percentage reduction is measured against the standard index size:
#     (standard - remapped) / standard
# Dividing by (standard + remapped) would understate the saving.
for method_name, remapped_size in (
    ("K Means", dim_k_means_inverted_index),
    ("DBSCAN", dim_dbscan_inverted_index),
):
    saved_fraction = (dim_inverted_index_standard - remapped_size) / dim_inverted_index_standard
    print(f"{method_name} method inverted index dimension: {remapped_size} Bytes ~", end=" ")
    # Format instead of round(...)*100 to avoid float artifacts in the output.
    print(f"{saved_fraction * 100:.1f} % reduction")