# Information Retrieval and Web Search
<p>
Course Project - Clustering documents to compress inverted index<br>
Giovanni Costa - 880892
</p>

In [1]:
import pandas as pd
import numpy as np
import pickle
import multiprocessing
from scipy.sparse import save_npz, load_npz, vstack
from sklearn.cluster import DBSCAN, MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import RandomizedSearchCV
from utils import parse_data_files, get_tfidf_repr, TSP_solver, stream_cluster, sort_csr_by_nonzero

# Folder holding the cached TF-IDF artefacts; file names are appended to it by
# plain string concatenation, hence the trailing slash.
input_path = "input/"
# Presumably the destination for generated results (not used in the cells shown here).
output_path = "output/"
# CPU count reported by multiprocessing; scales the k-means batch size below.
CORE_NUM = multiprocessing.cpu_count()

## Data parsing and TF-IDF representation

In [None]:
# Load the parsed data through the project helper, then print the frame's
# column/dtype/memory summary.
df = parse_data_files()
print("Dataframe info:")
df.info()

In [None]:
# Build the TF-IDF representation of the parsed documents, report its
# dimensions, and cache both the matrix and the vocabulary under input_path.
sparse_docs, tf_idf_vocab = get_tfidf_repr(df)

print("TF-IDF info:")
print("Shape: ", sparse_docs.shape)
# Only the buffer of stored values (`.data`) is measured, converted bytes -> MiB.
data_mb = sparse_docs.data.nbytes / (1024 ** 2)
print("Size in MB: {:.3f} ".format(data_mb))

save_npz(input_path + "sparse_tf-idf.npz", sparse_docs)
with open(input_path + "tf-idf_vocab.pkl", "wb") as vocab_file:
    pickle.dump(tf_idf_vocab, vocab_file)

## Clustering and hyperparameters tuning

In [2]:
# Reload the TF-IDF matrix and its vocabulary from the files that the TF-IDF
# section saves under input_path.
sparse_docs_1 = load_npz(input_path + 'sparse_tf-idf.npz')
tf_idf_vocab = None
# NOTE: pickle.load must only be fed trusted files; this one is the file the
# TF-IDF cell writes itself.
with open(input_path + 'tf-idf_vocab.pkl', 'rb') as vocab_file:
    tf_idf_vocab = pickle.load(vocab_file)

In [3]:
# Keep only the first 10 rows of the loaded matrix. The clustering cells below
# all read `sparse_docs`, so they run on this small sample.
sparse_docs = test = sparse_docs_1[:10, :]

In [None]:
""" C=stream_cluster(sorted_collection,  0.1)
elems=np.array([])
labels=[]
for label, cluster in enumerate(C):
    tmp_len=len(cluster)
    elems=np.concatenate([elems,cluster], axis=0)
    labels+=[label]*tmp_len
elems=vstack([e for e in elems]) """

In [8]:
# Two-cluster mini-batch k-means; the batch size is 256 samples per CPU core
# (CORE_NUM holds the core count).
k_means = MiniBatchKMeans(n_clusters=2, batch_size=256 * CORE_NUM)
k_means.fit(sparse_docs)
# Cosine distance from each centroid (rows) to each document (columns).
dist = cosine_distances(k_means.cluster_centers_, sparse_docs)
dist



array([[1.        , 0.9914131 , 1.        , 1.        , 1.        ,
        0.        , 1.        , 1.        , 0.        , 0.        ],
       [0.5441225 , 0.7531114 , 0.55582774, 0.40665048, 0.5812186 ,
        0.9983736 , 0.65728694, 0.5209922 , 0.9983736 , 0.9983736 ]],
      dtype=float32)

In [9]:
# Rows are the k-means centroids, columns are the documents they were compared with.
dist.shape

(2, 10)

In [12]:
# For every centroid (row of `dist`), the column index of its closest document.
dist.argmin(axis=1)

array([5, 3], dtype=int64)

### Stream Cluster Method

In [None]:
# Cluster the sorted collection once and show how many clusters come back.
# NOTE(review): in the visible notebook `sorted_collection` and `radius` are
# first assigned by the radius-search cell that follows, so this cell only
# works after that one has been executed.
C=stream_cluster(sorted_collection,  radius)
len(C)

In [None]:
# Sweep the stream-clustering radius over 0.1 .. 0.9 and keep the clustering
# with the highest cosine silhouette score.
sorted_collection = sort_csr_by_nonzero(sparse_docs)
results = []  # per-radius scores, kept for debugging (NaN marks a skipped radius)
best_C = None
best_radius = None
max_res = -2  # silhouette_score lies in [-1, 1], so any real score beats this
step = 0.1
for radius in np.arange(0.1, 1, step):
    C = stream_cluster(sorted_collection, radius)

    # Flatten the clusters into one list of rows plus a parallel label list.
    # Assumes each cluster is a sequence of single-row sparse matrices, as the
    # element-wise vstack already required. Growing plain lists avoids
    # re-copying the whole accumulator for every cluster.
    rows = []
    labels = []
    for label, cluster in enumerate(C):
        rows.extend(cluster)
        labels += [label] * len(cluster)

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1.
    # Such a radius cannot be ranked, so record NaN and continue the sweep
    # instead of aborting it.
    n_labels = len(set(labels))
    if n_labels < 2 or n_labels >= len(labels):
        results.append(float("nan"))
        continue

    elems = vstack(rows)
    res = silhouette_score(elems, labels, metric='cosine')
    if res > max_res:
        best_C = C
        max_res = res
        best_radius = radius
    results.append(res)

# If every radius was skipped these print the sentinel -2 and None.
print("Max silhouette score: ", max_res)
print("Best radius parameter: ", best_radius)

### DBSCAN method

In [None]:
def _dbscan_silhouette(estimator, X, y=None):
    """Score a DBSCAN candidate by the cosine silhouette of its labelling of X.

    The search calls scorers as ``scorer(estimator, X, y)``, so
    ``silhouette_score`` (which takes ``X, labels``) cannot be passed directly.
    DBSCAN has no ``predict``, so the candidate is re-fitted on the fold being
    scored via ``fit_predict``.

    Returns -1.0, the worst silhouette value, when the labelling has fewer than
    two distinct labels or one label per sample, because ``silhouette_score``
    is undefined there. The noise label -1 counts as a label of its own.
    """
    labels = estimator.fit_predict(X)
    n_labels = np.unique(labels).size
    if n_labels < 2 or n_labels >= X.shape[0]:
        return -1.0
    return silhouette_score(X, labels, metric="cosine")


# Randomised search over DBSCAN's neighbourhood size and density threshold.
n_iter = 5
# NOTE(review): cosine distance never exceeds 2, so eps values above that make
# every point a neighbour of every other; consider trimming the eps grid.
params_dbscan = {
    "min_samples": list(range(2, 35)),
    "eps": list(np.arange(0.1, 5.1, 0.1)),
}
dbscan = DBSCAN(metric="cosine")
searcher_dbscan = RandomizedSearchCV(
    dbscan,
    scoring=_dbscan_silhouette,
    n_iter=n_iter,  # use the variable instead of repeating the literal
    param_distributions=params_dbscan,
    cv=2,
    n_jobs=-1,
)
searcher_dbscan.fit(sparse_docs)
print("Best params: ", searcher_dbscan.best_params_)
print("Best silhouette_score: ", searcher_dbscan.best_score_)

In [None]:
# Take the core samples of the best DBSCAN model found by the search, compute
# their pairwise cosine distances, and pass that matrix to the TSP solver.
best_dbscan = searcher_dbscan.best_estimator_
core_indices = best_dbscan.core_sample_indices_
core_points = sparse_docs[core_indices]
core_points_distances = cosine_distances(core_points)
dbscan_tsp = TSP_solver(core_points_distances)

In [None]:
# Read the labels of the core samples (the representative elements) in the
# order given by the TSP solution to obtain the cluster traversal order.
# `labels_` lives on the fitted DBSCAN estimator; `core_points` is only the
# matrix of core-sample rows and has no such attribute.
# Assumes `dbscan_tsp` holds positions into `core_indices` - TODO confirm
# against TSP_solver.
dbscan_cluster_order = searcher_dbscan.best_estimator_.labels_[core_indices[dbscan_tsp]]

In [None]:
#for num in k_medoids_cluster_order:
# Positions of the samples whose label equals 8.
# NOTE(review): `k_medoids` is not assigned in any visible cell - confirm where
# it comes from before running this.
np.nonzero(k_medoids.labels_==8) #-1 is the noise

In [None]:
#df["terms"].str.split(" ").apply(set)

In [None]:
""" print("DBSCAN Clustering")
params_dbscan={"min_samples": [i for i in range(2, 35)],
        "eps": [i for i in np.arange(0.1, 5, 0.1)]}
dbscan=DBSCAN(metric="cosine")
searcher_dbscan=RandomizedSearchCV(dbscan, scoring=silhouette_score, n_iter=n_iter, param_distributions=params_dbscan, cv=3, n_jobs=-1)
searcher_dbscan.fit(sparse_doc)
print("Best params: ", searcher_dbscan.best_params_)
print("Best silhouette_score: ", searcher_dbscan.best_score_) """

In [None]:
""" medoid_indices=searcher_kmedoids.best_estimator_.medoid_indices_
medoids=sparse_doc[medoid_indices]
medoid_distances=cosine_distances(medoids) """

In [None]:
""" k_medoids_tsp=TSP_solver(medoid_distances) """

In [None]:
""" DBSCAN_indices=searcher_dbscan.best_estimator_.core_sample_indices_
core_points=sparse_doc[DBSCAN_indices]
core_points_distances=cosine_distances(core_points) """

In [None]:
""" dbscan_tsp=TSP_solver(core_points_distances) """