# Information Retrieval and Web Search
<p>
Course Project - Clustering documents to compress the inverted index<br>
Giovanni Costa - 880892
</p>

In [37]:
import pandas as pd
import numpy as np
import pickle
from scipy.sparse import save_npz, load_npz, vstack
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import RandomizedSearchCV
from utils import parse_file_list, get_tfidf_repr, TSP_solver, stream_cluster, sort_csr_by_nonzero

# Directory prefixes (relative paths, trailing slash included) that are
# concatenated with file names when artefacts are written or read.
input_path = "input/"
output_path = "output/"

## Data parsing and TF-IDF representation

In [None]:
# Build `df` with the project helper and print its summary.
# NOTE(review): presumably a pandas DataFrame (it exposes `.info()`) — confirm in utils.
df = parse_file_list()
print("Dataframe info:")
df.info()

In [None]:
# Compute the TF-IDF representation of `df` with the project helper, report
# its shape and size, then persist both returned objects under `input_path`.
sparse_docs, tf_idf_vocab = get_tfidf_repr(df)

print("TF-IDF info:")
print("Shape: ", sparse_docs.shape)
# NOTE: only the `.data` array (stored values) is measured here; any index
# arrays held by the sparse matrix are not part of the reported figure.
data_megabytes = sparse_docs.data.nbytes / (1024 ** 2)
print("Size in MB: {:.3f} ".format(data_megabytes))

# Saved so that later cells can reload the matrix and vocabulary from disk.
save_npz(input_path + "sparse_tf-idf.npz", sparse_docs)
with open(input_path + "tf-idf_vocab.pkl", "wb") as vocab_file:
    pickle.dump(tf_idf_vocab, vocab_file)

## Clustering and hyperparameter tuning

In [None]:
# Reload the TF-IDF matrix and its vocabulary from the files under `input_path`.
sparse_docs = load_npz(input_path + 'sparse_tf-idf.npz')

# Pre-set to None so the name exists even if reading the pickle fails.
tf_idf_vocab = None
# NOTE: pickle.load can execute arbitrary code; only use it on a file that
# this notebook itself produced.
with open(input_path + 'tf-idf_vocab.pkl', 'rb') as vocab_file:
    tf_idf_vocab = pickle.load(vocab_file)

In [None]:
# Small sample for quick experiments: the first 10 rows, all columns.
test = sparse_docs[:10, :]

In [None]:
# Pass the sample through the project's sorting helper, then cluster the result.
# NOTE(review): the meaning of the 0.5 argument is defined by `stream_cluster`
# in utils (presumably a distance/similarity threshold) — confirm.
matr = sort_csr_by_nonzero(test)
C = stream_cluster(matr, 0.5)

In [40]:
# Flatten the clusters in `C` into one row-stacked sparse matrix (`elems`)
# and a parallel list of integer cluster labels (`labels`), one per row.
# The label of a row is the position of its cluster inside `C`.
#
# Rows are collected in a plain Python list: growing a NumPy array with
# np.concatenate on every iteration copies all previous elements each time
# (quadratic) and silently turns a float64 empty array into an object array.
cluster_rows = []
labels = []
for label, cluster in enumerate(C):
    cluster_rows.extend(cluster)
    labels.extend([label] * len(cluster))

# Single stacking step; raises ValueError if `C` holds no rows at all.
elems = vstack(cluster_rows)

In [42]:
# Silhouette score of the flattened clustering, using cosine distance.
# Kept as the last expression of the cell so the notebook displays the value.
silhouette_score(X=elems, labels=labels, metric='cosine')

0.08855875

In [None]:
# Display the `medoid_indices_` attribute of `k_medoids`.
# NOTE(review): `k_medoids` is not created in any cell visible in this
# notebook export, so this cell raises NameError on a fresh run — confirm
# where the model is supposed to be fitted.
k_medoids.medoid_indices_

In [None]:
# Compute pairwise cosine distances between the medoid rows and hand the
# distance matrix to the project's TSP helper.
# NOTE(review): `k_medoids` is not defined in any visible cell — confirm.
medoid_indices = k_medoids.medoid_indices_
medoids = sparse_docs[medoid_indices]            # TF-IDF rows at the medoid indices
medoid_distances = cosine_distances(medoids)     # square medoid-to-medoid matrix
k_medoids_tsp = TSP_solver(medoid_distances)

In [None]:
# Reorder the medoid indices by the TSP result, then look up the cluster
# label assigned to each of those medoids.
# NOTE(review): assumes `TSP_solver` returns positions into `medoid_indices` — confirm in utils.
ordered_medoid_indices = medoid_indices[k_medoids_tsp]
k_medoids_cluster_order = k_medoids.labels_[ordered_medoid_indices]

In [None]:
# Disabled loop header: the lookup below was apparently meant to run once per
# entry of `k_medoids_cluster_order`; it is currently hard-coded to label 8.
#for num in k_medoids_cluster_order:
# Display the indices of the samples whose assigned label equals 8.
np.nonzero(k_medoids.labels_==8)

In [None]:
#df["terms"].str.split(" ").apply(set)

In [None]:
# Display the helper's result for the 10-row sample; the value is not stored
# (the same call is assigned to `matr` in an earlier cell).
sort_csr_by_nonzero(test)

In [None]:
# Disabled experiment: the DBSCAN random search below is wrapped in a string
# literal, so nothing in it executes (the cell only evaluates the string).
# NOTE(review) before re-enabling:
#   - `sparse_doc` is not defined in the visible cells (the matrix is `sparse_docs`);
#   - `n_iter` is not defined in the visible cells;
#   - `scoring=silhouette_score` passes a metric with signature (X, labels),
#     whereas the search presumably expects a scorer called as
#     (estimator, X, y) — confirm against the scikit-learn documentation.
""" print("DBSCAN Clustering")
params_dbscan={"min_samples": [i for i in range(2, 35)],
        "eps": [i for i in np.arange(0.1, 5, 0.1)]}
dbscan=DBSCAN(metric="cosine")
searcher_dbscan=RandomizedSearchCV(dbscan, scoring=silhouette_score, n_iter=n_iter, param_distributions=params_dbscan, cv=3, n_jobs=-1)
searcher_dbscan.fit(sparse_doc)
print("Best params: ", searcher_dbscan.best_params_)
print("Best silhouette_score: ", searcher_dbscan.best_score_) """

In [None]:
# Disabled (string literal, never executed): medoid distances taken from a
# search object.
# NOTE(review): `searcher_kmedoids` and `sparse_doc` are not defined in any
# visible cell; the loaded matrix is named `sparse_docs`.
""" medoid_indices=searcher_kmedoids.best_estimator_.medoid_indices_
medoids=sparse_doc[medoid_indices]
medoid_distances=cosine_distances(medoids) """

In [None]:
# Disabled (string literal, never executed): TSP over the medoid distances.
""" k_medoids_tsp=TSP_solver(medoid_distances) """

In [None]:
# Disabled (string literal, never executed): cosine distances between the
# core samples of the best DBSCAN estimator.
# NOTE(review): depends on `searcher_dbscan`, which is only created inside
# another disabled cell, and on `sparse_doc`, which is not defined in any
# visible cell (the loaded matrix is `sparse_docs`).
""" DBSCAN_indices=searcher_dbscan.best_estimator_.core_sample_indices_
core_points=sparse_doc[DBSCAN_indices]
core_points_distances=cosine_distances(core_points) """

In [None]:
# Disabled (string literal, never executed): TSP over the core-point distances.
""" dbscan_tsp=TSP_solver(core_points_distances) """