# Information Retrieval and Web Search
<p>
Course Project - Clustering documents to compress inverted index<br>
Giovanni Costa - 880892
</p>

In [None]:
import pandas as pd
import numpy as np
import pickle, multiprocessing
from scipy.sparse import save_npz, load_npz
from sklearn.cluster import DBSCAN, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.datasets import fetch_rcv1
from utils import parse_data_files, get_tfidf_repr, random_search
from Indexer import Indexer, EXIT_NUMBER_DOCS

# Relative directories used for reading inputs and writing results.
input_path = "input/"
output_path = "output/"

# Number of CPU cores reported by the OS; used later to size the K-Means batch.
CORE_NUM = multiprocessing.cpu_count()

# Seed NumPy's global random generator so that runs are repeatable.
np.random.seed(42)

## Data parsing and TF-IDF representation

In [None]:
# Load the RCV1 corpus through scikit-learn. With return_X_y=True the call
# yields a (data, target) pair; only the data matrix is kept, the labels are
# bound to the throwaway name `_`.
sparse_docs, _=fetch_rcv1(return_X_y=True)
# Re-cast the matrix values to 32-bit floats (presumably to reduce the memory
# footprint -- TODO confirm this is the only reason).
sparse_docs=sparse_docs.astype("float32")
# Show the matrix dimensions as a sanity check.
print(sparse_docs.shape)

In [None]:
# Build the baseline ("standard") inverted index from the document matrix using
# the project-local Indexer helper.
# NOTE(review): judging by its use in the following cells the result is a
# dict-like object whose values are indexable; the exact layout is defined in
# Indexer.get_dict_from_csr_matrix -- confirm there.
indexer=Indexer()
std_inverted_index=indexer.get_dict_from_csr_matrix(sparse_docs)

In [None]:
# Number of entries in the baseline inverted index (displayed as cell output).
len(std_inverted_index)

In [None]:
# Count the distinct column indices of the matrix that hold at least one
# non-zero value: nonzero()[1] gives the column index of every non-zero entry,
# np.unique removes duplicates, and .shape reports how many remain.
# Presumably meant to be compared with the index size shown above.
np.unique(sparse_docs.nonzero()[1]).shape

In [None]:
# Tally the index entries whose first element is equal to 0, then report it.
count = sum(1 for entry in std_inverted_index.values() if entry[0] == 0)
print(count)

### MiniBatch K-Means Method

In [None]:
# Value handed to random_search in the next cell (presumably the number of
# sampled configurations -- see utils.random_search).
n_iter = 15

# Candidate cluster counts: every integer from 2 to 100 inclusive.
params_k_means = {"n_clusters": list(range(2, 101))}

# scikit-learn recommends a batch_size larger than 256 * number of cores for
# faster computation, so that all cores can be used in parallel.
k_means_obj = MiniBatchKMeans(
    batch_size=256 * CORE_NUM,
    n_init="auto",
)

In [None]:
# Run the project-local random_search helper with the MiniBatch K-Means
# estimator, the document matrix, the baseline index, the search space and
# n_iter. It returns two values, kept here as the best model and its
# associated remapping (naming taken from the targets; see utils.random_search
# for the exact contract).
best_k_means, best_k_means_remapping=random_search(k_means_obj, sparse_docs, std_inverted_index, params_k_means, n_iter)

### DBSCAN Method

In [None]:
# Value handed to random_search in the next cell (presumably the number of
# sampled configurations -- see utils.random_search).
n_iter = 1

# Search space: min_samples takes every integer from 2 to 20 inclusive, eps is
# a grid starting at 0.05 with a step of 0.05 and an exclusive bound of 3.05.
# NOTE(review): cosine distance cannot exceed 2, so the larger eps values make
# every point a neighbour of every other one -- confirm the upper bound is
# intended.
params_dbscan = {
    "min_samples": list(range(2, 21)),
    "eps": list(np.arange(0.05, 3.05, 0.05)),
}

# DBSCAN estimator measuring distances between documents with the cosine metric.
dbscan_obj = DBSCAN(metric="cosine")

In [None]:
# Run the project-local random_search helper with the DBSCAN estimator, the
# document matrix, the baseline index, the search space and n_iter. It returns
# two values, kept here as the best model and its associated remapping (naming
# taken from the targets; see utils.random_search for the exact contract).
best_dbscan, best_dbscan_remapping=random_search(dbscan_obj, sparse_docs, std_inverted_index, params_dbscan, n_iter)