# Information Retrieval and Web Search
<p>
Course Project - Clustering documents to compress inverted index<br>
Giovanni Costa - 880892
</p>

In [None]:
import pandas as pd
import numpy as np
import pickle, multiprocessing
from scipy.sparse import save_npz, load_npz
from sklearn.cluster import DBSCAN, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from utils import parse_data_files, get_tfidf_repr, random_search
from Indexer import Indexer, EXIT_NUMBER_DOCS

# Directory prefixes for cached inputs and produced outputs. The trailing
# slash matters: file paths are built by plain string concatenation.
input_path = "input/"
output_path = "output/"

# CPU count reported by the OS; used to size the MiniBatchKMeans batches.
CORE_NUM = multiprocessing.cpu_count()

# Seed NumPy's global random generator.
np.random.seed(42)

## Data parsing and TF-IDF representation

In [None]:
# Parse the raw data files with the project helper and print a summary of
# the resulting frame (columns, dtypes, memory usage).
df = parse_data_files()

print("Dataframe info:")
df.info()

In [None]:
# Build the TF-IDF representation from the first EXIT_NUMBER_DOCS rows only.
docs_subset = df.iloc[:EXIT_NUMBER_DOCS]
sparse_docs, tf_idf_vocab = get_tfidf_repr(docs_subset)

print("TF-IDF info:")
print("Shape: ", sparse_docs.shape)
# Only the buffer of stored values (`.data`) is measured here.
size_mb = sparse_docs.data.nbytes / (1024**2)
print("Size in MB: {:.3f} ".format(size_mb))

# Cache the matrix, the vocabulary and the frame under `input_path` so the
# later cells can reload them without recomputing.
save_npz(input_path + "sparse_tf-idf.npz", sparse_docs)
with open(input_path + "tf-idf_vocab.pkl", "wb") as vocab_file:
    pickle.dump(tf_idf_vocab, vocab_file)
df.to_parquet(input_path + "df.parquet")

## Clustering and hyperparameter tuning

In [None]:
# Reload the artifacts cached by the TF-IDF cell.
sparse_docs_1 = load_npz(input_path + "sparse_tf-idf.npz")

# Reset first so a stale vocabulary is never kept if loading fails.
tf_idf_vocab = None
# NOTE: pickle should only be used on trusted files; this one is written by
# the TF-IDF cell of this notebook.
with open(input_path + "tf-idf_vocab.pkl", "rb") as vocab_file:
    tf_idf_vocab = pickle.load(vocab_file)

df = pd.read_parquet(input_path + "df.parquet")

In [None]:
# Use the whole loaded matrix as the working set; `test` and `sparse_docs`
# are both aliases of `sparse_docs_1` (no copy is made).
# A row-limiting slice ([:EXIT_NUMBER_DOCS, :]) was previously tried here and
# is currently disabled.
sparse_docs = test = sparse_docs_1

In [None]:
# Hand the document matrix to a fresh Indexer.
# Presumably this builds the baseline inverted index - confirm in Indexer.
indexer = Indexer()
indexer.get_dict_from_csr_matrix(sparse_docs)

In [None]:
# Dimensionality reduction via truncated SVD (LSA).
# For LSA, a value of 100 components is recommended.
trunc_svd = TruncatedSVD(
    n_components=100,
    random_state=42,
)
sparse_docs_approx = trunc_svd.fit_transform(sparse_docs)

print("Current shape: ", sparse_docs_approx.shape)

In [None]:
# From here on `sparse_docs` refers to the SVD-reduced representation, which
# is what the clustering cells below receive.
# NOTE: despite the name, this is the output of `fit_transform`, not the
# original sparse TF-IDF matrix (still available as `sparse_docs_1`).
sparse_docs = sparse_docs_approx

In [None]:
# Fetch the reference (non-clustered) inverted index used as the baseline
# argument of `random_search`.
# NOTE(review): this calls `get_dict()` on a brand-new Indexer rather than on
# the `indexer` instance that received the document matrix earlier. That is
# only correct if Indexer keeps its dictionary outside the instance - confirm.
indexed = Indexer()
std_inverted_index = indexed.get_dict()

### MiniBatch K-Means Method

In [None]:
# Value handed to `random_search` as its iteration argument.
n_iter = 15

# Candidate cluster counts: every integer from 2 to 100 inclusive.
params_k_means = {"n_clusters": list(range(2, 101))}

# For faster computations, you can set the batch_size greater than
# 256 * number of cores to enable parallelism on all cores.
k_means_obj = MiniBatchKMeans(n_init="auto", batch_size=256 * CORE_NUM)

In [None]:
# Run the project's random search for MiniBatchKMeans over the reduced
# documents, passing the baseline inverted index and the candidate grid.
best_k_means = random_search(
    k_means_obj,
    sparse_docs,
    std_inverted_index,
    params_k_means,
    n_iter,
)

### DBSCAN Method

In [None]:
# Value handed to `random_search` as its iteration argument.
n_iter = 15

# Candidate grid: min_samples in 2..20, eps from 0.05 upwards in 0.05 steps.
# NOTE(review): cosine distance never exceeds 2, so eps values of 2 or more
# make every point a neighbour of every other - confirm the upper bound.
params_dbscan = {
    "min_samples": list(range(2, 21)),
    "eps": list(np.arange(0.05, 3.05, 0.05)),
}

dbscan_obj = DBSCAN(metric="cosine")

In [None]:
# Show the (row, column) indices of the non-zero entries in column 10.
# `sparse_docs` may be a SciPy sparse matrix or, once the SVD cell has rebound
# it, the dense array returned by `fit_transform`, which has no `.tocsc()`.
# Convert only when the object supports it, then index the column in a way
# that works for both and keeps the result two-dimensional.
col_source = sparse_docs.tocsc() if hasattr(sparse_docs, "tocsc") else sparse_docs
col_source[:, [10]].nonzero()

In [None]:
# Compare the sorted set of column indices holding at least one non-zero
# entry with 0..n_cols-1 and count the matching positions. The result equals
# the number of columns only when every column has a non-zero entry.
nonzero_cols = np.unique(sparse_docs.nonzero()[1])
all_cols = np.arange(sparse_docs.shape[1])
(nonzero_cols == all_cols).sum()

In [None]:
# Run the project's random search for DBSCAN over the reduced documents,
# passing the baseline inverted index and the candidate grid.
best_dbscan = random_search(
    dbscan_obj,
    sparse_docs,
    std_inverted_index,
    params_dbscan,
    n_iter,
)