# Learning with Massive Data
<p>
Assignment 3 - Similarity search for document pairs<br>
Giovanni Costa - 880892
</p>

<p>
<b>SEQUENTIAL VERSION</b>
</p>

Contents:
- [Document sparse representation](#doc_repr)
- [Sequential Implementation](#s_impl)
    - [Exact similarity search](#exact_s)
    - [Approximate similarity search](#approx_s)
- [Results](#eval)

In [1]:
import pandas as pd
import numpy as np
from utils import compute_sparse_repr, eval_sol, compute_cosine_similarity
from scipy.sparse import load_npz, save_npz
from matplotlib import pyplot as plt
from sklearn.random_projection import SparseRandomProjection

<a id="doc_repr"></a>
## Document sparse representation

In [2]:
# Path prefix for every artifact written by this notebook, and the two
# JSON-lines corpora that are analysed.
results = "results/"
datasets = [
    "datasets/nfcorpus/corpus.jsonl",
    "datasets/scifact/corpus.jsonl",
]

In [3]:
# Read both JSON-lines corpora (one JSON record per line) into separate frames.
df_docs1, df_docs2 = [pd.read_json(datasets[pos], lines=True) for pos in (0, 1)]

In [4]:
# Column names, non-null counts and dtypes of the first corpus (datasets[0]).
df_docs1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3633 entries, 0 to 3632
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   _id       3633 non-null   object
 1   title     3633 non-null   object
 2   text      3633 non-null   object
 3   metadata  3633 non-null   object
dtypes: object(4)
memory usage: 113.7+ KB


In [5]:
# Column names, non-null counts and dtypes of the second corpus (datasets[1]).
df_docs2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5183 entries, 0 to 5182
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   _id       5183 non-null   int64 
 1   title     5183 non-null   object
 2   text      5183 non-null   object
 3   metadata  5183 non-null   object
dtypes: int64(1), object(3)
memory usage: 162.1+ KB


In [6]:
# Build the representation of the first corpus' texts. Judging by the names,
# the helper returns the document matrix, the vocabulary and the idf values
# (TODO confirm against utils.compute_sparse_repr).
sparse_repr1, vocab1, idf=compute_sparse_repr(df_docs1["text"])
# NOTE(review): the call for the second corpus is disabled, so `sparse_repr2`
# is never defined in this notebook although later cells reference it. The
# disabled line also unpacks two values while the call above unpacks three.
#sparse_repr2, vocab2=compute_sparse_repr(df_docs2["text"])

In [7]:
# Dimensions of the representation (presumably documents x vocabulary terms — confirm).
sparse_repr1.shape

(3633, 18867)

In [8]:
# Vocabulary export is disabled. NOTE(review): it refers to `vocab_1`, whereas
# the variable produced by compute_sparse_repr is named `vocab1`.
#pd.DataFrame(vocab_1, columns=["terms"]).to_parquet("terms_nfcorpus.parquet")
# Persist the document ids and the matrix so they can be reloaded from disk.
df_docs1[["_id"]].to_parquet(results + "ids_test.parquet")
save_npz(results + "test.npz", sparse_repr1)


<a id="s_impl"></a>
## Sequential Implementation
<a id="exact_s"></a>
### Exact similarity search

In [9]:
# Reload the persisted matrix and document ids from the results directory.
sparse_repr1 = load_npz(results + "test.npz")
ids = pd.read_parquet(results + "ids_test.parquet")["_id"]
print(sparse_repr1.shape)
# Share of matrix cells that hold a non-zero value.
print(
    "Density ratio:",
    sparse_repr1.count_nonzero() / (sparse_repr1.shape[0] * sparse_repr1.shape[1]),
)

(3633, 18867)
Density ratio: 0.006536315875404127


In [10]:
# Document ids reloaded from the parquet file; displayed as a sanity check.
ids

0        MED-10
1        MED-14
2       MED-118
3       MED-301
4       MED-306
         ...   
3628    MED-917
3629    MED-941
3630    MED-942
3631    MED-952
3632    MED-961
Name: _id, Length: 3633, dtype: object

In [11]:
# Exact search with 0.8 passed as the third argument (presumably the minimum
# cosine similarity a pair must reach — confirm in utils.compute_cosine_similarity).
# Later cells read the first two fields of every element of `pairs` as document ids.
pairs, num_pairs=compute_cosine_similarity(sparse_repr1, ids, 0.8)

In [12]:
# Sorted, de-duplicated array of every document id that takes part in a pair.
list_pair = np.unique(np.array([(pair[0], pair[1]) for pair in pairs]).ravel())
list_pair

array(['MED-1106', 'MED-118', 'MED-1229', 'MED-1874', 'MED-1935',
       'MED-2100', 'MED-2123', 'MED-2155', 'MED-2170', 'MED-2189',
       'MED-2205', 'MED-2432', 'MED-2526', 'MED-2651', 'MED-2769',
       'MED-2781', 'MED-2787', 'MED-2904', 'MED-2905', 'MED-2907',
       'MED-2910', 'MED-2921', 'MED-2944', 'MED-2951', 'MED-2977',
       'MED-3019', 'MED-3026', 'MED-3029', 'MED-3032', 'MED-306',
       'MED-3193', 'MED-3207', 'MED-3220', 'MED-3235', 'MED-3292',
       'MED-3309', 'MED-334', 'MED-335', 'MED-3485', 'MED-3787',
       'MED-3811', 'MED-3815', 'MED-3830', 'MED-3833', 'MED-3834',
       'MED-3841', 'MED-3874', 'MED-3885', 'MED-3886', 'MED-3897',
       'MED-3907', 'MED-398', 'MED-4206', 'MED-4247', 'MED-4255',
       'MED-4393', 'MED-4416', 'MED-4517', 'MED-4598', 'MED-4599',
       'MED-4603', 'MED-4613', 'MED-4615', 'MED-4616', 'MED-4617',
       'MED-4620', 'MED-4633', 'MED-4639', 'MED-4641', 'MED-4673',
       'MED-4674', 'MED-4687', 'MED-4689', 'MED-4820', 'MED-4885',


In [34]:
# Reciprocal of the `idf` values returned by compute_sparse_repr.
# NOTE(review): this cell was executed out of order (In [34]), and the name
# `df` is easy to mistake for a DataFrame — it holds 1/idf.
df=1/idf

In [13]:
# 0-based positions, within `ids`, of the two documents under investigation.
tmp_idx = [pos for pos, doc_id in enumerate(ids.values) if doc_id in ['MED-1874', 'MED-2432']]

In [14]:
# Positions found for 'MED-1874' and 'MED-2432'.
tmp_idx

[930, 1376]

In [15]:
# Sanity check: two ids were searched for, so two positions are expected.
len(tmp_idx)

2

In [39]:
# Save the 1/idf values as a single-column ("df_t") parquet file.
# The shared `results` prefix replaces a hard-coded copy of the directory, so
# every artifact follows the same output location if that setting changes.
pd.DataFrame(df, columns=["df_t"]).to_parquet(results+"test_df.parquet")

In [16]:
# Hard-coded set of document ids that the exact result is compared against
# (presumably the ids produced by the implementation being debugged — confirm).
# Written as a plain set literal: evaluating a string constant with eval() added
# nothing and hid the data from linters and syntax highlighting.
bug_set = {
    'MED-3485', 'MED-3841', 'MED-4616', 'MED-2170',
    'MED-2907', 'MED-2781', 'MED-334', 'MED-2910',
    'MED-2944', 'MED-4247', 'MED-4639', 'MED-724',
    'MED-4255', 'MED-3220', 'MED-2787', 'MED-2769',
    'MED-398', 'MED-4990', 'MED-3833', 'MED-4617',
    'MED-306', 'MED-5301', 'MED-3787', 'MED-4613',
    'MED-4820', 'MED-3907', 'MED-3292', 'MED-4673',
    'MED-4977', 'MED-756', 'MED-3207', 'MED-3886',
    'MED-4615', 'MED-4393', 'MED-5359', 'MED-4885',
    'MED-4206', 'MED-2651', 'MED-4603', 'MED-5244',
    'MED-2155', 'MED-118', 'MED-4620', 'MED-2977',
    'MED-3811', 'MED-3885', 'MED-3897', 'MED-3029',
    'MED-5342', 'MED-2123', 'MED-4416', 'MED-4687',
    'MED-3815', 'MED-4892', 'MED-4988', 'MED-2526',
    'MED-3235', 'MED-4633', 'MED-3193', 'MED-3019',
    'MED-4517', 'MED-719', 'MED-3309', 'MED-4689',
    'MED-2921', 'MED-4674', 'MED-2951', 'MED-4598',
    'MED-335', 'MED-1229', 'MED-3874', 'MED-1106',
    'MED-5010', 'MED-1935', 'MED-4599', 'MED-5225',
}

In [17]:
# Ids appearing in the exact-search pairs, as a set so they can be compared with `bug_set`.
set_correct=set(list_pair)

In [18]:
# Ids present in the exact result but missing from `bug_set`.
# Example pairs whose documents are not found:
#   ('MED-1874', 'MED-2432', 1.0)
#   ('MED-2100', 'MED-4641', 1.0)
set_correct - bug_set

{'MED-1874',
 'MED-2100',
 'MED-2189',
 'MED-2205',
 'MED-2432',
 'MED-2904',
 'MED-2905',
 'MED-3026',
 'MED-3032',
 'MED-3830',
 'MED-3834',
 'MED-4641'}

In [19]:
# Save the ids of the two investigated documents.
# `tmp_idx` holds positions obtained from enumerate(ids.values), so the rows are
# selected positionally with .iloc; plain `ids[...]` is a label lookup that only
# coincides with positions while the index is the default 0..n-1 range.
pd.DataFrame(ids.iloc[tmp_idx]).to_parquet(results+"bug_pair_ids.parquet")

In [20]:
# Store only the matrix rows that belong to the two investigated documents.
save_npz(results + "bug_pair_ids.npz", sparse_repr1[tmp_idx])

In [21]:
# The two selected rows of the matrix (same slice that is saved to bug_pair_ids.npz).
sparse_repr1[tmp_idx, :]

<2x18867 sparse matrix of type '<class 'numpy.float64'>'
	with 276 stored elements in Compressed Sparse Row format>

In [22]:
# Original records (id, title, text, metadata) of the two investigated documents.
df_docs1.iloc[tmp_idx]

Unnamed: 0,_id,title,text,metadata
930,MED-1874,"The Garden of Eden--plant based diets, the gen...",It is likely that plant food consumption throu...,{'url': 'http://www.ncbi.nlm.nih.gov/pubmed/?t...
1376,MED-2432,"The Garden of Eden--plant based diets, the gen...",It is likely that plant food consumption throu...,{'url': 'http://www.ncbi.nlm.nih.gov/pubmed/14...


In [23]:
# Dot product of the two rows as a 1x1 dense array; this equals their cosine
# similarity only if the rows are L2-normalised (TODO confirm in compute_sparse_repr).
(sparse_repr1[tmp_idx[0]] @ sparse_repr1[tmp_idx[1]].T).toarray()

array([[1.]])

In [24]:
# Column indices of the stored entries in the second investigated row.
sparse_repr1[tmp_idx[1],:].indices

array([  910,  1947,  1988,  2082,  2281,  2432,  2592,  2982,  3033,
        3324,  3334,  3423,  3450,  3637,  3719,  3752,  4000,  4108,
        4203,  4628,  5022,  5041,  5048,  5112,  5143,  5149,  5235,
        6005,  6007,  6023,  6151,  6515,  6516,  6837,  6966,  7035,
        7158,  7229,  7380,  7481,  7554,  7706,  7716,  7736,  7847,
        7913,  8045,  8052,  8551,  8606,  8791,  9002,  9141,  9471,
        9698,  9763,  9770,  9817,  9899,  9961, 10332, 10401, 10411,
       10490, 10526, 10538, 10585, 10726, 10727, 10871, 10916, 10958,
       11101, 11119, 11385, 11559, 11687, 11758, 11872, 12069, 12521,
       12573, 12635, 12677, 12759, 12991, 13030, 13332, 13624, 13753,
       13859, 13992, 14064, 14268, 14369, 14464, 14560, 14709, 14775,
       14811, 14827, 14942, 14972, 14973, 15033, 15112, 15150, 15284,
       15418, 15483, 15525, 15783, 15831, 15948, 16196, 16383, 16408,
       16454, 16604, 16642, 16758, 16955, 16957, 17086, 17170, 17174,
       17196, 17285,

<a id="approx_s"></a>
### Approximate similarity search
(using Sparse Random Projection)

In [25]:
# Value passed as `eps` to SparseRandomProjection in the approximate search cells.
epsilon=0.1

In [26]:
# Fit the random projection on the first corpus' matrix; the fixed seed keeps
# the projection reproducible across runs.
sr_proj1 = SparseRandomProjection(eps=epsilon, random_state=32).fit(sparse_repr1)
# Report the fitted number of components and the projection density.
print(sr_proj1.n_components_)
print(sr_proj1.density_)

7026
0.0072802882585279016


In [27]:
# Project the first corpus' matrix into the reduced space.
sparse_repr_approx_srp1 = sr_proj1.transform(sparse_repr1)
print(sparse_repr_approx_srp1.shape)
# Share of non-zero cells after projection, for comparison with the input density.
print(
    "Density ratio:",
    sparse_repr_approx_srp1.count_nonzero()
    / (sparse_repr_approx_srp1.shape[0] * sparse_repr_approx_srp1.shape[1]),
)

(3633, 7026)
Density ratio: 0.5847442580658102


In [28]:
# Same projection settings as for the first corpus, fitted on `sparse_repr2`.
# NOTE(review): `sparse_repr2` is not defined anywhere in the visible notebook
# (its compute_sparse_repr call is commented out), so on a fresh kernel this
# cell would raise NameError unless the variable is created elsewhere.
sr_proj2=SparseRandomProjection(eps=epsilon, random_state=32)
sr_proj2.fit(sparse_repr2);
print(sr_proj2.n_components_)
print(sr_proj2.density_)


KeyboardInterrupt



In [None]:
# Project the second corpus' matrix into the reduced space.
sparse_repr_approx_srp2 = sr_proj2.transform(sparse_repr2)
print(sparse_repr_approx_srp2.shape)
# Share of non-zero cells after projection.
print(
    "Density ratio:",
    sparse_repr_approx_srp2.count_nonzero()
    / (sparse_repr_approx_srp2.shape[0] * sparse_repr_approx_srp2.shape[1]),
)

In [None]:
# Evaluate the projected first-corpus matrix for every threshold; the results
# cells index both returned lists by threshold position (presumably timings and
# found pairs — confirm in utils.eval_sol).
# NOTE(review): `thresholds` is not defined in the visible notebook.
time_list1_appprox, pairs_list1_approx=eval_sol(sparse_repr_approx_srp1, df_docs1["_id"], thresholds)

In [None]:
# Evaluate the projected second-corpus matrix for every threshold; the results
# cells index both returned lists by threshold position (presumably timings and
# found pairs — confirm in utils.eval_sol).
# NOTE(review): `thresholds` is not defined in the visible notebook.
time_list2_appprox, pairs_list2_approx=eval_sol(sparse_repr_approx_srp2, df_docs2["_id"], thresholds)

<a id="eval"></a>
## Results

In [None]:
# Compare the exact and approximate results of the first corpus, threshold by threshold.
# NOTE(review): `thresholds`, `pairs_list1` and `time_list1` are not defined in
# the visible notebook — confirm where they are produced.
for i, threshold in enumerate(thresholds):
    print(f"Threshold: {threshold}")
    # Keep only the two ids of each result so exact and approximate pairs are comparable.
    exact_set1 = {(e[0], e[1]) for e in pairs_list1[i]}
    approx_set1 = {(e[0], e[1]) for e in pairs_list1_approx[i]}
    union_size = len(exact_set1 | approx_set1)
    # Two empty result sets agree completely, so score them 1.0; without this
    # guard the division raises ZeroDivisionError when no pair is found.
    jaccard = len(exact_set1 & approx_set1) / union_size if union_size else 1.0
    print(f"Jaccard score: {jaccard}")
    print(f"Time for exact solution: {time_list1[i]}")
    print(f"Time for approx solution: {time_list1_appprox[i]}\n")


In [None]:
# Compare the exact and approximate results of the second corpus, threshold by threshold.
# NOTE(review): `thresholds`, `pairs_list2` and `time_list2` are not defined in
# the visible notebook — confirm where they are produced.
for i, threshold in enumerate(thresholds):
    print(f"Threshold: {threshold}")
    # Keep only the two ids of each result so exact and approximate pairs are comparable.
    exact_set2 = {(e[0], e[1]) for e in pairs_list2[i]}
    approx_set2 = {(e[0], e[1]) for e in pairs_list2_approx[i]}
    union_size = len(exact_set2 | approx_set2)
    # Two empty result sets agree completely, so score them 1.0; without this
    # guard the division raises ZeroDivisionError when no pair is found.
    jaccard = len(exact_set2 & approx_set2) / union_size if union_size else 1.0
    print(f"Jaccard score: {jaccard}")
    print(f"Time for exact solution: {time_list2[i]}")
    print(f"Time for approx solution: {time_list2_appprox[i]}\n")
