In [1]:
import sys
# Add the local ./src directory to the import search path (presumably where the
# `retrieval_system` and `evaluation` packages imported in the next cell live —
# confirm). The path is relative, so it is resolved against the current working directory.
sys.path.append('./src')

In [2]:
from retrieval_system.likelihood import LikelihoodRetrieval
from retrieval_system.clustering import ClusterBasedRetrieval
from retrieval_system.lda import LdaRetrieval

from evaluation.metrics import calculate_average_precision

In [3]:
import re
import string
import gc
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including pandas deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

# Enable automatic garbage collection (CPython already has it enabled by default).
gc.enable()

In [4]:
import ir_datasets
# Handle on the "beir/scidocs" collection; the next cells read its documents,
# queries and relevance judgments through docs_iter / queries_iter / qrels_iter.
dataset = ir_datasets.load("beir/scidocs")

In [5]:
# Materialise the corpus as a DataFrame with one row per document.
# Rows are gathered in a plain list and the frame is built once: appending to a
# DataFrame inside the loop copies the whole frame at every iteration (quadratic
# time), and `DataFrame.append` was removed in pandas 2.0.
doc_records = [
    {'text': doc.text, 'docno': doc.doc_id}
    for doc in dataset.docs_iter()
]
# `columns=` fixes the column order and keeps the expected schema even when the
# iterator yields nothing.
docs_df = pd.DataFrame(doc_records, columns=['text', 'docno'])

In [6]:
# Materialise the queries as a DataFrame with one row per query.
# Records are collected first and the frame is created in a single call: growing a
# DataFrame row by row is quadratic, and `DataFrame.append` was removed in pandas 2.0.
query_records = [
    {'query': query.text, 'qid': query.query_id}
    for query in dataset.queries_iter()
]
# `columns=` fixes the column order and keeps the expected schema even when the
# iterator yields nothing.
queries_df = pd.DataFrame(query_records, columns=['query', 'qid'])

In [7]:
# Materialise the relevance judgments as a DataFrame with one row per (doc, query) pair.
# Records are collected first and the frame is created in a single call: growing a
# DataFrame row by row is quadratic, and `DataFrame.append` was removed in pandas 2.0.
qrel_records = [
    {'docno': qrel.doc_id, 'qid': qrel.query_id, 'label': qrel.relevance}
    for qrel in dataset.qrels_iter()
]
qrels_df = pd.DataFrame(qrel_records, columns=['docno', 'qid', 'label'])

# Keep only the pairs whose label is non-zero. The explicit copy makes the result an
# independent frame, so the column assignments done in later cells do not write into
# a filtered slice of the full judgment table. The original row index is preserved.
qrels_df = qrels_df[qrels_df["label"] != 0].copy()

In [8]:
# Distinct identifiers found in the document and query tables.
doc_ids = np.unique(docs_df["docno"])
querie_ids = np.unique(queries_df["qid"])

# Replace the raw document ids by integer codes, using one encoder fitted on the
# document table and applied to both the documents and the judgments.
encoder = LabelEncoder().fit(doc_ids)
for frame in (docs_df, qrels_df):
    frame["docno"] = encoder.transform(frame["docno"])

# Same treatment for the query ids, with a fresh encoder fitted on the query table.
encoder = LabelEncoder().fit(querie_ids)
for frame in (queries_df, qrels_df):
    frame["qid"] = encoder.transform(frame["qid"])

In [9]:
# Display the document id column after it was replaced by integer codes.
docs_df["docno"]

0        11706
1        15445
2         5010
3         9626
4         9725
         ...  
25652    16845
25653    10603
25654    15466
25655     5027
25656    23073
Name: docno, Length: 25657, dtype: int64

In [10]:
# Preview the first rows of the document table (text + encoded docno).
docs_df.head()

Unnamed: 0,text,docno
0,An evolutionary recurrent network which automa...,11706
1,Dynamic economic dispatch (DED) is one of the ...,15445
2,It's not surprisingly when entering this site ...,5010
3,"In this paper, we introduce a new parameter, c...",9626
4,This paper proposes a recurrent fuzzy neural n...,9725


In [11]:
# Preview the first rows of the query table (query text + encoded qid).
queries_df.head()

Unnamed: 0,query,qid
0,A Direct Search Method to solve Economic Dispa...,523
1,Bearish-Bullish Sentiment Analysis on Financia...,554
2,Predicting defects in SAP Java code: An experi...,617
3,Active-Metric Learning for Classification of R...,262
4,Ad Hoc Retrieval Experiments Using WordNet and...,29


In [12]:
# Preview the first rows of the judgment table (encoded docno / qid and label).
qrels_df.head()

Unnamed: 0,docno,qid,label
0,11706,523,1
1,15445,523,1
2,5010,523,1
3,9626,523,1
4,9725,523,1


## Réduire la taille des données : 

### 1. Sample N docs : 

In [13]:
N_docs = 10_000
# Fixed seed: without it every run draws a different subset of documents, so the
# query/qrel counts and all evaluation numbers below cannot be reproduced.
SEED = 42
rng = np.random.default_rng(SEED)

# Random permutation of the row positions; the first N_docs positions form the sample
# (if the corpus has fewer rows than N_docs, all rows are kept).
indices = rng.permutation(len(docs_df))

docs_df_reduce = docs_df.iloc[indices[:N_docs]].reset_index(drop=True)
doc_ids = np.unique(docs_df_reduce["docno"])
doc_ids.shape

(10000,)

Mettre à jour les qrels avec les nouveaux docs retenus : 

In [14]:
# Keep only the judgments whose document belongs to the sampled subset, then list
# the queries that still have at least one judgment.
kept_doc_mask = qrels_df["docno"].isin(doc_ids)
qrels_df_1 = qrels_df.loc[kept_doc_mask]
query_ids = np.unique(qrels_df_1["qid"])
query_ids.shape

(920,)

Mettre à jour les queries avec celles qui existent dans qrels : 

In [15]:
# Keep only the queries that still appear in the filtered judgments.
has_judgment = queries_df["qid"].isin(query_ids)
queries_df_1 = queries_df.loc[has_judgment]
queries_df_1.shape

(920, 2)

### 2. Sample N queries : 

In [16]:
N_queries = 90
# Row positions 0..len-1 in their existing order. Nothing is shuffled here, so the
# selection below is the first N_queries rows of queries_df_1, not a random draw.
indices = np.arange(len(queries_df_1))
first_positions = indices[:N_queries]
queries_df_reduce = queries_df_1.iloc[first_positions].reset_index(drop=True)
query_ids = np.unique(queries_df_reduce["qid"])
query_ids.shape

(90,)

Mettre à jour les qrels avec les nouvelles queries retenues : 

In [17]:
# Drop the judgments that refer to a query outside the retained query set.
kept_query_mask = qrels_df_1["qid"].isin(query_ids)
qrels_df_2 = qrels_df_1.loc[kept_query_mask]
qrels_df_2.shape

(179, 3)

### 3. Mettre à jour les df :

In [18]:
# From here on the working tables are the reduced ones.
# Explicit copies are taken because the next cell overwrites the id columns in place:
# without them the working names alias the *_reduce / *_2 frames (qrels_df_2 being a
# filtered slice of a filtered slice), so the intermediate frames would be mutated too
# and pandas would treat the writes as chained assignment.
docs_df = docs_df_reduce.copy()
queries_df = queries_df_reduce.copy()
qrels_df = qrels_df_2.copy()

### 4. Re-coder les id :

In [19]:
# Identifiers that remain after the reduction steps.
doc_ids = np.unique(docs_df["docno"])
querie_ids = np.unique(queries_df["qid"])

# Re-number the ids with integer codes. For each id column, a fresh encoder is
# fitted on the ids of the reference table and then applied, in order, to that
# table and to the judgments.
for column, known_ids, frames in (
    ("docno", doc_ids, (docs_df, qrels_df)),
    ("qid", querie_ids, (queries_df, qrels_df)),
):
    encoder = LabelEncoder()
    encoder.fit(known_ids)
    for frame in frames:
        frame[column] = encoder.transform(frame[column])

In [20]:
# Character class matching any single ASCII punctuation character.
_punct_re = re.compile('[' + re.escape(string.punctuation) + ']')


def _count_tokens(text):
    """Return the number of non-empty pieces of `text` once punctuation is
    replaced by spaces and the result is split on the space character only."""
    pieces = _punct_re.sub(' ', text).split(" ")
    return len([piece for piece in pieces if piece != ""])


# Per-document token count (a document can have a count of 0).
Nds = docs_df["text"].map(_count_tokens)
Nds

0       153
1       291
2       104
3         0
4       220
       ... 
9995     71
9996    145
9997    143
9998     79
9999    167
Name: text, Length: 10000, dtype: int64

In [21]:
# Binary relevance matrix: one row per query, one column per document, all zeros
# except for the judged pairs with a non-zero label.
true_relevance = np.zeros((len(queries_df), len(docs_df)))

# Select the pairs with a non-zero label and flag them in one indexed assignment;
# the encoded ids are used directly as row / column positions.
relevant_pairs = qrels_df[qrels_df["label"] != 0]
relevant_qids = relevant_pairs["qid"].astype(int).to_numpy()
relevant_docnos = relevant_pairs["docno"].astype(int).to_numpy()
true_relevance[relevant_qids, relevant_docnos] = 1

In [22]:
# Display the reduced, re-encoded judgment table (the original row index is kept).
qrels_df

Unnamed: 0,docno,qid,label
4,3784,45,1
34,716,50,1
61,2420,55,1
120,2620,3,1
121,8364,3,1
...,...,...,...
2816,7572,73,1
2842,7935,72,1
2843,9162,72,1
2845,5421,72,1


In [23]:
# Spot-check of one (query row, document column) entry of the relevance matrix.
# NOTE(review): the indices are hard-coded; which pair they were meant to verify is
# not stated — confirm or remove.
true_relevance[47, 3786]

0.0

In [24]:
# English stop-word list from NLTK, passed to the CountVectorizer in the next cell.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be installed locally — confirm.
sw = stopwords.words('english')

In [25]:
# Term-count model: the vocabulary is learned from the documents only (stop words
# removed, terms kept when they occur in at least 4 documents and in at most 95%
# of them); the queries are then projected onto that same vocabulary.
vectorizer = CountVectorizer(
    stop_words=sw,
    max_df=0.95,
    min_df=4,
)
corpus_texts = docs_df["text"]
vectors = vectorizer.fit_transform(corpus_texts)
query_texts = queries_df["query"]
queries = vectorizer.transform(query_texts)

In [26]:
# Shapes of the document and query count matrices; both share the vocabulary dimension.
vectors.shape, queries.shape

((10000, 13014), (90, 13014))

In [27]:
# Turn the document counts into per-row distributions: densify, add machine epsilon
# to every cell so that no entry is exactly zero, then divide each row by its sum.
# NOTE: `vectors` is rebound from the count matrix to the dense result, so this cell
# cannot be re-run without re-running the vectorizer cell first.
eps = np.finfo(float).eps
p_vectors = np.asarray(vectors.toarray()) + eps
row_totals = p_vectors.sum(axis=1, keepdims=True)
p_vectors /= row_totals
vectors = p_vectors

In [28]:
# Same transformation for the queries: densify, add machine epsilon to every cell,
# then rescale each row so that it sums to one.
# NOTE: `queries` is rebound from the count matrix to the dense result, so this cell
# cannot be re-run without re-running the vectorizer cell first.
eps = np.finfo(float).eps
p_queries = np.asarray(queries.toarray()) + eps
row_totals = p_queries.sum(axis=1, keepdims=True)
p_queries /= row_totals
queries = p_queries

In [29]:
# Force a garbage-collection pass after the large dense matrices were created;
# the displayed value is the return value of gc.collect().
gc.collect()

0

## Varier K :

In [24]:
# Score obtained by the LDA-based model for each number of topics K, in sweep order.
k_lda_ap = []
topic_counts = range(100, 1501, 100)

for K in tqdm(topic_counts):
    # Fit a fresh model for this K; all other hyper-parameters are held fixed.
    lda = LdaRetrieval(n_topics=K, mu=1000, alpha=None, beta=0.01, n_iter=30, nb_mc=2, lmbda=0.7)
    lda.fit(vectors, Nds=Nds)

    lda_pred = lda.predict(queries)
    # The metric helper is given a stack of one prediction set; the value appended
    # is the first row of its "LDA" column.
    ap_table = calculate_average_precision(
        true_relevance,
        np.array([lda_pred]),
        num_thresholds=1,
        column_names=["LDA"],
    )
    k_lda_ap.append(ap_table["LDA"].iloc[0])
    gc.collect()

## Comparaison QL CBDM LBDM :

In [30]:
# Baseline: LikelihoodRetrieval fitted on the normalised document matrix, then
# applied to the normalised query matrix.
ql = LikelihoodRetrieval()
ql.fit(vectors)
predictions_ql = ql.predict(queries)

In [31]:
# ClusterBasedRetrieval with 100 topics, fitted with the per-document token counts.
# NOTE(review): the recorded run of this cell ended with a KeyboardInterrupt, so
# `predictions_cbdm` (needed by the final comparison cell) was not produced in that run.
cbdm = ClusterBasedRetrieval(n_topics=100, mu=1000)
cbdm.fit(vectors, Nds=Nds)
predictions_cbdm = cbdm.predict(queries)

KeyboardInterrupt: 

In [None]:
# LdaRetrieval with 100 topics, same data as the two models above.
# NOTE(review): n_iter is 20 here whereas the K sweep earlier in the notebook uses 30 —
# confirm the difference is intentional.
lda = LdaRetrieval(n_topics=100, mu=1000, alpha=None, beta=0.01, n_iter=20, nb_mc=2, lmbda=0.7)
lda.fit(vectors, Nds=Nds)
predictions_lda = lda.predict(queries)

In [None]:
# Stack the three prediction sets in the same order as the column names, then
# evaluate them against the relevance matrix in a single call.
all_predictions = np.array([predictions_ql, predictions_cbdm, predictions_lda])
calculate_average_precision(
    true_relevance,
    all_predictions,
    num_thresholds=1,
    column_names=["QL", "CBDM", "LDA"],
)