In [2]:
import sys
# Add ./src to the module search path; presumably this is where the
# retrieval_system and evaluation packages imported below live (TODO confirm).
sys.path.append('./src')

In [21]:
# Imported here as well so this cell works on a fresh kernel: in notebook
# order it runs before the main imports cell that brings in `gc`.
import gc

# Make sure automatic garbage collection is switched on.
gc.enable()

In [3]:
from retrieval_system.likelihood import LikelihoodRetrieval
from retrieval_system.clustering import ClusterBasedRetrieval
from retrieval_system.lda import LdaRetrieval

from evaluation.metrics import calculate_average_precision

In [4]:
import pyterrier as pt

import re
import string
import gc
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so deprecation warnings from the libraries used below are hidden.
warnings.filterwarnings('ignore')

# Initialise PyTerrier only if it has not already been started in this kernel.
if not pt.started():
  pt.init()

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [5]:
import ir_datasets
# Handle on the "beir/scidocs" collection; the cells below iterate its
# documents, queries and relevance judgements (qrels).
dataset = ir_datasets.load("beir/scidocs")

In [6]:
# One row per document, with columns 'text' and 'docno'.
# The records are collected first and the frame is built in a single call:
# growing a DataFrame with .append() inside a loop copies the whole frame on
# every iteration (quadratic cost), and DataFrame.append was removed in
# pandas 2.0. `columns=` keeps the column order and yields an empty frame
# with the right columns if the iterator is empty.
docs_df = pd.DataFrame(
    [{'text': doc.text, 'docno': doc.doc_id} for doc in dataset.docs_iter()],
    columns=['text', 'docno'],
)

In [7]:
# One row per query, with columns 'query' and 'qid'.
# Built from a list of records in one call instead of DataFrame.append() in a
# loop, which is quadratic and no longer exists in pandas 2.0.
queries_df = pd.DataFrame(
    [{'query': query.text, 'qid': query.query_id} for query in dataset.queries_iter()],
    columns=['query', 'qid'],
)

In [8]:
# One row per relevance judgement, with columns 'docno', 'qid' and 'label'.
# Built from a list of records in one call instead of DataFrame.append() in a
# loop, which is quadratic and no longer exists in pandas 2.0.
qrels_df = pd.DataFrame(
    [
        {'docno': qrel.doc_id, 'qid': qrel.query_id, 'label': qrel.relevance}
        for qrel in dataset.qrels_iter()
    ],
    columns=['docno', 'qid', 'label'],
)

In [9]:
# Sorted, de-duplicated identifier vocabularies; the label encoders in the
# next cells are fitted on these.
doc_ids = np.unique(docs_df["docno"].to_numpy())
querie_ids = np.unique(queries_df["qid"].to_numpy())

In [10]:
# Map the original document identifiers to integer codes, using the same
# fitted encoder for the documents and for the qrels so the codes agree.
# NOTE(review): this overwrites the 'docno' columns in place, so the cell is
# not safe to run twice without rebuilding the frames.
encoder = LabelEncoder().fit(doc_ids)

encoded_doc_docnos = encoder.transform(docs_df["docno"])
encoded_qrel_docnos = encoder.transform(qrels_df["docno"])

docs_df["docno"] = encoded_doc_docnos
qrels_df["docno"] = encoded_qrel_docnos

In [11]:
# Map the original query identifiers to integer codes, using the same fitted
# encoder for the queries and for the qrels so the codes agree.
# NOTE(review): `encoder` is rebound here, so any encoder previously stored
# under that name is no longer reachable afterwards.
encoder = LabelEncoder().fit(querie_ids)

encoded_query_qids = encoder.transform(queries_df["qid"])
encoded_qrel_qids = encoder.transform(qrels_df["qid"])

queries_df["qid"] = encoded_query_qids
qrels_df["qid"] = encoded_qrel_qids

In [12]:
# Preview the first documents; 'docno' now holds the integer codes.
docs_df.head()

Unnamed: 0,text,docno
0,An evolutionary recurrent network which automa...,11706
1,Dynamic economic dispatch (DED) is one of the ...,15445
2,It's not surprisingly when entering this site ...,5010
3,"In this paper, we introduce a new parameter, c...",9626
4,This paper proposes a recurrent fuzzy neural n...,9725


In [13]:
# Preview the first queries; 'qid' now holds the integer codes.
queries_df.head()

Unnamed: 0,query,qid
0,A Direct Search Method to solve Economic Dispa...,523
1,Bearish-Bullish Sentiment Analysis on Financia...,554
2,Predicting defects in SAP Java code: An experi...,617
3,Active-Metric Learning for Classification of R...,262
4,Ad Hoc Retrieval Experiments Using WordNet and...,29


In [14]:
# Preview the first relevance judgements (encoded docno / qid plus label).
qrels_df.head()

Unnamed: 0,docno,qid,label
0,11706,523,1
1,15445,523,1
2,5010,523,1
3,9626,523,1
4,9725,523,1


In [15]:
# Character class matching any single ASCII punctuation character.
_punctuation_pattern = '[' + re.escape(string.punctuation) + ']'


def _count_tokens(text):
    """Count the non-empty pieces of ``text`` after punctuation is blanked out.

    Punctuation characters are replaced by a space, the result is split on
    single space characters, and the empty strings produced by consecutive
    spaces are not counted.
    """
    pieces = re.sub(_punctuation_pattern, ' ', text).split(" ")
    return sum(1 for piece in pieces if piece != "")


# Per-document length, in tokens as defined by _count_tokens.
Nds = docs_df["text"].map(_count_tokens)
Nds

0        249
1        147
2        102
3         37
4        150
        ... 
25652    199
25653    261
25654    141
25655     82
25656    233
Name: text, Length: 25657, dtype: int64

In [16]:
# Binary relevance matrix: one row per encoded query id, one column per
# encoded document id, with 1 wherever the qrels carry a non-zero label.
true_relevance = np.zeros((len(queries_df), len(docs_df)))

# Fill every relevant (qid, docno) cell with one indexed assignment instead of
# looping over DataFrame.iterrows(), which builds a Series per row.
relevant_qrels = qrels_df[qrels_df["label"] != 0]
true_relevance[
    relevant_qrels["qid"].to_numpy(dtype=int),
    relevant_qrels["docno"].to_numpy(dtype=int),
] = 1

In [17]:
# English stop-word list from NLTK, passed to the TF-IDF vectorizer below.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# available locally — confirm for fresh environments.
sw = stopwords.words('english')

In [18]:
# Unigram TF-IDF representation with the stop words in `sw` removed, using
# document-frequency cut-offs of min_df=4 and max_df=0.95.
vectorizer = TfidfVectorizer(
    stop_words=sw,
    ngram_range=(1, 1),
    min_df=4,
    max_df=0.95,
)

# The vectorizer is fitted on the documents only; the queries are then
# transformed with that same fitted vectorizer.
vectors = vectorizer.fit_transform(docs_df["text"])
queries = vectorizer.transform(queries_df["query"])

In [19]:
# Sanity check: both matrices must have the same number of columns (vocabulary size).
vectors.shape, queries.shape

((25657, 21050), (1000, 21050))

In [23]:
# Force a garbage-collection pass before the memory-heavy model fits below.
gc.collect()

0

## Varier K :

In [24]:
# Score collected for the LDA retriever at each number of topics K.
k_lda_ap = []

for K in tqdm(range(100, 1501, 100)):  # K = 100, 200, ..., 1500
    lda = LdaRetrieval(
        n_topics=K,
        mu=1000,
        alpha=None,
        beta=0.01,
        n_iter=30,
        nb_mc=2,
        lmbda=0.7,
    )
    lda.fit(vectors, Nds=Nds)
    lda_pred = lda.predict(queries)

    # The predictions are wrapped in a one-element array and labelled "LDA";
    # presumably the helper scores several systems at once, one per column
    # name (TODO confirm against evaluation.metrics).
    ap_table = calculate_average_precision(
        true_relevance,
        np.array([lda_pred]),
        num_thresholds=1,
        column_names=["LDA"],
    )
    k_lda_ap.append(ap_table["LDA"].iloc[0])

    # Force a collection pass between fits.
    gc.collect()

## Comparaison QL CBDM LBDM :

In [76]:
# Presumably the "QL" system of the section heading: fit on the document
# TF-IDF matrix, then produce predictions for the query vectors.
ql = LikelihoodRetrieval()
ql.fit(vectors)
predictions_ql = ql.predict(queries)

In [15]:
# Cluster-based retriever (presumably the "CBDM" of the section heading),
# fitted with the document lengths Nds; n_topics and mu match the values
# used for the LDA model in the next cell.
cbdm = ClusterBasedRetrieval(n_topics=400, mu=1000)
cbdm.fit(vectors, Nds=Nds)
predictions_cbdm = cbdm.predict(queries)

In [18]:
# LDA-based retriever (presumably the "LBDM" of the section heading) with
# 400 topics; the other hyper-parameters are the same as in the K sweep above.
# NOTE(review): rebinds `lda`, replacing the last model left by the K sweep.
lda = LdaRetrieval(n_topics=400, mu=1000, alpha=None, beta=0.01, n_iter=30, nb_mc=2, lmbda=0.7)
lda.fit(vectors, Nds=Nds)
predictions_lda = lda.predict(queries)