# Project 1 - IR CIn UFPE 2022

Authors:

Matheus Rodrigues de Souza Félix (matheusrdgsf@gmail.com) \
Rodrigo Melo

### Libs

In [2]:
import os
import pandas as pd
from git import Repo
import gzip
from io import BytesIO
import gzip

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import unidecode

nltk.download('punkt')
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer

from scipy import spatial
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics import precision_score, f1_score, recall_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Matheus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Matheus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Collection

In [3]:
DATA_PATH = "dataset/scifact-generated-queries"

In [4]:
# Clone the dataset repository only when it is not already on disk.
# The clone target is DATA_PATH itself, so the existence check above and the
# download location can never drift apart if DATA_PATH is changed.
if not os.path.isdir(DATA_PATH):
    Repo.clone_from("https://huggingface.co/datasets/BeIR/scifact-generated-queries", DATA_PATH)

In [5]:
# Decompress the training split and keep its raw bytes in memory.
with gzip.open(f"{DATA_PATH}/train.jsonl.gz", mode="rb") as compressed_file:
    file_content = compressed_file.read()

In [6]:
# Parse the in-memory bytes as JSON Lines (lines=True: one JSON record per line).
dataset = pd.read_json(BytesIO(file_content), lines=True)

In [7]:
# Number of dataset rows that share each document title.
(
    dataset
    .groupby("title")
    .size()
    .reset_index(name="queries_count")
)

Unnamed: 0,title,queries_count
0,PHENIX: a comprehensive Python...,3
1,DNA methylation and healthy human aging,3
2,US-SOMO HPLC-SAXS module: dealing with capill...,3
3,"""Grazing"": a high-risk behavior.",3
4,"""Natural"" killer cells in the mouse. I. Cytoto...",3
...,...,...
5176,β-catenin-independent WNT signaling in basal-l...,3
5177,β-site amyloid precursor protein-cleaving enzy...,3
5178,β1 integrin mediates an alternative survival p...,3
5179,‘Short stature in children - a questionnaire f...,3


## Pre-processing Data

In this step we will create 5 data models:

v1 - Only Tokenization; \
v2 - Only Stopword Filter; \
v3 - Only Stemming; \
v4 - Remove Stopwords and Stemming; \
v5 - Remove Stopwords and expand words with Synonyms.


In [8]:
def remove_accent(text):
    """Return ``text`` after passing it through ``unidecode.unidecode``."""
    transliterated = unidecode.unidecode(text)
    return transliterated

def tokenize(text):
    """Split ``text`` into a list of tokens with NLTK's ``word_tokenize`` (English)."""
    tokens = word_tokenize(text, language="english")
    return tokens

def pre_process(text, rmv_sw, stem):
    """Normalize ``text`` and return it as a list of tokens.

    The text is lower-cased, passed through ``remove_accent`` and tokenized.

    Parameters
    ----------
    text : str
        Raw text to process.
    rmv_sw : bool
        When truthy, tokens found in ``STOPWORDS`` are dropped.
    stem : bool
        When truthy, every remaining token is replaced by its English
        Snowball stem.

    Returns
    -------
    list
        The processed tokens, in their original order.
    """
    tokens = tokenize(remove_accent(text.lower()))

    if rmv_sw:
        tokens = [token for token in tokens if token not in STOPWORDS]

    if stem:
        snowball = SnowballStemmer("english")
        tokens = [snowball.stem(token) for token in tokens]

    return tokens

# English stop words, passed through remove_accent so they are compared in the
# same form as the tokens produced by pre_process.
STOPWORDS = {remove_accent(word) for word in stopwords.words("english")}

In [9]:
def pre_process_v1(text):
    """v1: tokenization only."""
    return pre_process(text, False, False)


def pre_process_v2(text):
    """v2: tokenization plus stop-word removal."""
    return pre_process(text, True, False)


def pre_process_v3(text):
    """v3: tokenization plus stemming."""
    return pre_process(text, False, True)


def pre_process_v4(text):
    """v4: tokenization, stop-word removal and stemming."""
    return pre_process(text, True, True)


# TODO(v5): stop-word removal plus synonym expansion is not implemented yet.
# def pre_process_v5(text):
#     return pre_process(text, True, False)

In [32]:
# Drop the per-query column, collapse the now-duplicated rows and keep only
# the first 50 remaining rows as the working sample.
dataset_pcrs = (
    dataset
    .drop(columns="query")
    .drop_duplicates()
    .head(50)
)

In [33]:
# v1 tokens for every document text.
dataset_pcrs["v1"] = dataset_pcrs["text"].apply(pre_process_v1)

In [34]:
# v2 tokens for every document text.
dataset_pcrs["v2"] = dataset_pcrs["text"].apply(pre_process_v2)

In [35]:
# v3 tokens for every document text.
dataset_pcrs["v3"] = dataset_pcrs["text"].apply(pre_process_v3)

In [36]:
# v4 tokens for every document text.
dataset_pcrs["v4"] = dataset_pcrs["text"].apply(pre_process_v4)

In [37]:
# @TODO: Create the vocabulary-expansion pipeline and connect it to pre_process
# dataset_pcrs["v5"] = dataset_pcrs["text"].apply(lambda text: pre_process_v5(text))

In [38]:
# Be careful with dataset_pcrs Len
#HTML(dataset_pcrs.head(3).to_html())

# @TODO: add "(" and ")" to STOPWORDS. A better strategy is to create a set of symbols to filter out and add it to this pipeline.
dataset_pcrs.head(2)

Unnamed: 0,_id,title,text,v1,v2,v3,v4
0,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,"[alterations, of, the, architecture, of, cereb...","[alterations, architecture, cerebral, white, m...","[alter, of, the, architectur, of, cerebr, whit...","[alter, architectur, cerebr, white, matter, de..."
3,5836,Induction of myelodysplasia by myeloid-derived...,Myelodysplastic syndromes (MDS) are age-depend...,"[myelodysplastic, syndromes, (, mds, ), are, a...","[myelodysplastic, syndromes, (, mds, ), age-de...","[myelodysplast, syndrom, (, mds, ), are, age-d...","[myelodysplast, syndrom, (, mds, ), age-depend..."


### Vectorizing Data

This step defines 5 (vectorizer, corpus_vectorized) models.

In [39]:
def get_documments(df_column):
    """Join every token list in ``df_column`` into one space-separated string.

    Returns a list with one string per element of ``df_column``, in order.
    """
    return [" ".join(tokens) for tokens in df_column]

In [40]:
# Rebuild one plain-text corpus per pre-processing variant from its token column.
corpus_v1, corpus_v2, corpus_v3, corpus_v4 = (
    get_documments(dataset_pcrs[version]) for version in ("v1", "v2", "v3", "v4")
)
# TODO(v5): corpus_v5 = get_documments(dataset_pcrs["v5"])

In [41]:
# Fit one independent TF-IDF vectorizer per pre-processing variant and keep both
# the fitted vectorizer (needed later to transform queries) and the resulting
# document matrix.
# NOTE(review): TfidfVectorizer applies its own default tokenization to the
# re-joined strings - confirm that this does not undo part of the pre-processing.
vectorizer_v1 = TfidfVectorizer()
corpus_v1_vct = vectorizer_v1.fit_transform(corpus_v1)

vectorizer_v2 = TfidfVectorizer()
corpus_v2_vct = vectorizer_v2.fit_transform(corpus_v2)

vectorizer_v3 = TfidfVectorizer()
corpus_v3_vct = vectorizer_v3.fit_transform(corpus_v3)

vectorizer_v4 = TfidfVectorizer()
corpus_v4_vct = vectorizer_v4.fit_transform(corpus_v4)

# TODO(v5): enable once corpus_v5 exists.
# vectorizer_v5 = TfidfVectorizer()
# corpus_v5_vct = vectorizer_v5.fit_transform(corpus_v5)

### Information Retrieval Function

In [42]:
# kd -> KDTree, nn -> Nearest Neighbor, bf -> Brute Force
# n -> number of docs in return, use -1 to all docs
def info_retrieval(pre_process, corpus, vectorizer, query, n=2, matcher="kd"):
    """Find the documents of ``corpus`` that are closest to ``query``.

    Parameters
    ----------
    pre_process : callable
        Receives the raw query string and returns a list of tokens.
    corpus : matrix
        Vectorized documents, one row per document. Must expose ``shape``
        and, for the "kd" matcher, ``toarray()``.
    vectorizer : object
        Fitted vectorizer; its ``transform`` is used to vectorize the query.
    query : str
        Raw query text.
    n : int, default 2
        Number of documents to return. Use -1 to return every document.
    matcher : {"kd", "nn", "bf"}, default "kd"
        "kd" - SciPy KDTree queried with the Manhattan distance (p=1);
        "nn" - scikit-learn NearestNeighbors with algorithm="ball_tree";
        "bf" - scikit-learn NearestNeighbors, brute force, cosine metric.

    Returns
    -------
    list of (distance, row_position) tuples
        One tuple per returned document, in the order produced by the
        underlying search. ``row_position`` is the row number in ``corpus``.
        For an unknown ``matcher`` an explanatory string is returned instead
        (kept for backward compatibility).
    """
    query = " ".join(pre_process(query))
    query_vct = vectorizer.transform([query])

    if n == -1:
        n = corpus.shape[0]

    if matcher == "kd":
        # Index the corpus that was passed in (not a notebook global), and
        # reach KDTree through the `spatial` name that the notebook imports.
        kdtree = spatial.KDTree(corpus.toarray())

        # p is the Minkowski p-norm.
        # p = 1, Manhattan distance
        # p = 2, Euclidean distance
        # p = +inf, Chebyshev distance
        distance, index = kdtree.query(query_vct.toarray(), k=n, p=1)

        # SciPy drops the neighbour axis when k == 1; restore the
        # (1, n) layout that the final zip below relies on.
        distance = distance.reshape(1, -1)
        index = index.reshape(1, -1)

    elif matcher == "nn":
        # NOTE(review): with sparse input scikit-learn may fall back to brute
        # force instead of building the ball tree - confirm.
        nbrs = NearestNeighbors(n_neighbors=n, algorithm="ball_tree").fit(corpus)
        distance, index = nbrs.kneighbors(query_vct)

    elif matcher == "bf":
        nbrs = NearestNeighbors(n_neighbors=n, algorithm="brute", metric="cosine").fit(corpus)
        distance, index = nbrs.kneighbors(query_vct)

    else:
        return "Matcher strategy not avaliable. Set kd to KDTree, nn to Nearest Neighbor and bf to Brute Force"

    # Only one query is searched, so the first (and only) row holds the answer.
    return list(zip(distance.tolist()[0], index.tolist()[0]))

In [43]:
def ir_v1(query):
    """Rank all v1 documents against ``query`` (brute force, cosine)."""
    return info_retrieval(pre_process_v1, corpus_v1_vct, vectorizer_v1, query, n=-1, matcher="bf")


def ir_v2(query):
    """Rank all v2 documents against ``query`` (brute force, cosine)."""
    return info_retrieval(pre_process_v2, corpus_v2_vct, vectorizer_v2, query, n=-1, matcher="bf")


def ir_v3(query):
    """Rank all v3 documents against ``query`` (brute force, cosine)."""
    return info_retrieval(pre_process_v3, corpus_v3_vct, vectorizer_v3, query, n=-1, matcher="bf")


def ir_v4(query):
    """Rank all v4 documents against ``query`` (brute force, cosine)."""
    return info_retrieval(pre_process_v4, corpus_v4_vct, vectorizer_v4, query, n=-1, matcher="bf")


# TODO(v5):
# def ir_v5(query):
#     return info_retrieval(pre_process_v5, corpus_v5_vct, vectorizer_v5, query, n=-1, matcher="bf")

### Snippet Test

In [44]:
query = "Alterations of the architecture of cerebral white matter in the developing human brain"

In [45]:
# Order the (distance, row_position) pairs by ascending distance to the query.
result = list(ir_v1(query))
result.sort(key=lambda scored_doc: scored_doc[0])

In [46]:
result

[(0.5512980389649619, 0),
 (0.9018937692186384, 4),
 (0.9030226639226278, 8),
 (0.904491849656119, 22),
 (0.9045364304922894, 3),
 (0.9055550155037727, 15),
 (0.9059548306703978, 40),
 (0.9107837002564602, 23),
 (0.9180953622326151, 7),
 (0.9188460324359904, 10),
 (0.9197344396945608, 37),
 (0.9204179861579849, 26),
 (0.9209788244798877, 47),
 (0.924605936837051, 34),
 (0.9257283763325889, 14),
 (0.9301273361039335, 46),
 (0.930343144950218, 20),
 (0.9307127649225015, 25),
 (0.9309219855750911, 36),
 (0.9310791524131075, 17),
 (0.9323432398604214, 21),
 (0.9333678935374439, 42),
 (0.9347401841822083, 30),
 (0.9375193526919786, 6),
 (0.9380836793410617, 24),
 (0.9438240485634083, 29),
 (0.9438526522441384, 33),
 (0.9448974986965711, 1),
 (0.9451534537521332, 31),
 (0.945777795071706, 27),
 (0.9465581953663532, 13),
 (0.9472385587498705, 45),
 (0.9495743997281749, 2),
 (0.9497327886616467, 5),
 (0.9499669994122191, 44),
 (0.950751768678862, 28),
 (0.9521823112364727, 32),
 (0.95235179118

In [47]:
# Nearest document to the query snippet
# pd.set_option('display.max_colwidth', None)

result_nn = result[0][1]
# result_nn is a row position, so it has to be resolved with .iloc.
# dataset_pcrs keeps the index labels of the original dataset (0, 3, 6, ...),
# so a label lookup such as [0] raises KeyError for any hit other than the first row.
int(dataset_pcrs["_id"].iloc[result_nn])

4983

In [48]:
# pd.set_option('display.max_colwidth', 50)

### Evaluation

In [49]:
# Auxiliary function: get the _id of the document nearest to the given query.
def get_nearest_id(query, model, data):
    """Return the ``_id`` of the document that ``model`` ranks closest to ``query``.

    Parameters
    ----------
    query : str
        Raw query text, forwarded unchanged to ``model``.
    model : callable
        Receives the query and returns an iterable of
        ``(distance, row_position)`` pairs.
    data : pandas.DataFrame
        Frame with an ``_id`` column whose row order matches the row
        positions returned by ``model``.

    Returns
    -------
    int
        ``_id`` of the row with the smallest distance (the first one on ties).
    """
    # min() returns the first pair with the smallest distance, which is the
    # same pair a stable ascending sort would put first, without sorting all.
    nearest = min(model(query), key=lambda pair: pair[0])
    result_nn = nearest[1]
    # Read a scalar by position: calling int() on a one-element Series is
    # deprecated in recent pandas releases.
    return int(data["_id"].iloc[result_nn])

In [66]:
dataset_pcrs[["_id"]]

Unnamed: 0,_id
0,4983
3,5836
6,7912
9,18670
12,19238
15,33370
18,36474
21,54440
24,70115
27,70490


In [73]:
# used_ids -> flag if we used some batch in dataset_pcrs in development
# used_ids -> _id values of the documents kept in dataset_pcrs during development
used_ids = dataset_pcrs["_id"].tolist()
is_used = dataset["_id"].isin(used_ids)
ds_eval = dataset.loc[is_used].drop(columns=["text"]).head(50)

In [74]:
# For every retrieval variant, store the _id of the nearest document per query.
for nn_column, retriever in (
    ("ir_v1_nn", ir_v1),
    ("ir_v2_nn", ir_v2),
    ("ir_v3_nn", ir_v3),
    ("ir_v4_nn", ir_v4),
    # TODO(v5): ("ir_v5_nn", ir_v5),
):
    ds_eval[nn_column] = ds_eval["query"].apply(get_nearest_id, args=(retriever, dataset_pcrs))

In [75]:
ds_eval.head(5)

Unnamed: 0,_id,title,query,ir_v1_nn,ir_v2_nn,ir_v3_nn,ir_v4_nn
0,4983,Microstructural development of human newborn c...,what is the diffusion coefficient of cerebral ...,4983,4983,4983,4983
1,4983,Microstructural development of human newborn c...,what is diffusion tensor,4983,4983,4983,4983
2,4983,Microstructural development of human newborn c...,what is the diffusion coefficient of the cereb...,4983,4983,4983,4983
3,5836,Induction of myelodysplasia by myeloid-derived...,which type of hematopoiesis is characterized b...,5836,5836,5836,5836
4,5836,Induction of myelodysplasia by myeloid-derived...,which cell types have hematopoiesis,5836,5836,5836,5836


In [81]:
# @TODO: add ir_v5_nn
eval_models = {"model": ["ir_v1_nn","ir_v2_nn","ir_v3_nn","ir_v4_nn"],
               "precision": [], "recall": [], "f-measure": []}

# The expected ids do not depend on the model, so build them once outside the loop.
y_true = [int(doc_id) for doc_id in ds_eval["_id"].tolist()]

for model in eval_models["model"]:

    y_pred = [int(doc_id) for doc_id in ds_eval[model].tolist()]

    # zero_division=0 makes the default fallback score (0) explicit for labels
    # whose metric is undefined, instead of emitting a warning on every call.
    eval_models["precision"].append(precision_score(y_true, y_pred, average='weighted', zero_division=0))
    eval_models["recall"].append(recall_score(y_true, y_pred, average='weighted', zero_division=0))
    eval_models["f-measure"].append(f1_score(y_true, y_pred, average='weighted', zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [77]:
results = pd.DataFrame(eval_models)

In [84]:
results

Unnamed: 0,model,precision,recall,f-measure
0,ir_v1_nn,1.0,1.0,1.0
1,ir_v2_nn,1.0,1.0,1.0
2,ir_v3_nn,1.0,0.96,0.976
3,ir_v4_nn,1.0,0.96,0.976
