In [None]:
import torch
from transformers import AutoTokenizer
from splade.models.transformer_rep import Splade

import pandas as pd
import os
import ast

# Mise en place

## Init

In [2]:
# Model identifier passed to both Splade(...) and AutoTokenizer.from_pretrained(...)
# in the loading cell of this notebook.
model_type_or_dir = "naver/splade-cocondenser-ensembledistil"

In [None]:
# Instantiate the tokenizer and the SPLADE encoder from the same identifier.
tokenizer = AutoTokenizer.from_pretrained(model_type_or_dir)

# The encoder is built with agg="max" and switched to evaluation mode.
model = Splade(model_type_or_dir, agg="max")
model.eval()

# Inverse vocabulary (token id -> token string), used to turn the non-zero
# dimensions of a representation back into readable terms.
reverse_voc = {token_id: token for token, token_id in tokenizer.vocab.items()}

In [4]:
def get_splade_bow(doc, top=None):
    """Encode ``doc`` with the SPLADE model and return its bag-of-words view.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``reverse_voc``.

    Args:
        doc: Text to encode.
        top: If not None, keep only the ``top`` highest-weighted terms
            (plain slice semantics, so ``top=0`` yields an empty list).

    Returns:
        A list of ``(term, weight)`` tuples sorted by decreasing weight, with
        weights rounded to 2 decimals.
    """
    # truncation=True caps the input at the tokenizer's maximum length so that an
    # over-long text is shortened instead of crashing the encoder.
    encoded = tokenizer(doc, return_tensors="pt", truncation=True)

    with torch.no_grad():
        # (sparse) document representation in vocabulary space
        doc_rep = model(d_kwargs=encoded)["d_rep"].squeeze()

    # flatten() (rather than squeeze()) keeps a 1-D index tensor even when exactly
    # one dimension is non-zero; squeeze() would collapse it to a scalar and the
    # pairing below would then fail.
    nonzero_idx = torch.nonzero(doc_rep).flatten()
    weights = doc_rep[nonzero_idx].cpu().tolist()
    indices = nonzero_idx.cpu().tolist()

    # Stable sort by decreasing weight: ties keep ascending vocabulary-index order.
    ranked = sorted(zip(indices, weights), key=lambda item: item[1], reverse=True)

    # ranked[:None] is the whole list, so no special case is needed for top=None.
    return [(reverse_voc[idx], round(weight, 2)) for idx, weight in ranked[:top]]

def get_next_phrase(text: str, idx_start=None):
    """Return the next sentence of ``text`` and the index just past it.

    A sentence runs from ``idx_start`` up to and including the next ".".

    Args:
        text: Source text.
        idx_start: Index at which to start scanning; ``None`` means 0.

    Returns:
        ``(phrase, next_index)``. When no "." remains after ``idx_start``, the
        rest of the text is returned together with ``len(text)``, so a caller
        that loops on ``next_index`` always terminates.
    """
    if idx_start is None:
        idx_start = 0

    idx_point = text.find(".", idx_start)
    if idx_point == -1:
        # No further period: previously this returned ("", 0), which sent the
        # caller's cursor back to the start of the text.
        return text[idx_start:], len(text)

    return text[idx_start:idx_point + 1], idx_point + 1

def diviser_passages(text_fr, max_chars=512) -> list[str]:
    """Split a text into passages of at most ``max_chars`` characters.

    Sentences (as delimited by ``get_next_phrase``) are appended to the current
    passage, each preceded by a single space, as long as the passage stays
    within the limit; otherwise a new passage is started.

    Args:
        text_fr: Text to split.
        max_chars: Maximum passage length in characters (default 512).

    Returns:
        The list of passages, in reading order. Whitespace-only passages are
        dropped.

    Raises:
        ValueError: If ``max_chars`` is smaller than 2.
    """
    if max_chars < 2:
        raise ValueError("max_chars must be at least 2")

    passages = []
    total = len(text_fr)
    indice = 0

    while indice < total:
        passage = ""

        while indice < total:
            phrase, suivant = get_next_phrase(text_fr, indice)

            if suivant <= indice:
                # The helper made no progress (no period left in the text):
                # treat the remainder of the text as the last sentence.
                phrase, suivant = text_fr[indice:], total

            if len(passage) + len(phrase) < max_chars:
                passage += " " + phrase
                indice = suivant
            elif not passage:
                # A single sentence is longer than the limit. Without this branch
                # an empty passage would be appended forever; instead, consume
                # the largest chunk that fits and continue from there.
                phrase = phrase[:max_chars - 1]
                passage = " " + phrase
                indice += len(phrase)
                break
            else:
                # The sentence does not fit: leave it for the next passage.
                break

        if passage.strip():
            passages.append(passage)

    return passages

In [5]:
# Work from the PDF data directory. The path is assembled with os.path.join and
# os.sep instead of hard-coded backslashes, so the cell behaves the same on
# Windows and also works where the separator differs. The suffix check keeps the
# cell idempotent when it is re-run after the directory change.
_data_dir = os.path.join("_data_mounir", "PDFs")
if not os.getcwd().endswith(os.sep + _data_dir):
    os.chdir(_data_dir)

## Vectoriser les documents

In [62]:
def vectorize(src_path, dest_path, top=50, encoding=None):
    """Vectorize every text file of ``src_path`` into a term/weight CSV.

    Each file is split into passages with ``diviser_passages``; every passage is
    encoded with ``get_splade_bow``; identical terms are then merged, keeping
    only the highest weight seen for each term. The result is written to
    ``dest_path`` as ``<file name without extension>-all.csv`` with the columns
    ``term`` and ``weight`` (plus the default pandas index column).

    Args:
        src_path: Directory containing the source text files.
        dest_path: Output directory; created if missing.
        top: Number of terms kept per passage (default 50).
        encoding: Encoding used to read the source files; ``None`` keeps the
            platform default, as before.
    """
    # makedirs(exist_ok=True) also creates missing parent directories and does
    # not fail when the directory already exists.
    os.makedirs(dest_path, exist_ok=True)

    for filename in os.listdir(src_path):
        src_file = os.path.join(src_path, filename)
        if not os.path.isfile(src_file):
            # Sub-directories cannot be opened as text files: skip them.
            continue

        with open(src_file, "r", encoding=encoding) as f:
            text_eng = f.read()

        text_eng = text_eng.replace("\n", " ")
        passages = diviser_passages(text_eng)

        # Merge identical terms, keeping only the occurrence with the highest score.
        best_weight = {}
        for passage in passages:
            for term, weight in get_splade_bow(passage, top=top):
                if weight > best_weight.get(term, float("-inf")):
                    best_weight[term] = weight

        # Rows are sorted by term, which matches the order a groupby on "term"
        # would produce.
        df_clean = pd.DataFrame(sorted(best_weight.items()), columns=["term", "weight"])

        # splitext handles extensions of any length (the slice [:-4] assumed 4 chars).
        stem, _ext = os.path.splitext(filename)
        df_clean.to_csv(os.path.join(dest_path, stem + "-all.csv"))
        

In [None]:
# Build the per-document term/weight CSV files from "Abstracts" into "Vectors_all_2".
# NOTE(review): retrieve() defaults to path_vectors="Vectors_all", not "Vectors_all_2" —
# confirm which directory the query cells are meant to read.
vectorize("Abstracts", "Vectors_all_2")

## Vectoriser une requête

In [64]:
def retrieve(query, path_vectors="Vectors_all"):
    """Rank the vectorized documents of ``path_vectors`` against ``query``.

    The query is encoded with ``get_splade_bow`` (top 50 terms). The score of a
    document is the sum, over the query terms, of the query weight multiplied by
    the document weight of the same term (0 when the document lacks the term).

    Args:
        query: Query text.
        path_vectors: Directory holding the per-document CSV files with the
            columns ``term`` and ``weight``.

    Returns:
        ``(sorted_scores, df_query)``: a list of ``(file name, score)`` tuples
        sorted by decreasing score, and the query terms/weights as a DataFrame.
    """
    query_vectorized = get_splade_bow(query, top=50)
    df_query = pd.DataFrame(query_vectorized, columns=["term", "weight"])
    scores_files = []

    for file in os.listdir(path_vectors):
        if not file.lower().endswith(".csv"):
            # Ignore anything that is not a vector file (folders, stray files).
            continue

        # na_filter=False and dtype=str keep terms such as "nan" or "null" as
        # plain strings; the default parsing would turn them into NaN and the
        # term would be lost from the lookup table.
        df = pd.read_csv(
            os.path.join(path_vectors, file),
            dtype={"term": str},
            na_filter=False,
        )
        # Term -> weight lookup table for this document.
        doc_weights = df.set_index("term")["weight"].to_dict()

        score_doc = 0
        for term, weight in query_vectorized:
            score_doc += doc_weights.get(term, 0) * weight
        scores_files.append((file, score_doc))

    sorted_scores = sorted(scores_files, key=lambda x: x[1], reverse=True)

    return sorted_scores, df_query


# Tests

In [43]:
# Free-text query; retrieve() is called with its default vector directory.
query = "singular value decomposition"

scores, df_query = retrieve(query)
# Bare expression: displays the (file name, score) list, best match first.
scores

[('On the use of Singular Value Decomposition for Text Retrieval-vectorized2-all.csv',
  23.615099999999998),
 ('Colbert V2 - Effective and Efficient Retrieval via Lightweight Late Interaction-vectorized2-all.csv',
  8.3425),
 ('SPLADE Sparse Lexical and Expansion Model for First Stage Ranking-vectorized2-all.csv',
  0.5903999999999999),
 ('Distributed representations of words and phrases-vectorized2-all.csv',
  0.582),
 ('Document vectorization Method using network information of words-vectorized2-all.csv',
  0.5409),
 ('SparTerm - Learning Term-based Sparse Representation for Fast Text Retrieval-vectorized2-all.csv',
  0.4419),
 ('DocBERT BERT for Document Classification-vectorized2-all.csv',
  0.39050000000000007),
 ('LIMICS@DEFT’24 - Un mini-LLM peut-il tricher aux QCM de pharmacie en fouillant dans Wikipédia et NACHOS-vectorized2-all.csv',
  0.14),
 ('From doc2vec to advanced keyword queries_ searching for phenotypes in large clinical document databases-vectorized2-all.csv',
  0.1

In [44]:
query = "medical term normalization, cross-lingual, medical term representation, knowledge graph embedding, contrastive learning"
# Keywords taken from the CODER paper
scores, df_query = retrieve(query)
# Bare expression: displays the (file name, score) list, best match first.
scores



[('CODER - Knowledge infused cross-lingual medical term embedding for term normalization-vectorized2-all.csv',
  41.6353),
 ('SPLADE Sparse Lexical and Expansion Model for First Stage Ranking-vectorized2-all.csv',
  15.2043),
 ('From doc2vec to advanced keyword queries_ searching for phenotypes in large clinical document databases-vectorized2-all.csv',
  13.940400000000002),
 ('SPLADE v2 Sparse Lexical and Expansion Model for Information Retrieval-vectorized2-all.csv',
  11.0157),
 ('SparTerm - Learning Term-based Sparse Representation for Fast Text Retrieval-vectorized2-all.csv',
  7.917299999999999),
 ('The Unifed Medical Language System - Integrating biomedical terminology-vectorized2-all.csv',
  7.642500000000001),
 ('Document vectorization Method using network information of words-vectorized2-all.csv',
  5.959899999999999),
 ('Distributed representations of words and phrases-vectorized2-all.csv',
  5.4123),
 ('Colbert V2 - Effective and Efficient Retrieval via Lightweight Late Int

In [45]:
query = "document retrieval, medical informatics, clinical phenotypes"
# Keywords taken from the "From doc2vec ..." paper
scores, df_query = retrieve(query)
# Bare expression: displays the (file name, score) list, best match first.
scores



[('From doc2vec to advanced keyword queries_ searching for phenotypes in large clinical document databases-vectorized2-all.csv',
  17.9271),
 ('Automatic Cohort Retrieval-vectorized2-all.csv', 11.770499999999997),
 ('SPLADE v2 Sparse Lexical and Expansion Model for Information Retrieval-vectorized2-all.csv',
  11.262099999999998),
 ('Colbert V2 - Effective and Efficient Retrieval via Lightweight Late Interaction-vectorized2-all.csv',
  10.0911),
 ('SPLADE Sparse Lexical and Expansion Model for First Stage Ranking-vectorized2-all.csv',
  9.7262),
 ('On the use of Singular Value Decomposition for Text Retrieval-vectorized2-all.csv',
  9.458600000000002),
 ('The Unifed Medical Language System - Integrating biomedical terminology-vectorized2-all.csv',
  6.6984),
 ('LIMICS@DEFT’24 - Un mini-LLM peut-il tricher aux QCM de pharmacie en fouillant dans Wikipédia et NACHOS-vectorized2-all.csv',
  6.2185999999999995),
 ('SparTerm - Learning Term-based Sparse Representation for Fast Text Retrieval

In [46]:
query = "DEFT, LLM, RAG, prompt, LoRA, Apollo."
# Keywords taken from the LIMICS@DEFT paper
# NOTE(review): the original comment said "LIMICS@DEFT2021", while the matching
# file in the results is named "LIMICS@DEFT’24" — confirm the year.
scores, df_query = retrieve(query)
# Bare expression: displays the (file name, score) list, best match first.
scores



[('LIMICS@DEFT’24 - Un mini-LLM peut-il tricher aux QCM de pharmacie en fouillant dans Wikipédia et NACHOS-vectorized2-all.csv',
  22.344899999999996),
 ('Retrieval Augmented Generation for LLMs - A survey-vectorized2-all.csv',
  11.856800000000002),
 ('Automatic Cohort Retrieval-vectorized2-all.csv', 4.7299999999999995),
 ('The Unifed Medical Language System - Integrating biomedical terminology-vectorized2-all.csv',
  2.4709000000000003),
 ('SparTerm - Learning Term-based Sparse Representation for Fast Text Retrieval-vectorized2-all.csv',
  2.4185),
 ('From doc2vec to advanced keyword queries_ searching for phenotypes in large clinical document databases-vectorized2-all.csv',
  2.2472),
 ('Distributed representations of words and phrases-vectorized2-all.csv',
  1.9546),
 ('Document vectorization Method using network information of words-vectorized2-all.csv',
  1.5811),
 ('On the use of Singular Value Decomposition for Text Retrieval-vectorized2-all.csv',
  1.2644),
 ('CODER - Knowledg

In [47]:
query = "Large language model, retrieval-augmented generation, natural language processing, information retrieval"
# Keywords taken from the "RAG for LLMs - A survey" paper
scores, df_query = retrieve(query)
# Bare expression: displays the (file name, score) list, best match first.
scores



[('Retrieval Augmented Generation for LLMs - A survey-vectorized2-all.csv',
  23.8007),
 ('LIMICS@DEFT’24 - Un mini-LLM peut-il tricher aux QCM de pharmacie en fouillant dans Wikipédia et NACHOS-vectorized2-all.csv',
  17.0502),
 ('From doc2vec to advanced keyword queries_ searching for phenotypes in large clinical document databases-vectorized2-all.csv',
  14.702199999999996),
 ('Colbert V2 - Effective and Efficient Retrieval via Lightweight Late Interaction-vectorized2-all.csv',
  14.3023),
 ('SPLADE v2 Sparse Lexical and Expansion Model for Information Retrieval-vectorized2-all.csv',
  11.6194),
 ('SparTerm - Learning Term-based Sparse Representation for Fast Text Retrieval-vectorized2-all.csv',
  11.0844),
 ('On the use of Singular Value Decomposition for Text Retrieval-vectorized2-all.csv',
  10.5338),
 ('Automatic Cohort Retrieval-vectorized2-all.csv', 10.289600000000002),
 ('SPLADE Sparse Lexical and Expansion Model for First Stage Ranking-vectorized2-all.csv',
  9.6597999999999

In [49]:
query = "Fast Retrieval, Sparse Representation, BERT"
# Keywords taken from the SparTerm paper
scores, df_query = retrieve(query)
# Bare expression: displays the (file name, score) list, best match first.
scores



[('SparTerm - Learning Term-based Sparse Representation for Fast Text Retrieval-vectorized2-all.csv',
  16.1892),
 ('SPLADE Sparse Lexical and Expansion Model for First Stage Ranking-vectorized2-all.csv',
  15.787800000000002),
 ('SPLADE v2 Sparse Lexical and Expansion Model for Information Retrieval-vectorized2-all.csv',
  15.038),
 ('On the use of Singular Value Decomposition for Text Retrieval-vectorized2-all.csv',
  9.373000000000001),
 ('Colbert V2 - Effective and Efficient Retrieval via Lightweight Late Interaction-vectorized2-all.csv',
  9.311600000000002),
 ('DocBERT BERT for Document Classification-vectorized2-all.csv',
  7.7642999999999995),
 ('Automatic Cohort Retrieval-vectorized2-all.csv', 6.3988),
 ('From doc2vec to advanced keyword queries_ searching for phenotypes in large clinical document databases-vectorized2-all.csv',
  6.120400000000001),
 ('LIMICS@DEFT’24 - Un mini-LLM peut-il tricher aux QCM de pharmacie en fouillant dans Wikipédia et NACHOS-vectorized2-all.csv',

In [50]:
query = "neural networks, indexing, sparse representations, regularization"
# Keywords taken from the SPLADE and SPLADE 2 papers
scores, df_query = retrieve(query)
# Bare expression: displays the (file name, score) list, best match first.
scores



[('SPLADE Sparse Lexical and Expansion Model for First Stage Ranking-vectorized2-all.csv',
  18.025399999999998),
 ('SPLADE v2 Sparse Lexical and Expansion Model for Information Retrieval-vectorized2-all.csv',
  14.2064),
 ('SparTerm - Learning Term-based Sparse Representation for Fast Text Retrieval-vectorized2-all.csv',
  8.883399999999998),
 ('Colbert V2 - Effective and Efficient Retrieval via Lightweight Late Interaction-vectorized2-all.csv',
  7.3327),
 ('Document vectorization Method using network information of words-vectorized2-all.csv',
  3.7853),
 ('Distributed representations of words and phrases-vectorized2-all.csv',
  3.6971),
 ('CODER - Knowledge infused cross-lingual medical term embedding for term normalization-vectorized2-all.csv',
  3.4678),
 ('From doc2vec to advanced keyword queries_ searching for phenotypes in large clinical document databases-vectorized2-all.csv',
  2.9975000000000005),
 ('On the use of Singular Value Decomposition for Text Retrieval-vectorized2-a