In [1]:
# Install the extra third-party packages into the environment of the running kernel.
# NOTE(review): versions are unpinned, so a re-run may pull newer releases than the
# ones this notebook was recorded with — consider pinning once the versions are known.
%pip install biopython
# NOTE(review): xmltodict and opensearch-py are not imported in the cells visible here;
# presumably they are used further down — confirm before removing.
%pip install xmltodict
%pip install opensearch-py

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from Bio import Entrez
import json
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel, BertForSequenceClassification
import torch
import scipy
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

## Retrieve Data

In [4]:
#Entrez.email = "jonas.gann@gmail.com"
#handle = Entrez.esearch(db="pubmed", term="intelligence[tiab]", retmax="10000")
## More information about search field tags: https://pubmed.ncbi.nlm.nih.gov/help/#using-search-field-tags
#record = Entrez.read(handle)
#id_string = ",".join(record["IdList"])

In [5]:
#handle = Entrez.efetch(db="pubmed", id=id_string, retmode="xml")
## info about rettype and retmode: https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly
#records = Entrez.read(handle)
#f = open("data.json", "w")
#f.write(json.dumps(records))

In [6]:
# Read the cached records from disk. The context manager closes the file handle as
# soon as the content has been parsed (the original left it open for the lifetime of
# the kernel). `f` stays bound afterwards, but refers to a closed file.
with open("data.json", "r") as f:
    data = json.loads(f.read())

## Data Preprocessing

In [7]:
# `data` is first the dict parsed from data.json and is then replaced by its
# "PubmedArticle" list. Guarding the unwrap keeps this cell re-runnable: on a second
# execution `data` is already the list and indexing it with a string would raise.
if isinstance(data, dict):
    data = data["PubmedArticle"]

# Keep only the fields needed downstream, skipping entries that have no abstract.
new_data = []
for entry in data:
    citation = entry["MedlineCitation"]
    paper = citation["Article"]
    if "Abstract" not in paper:
        continue
    new_data.append({
        "id": citation["PMID"],
        "title": paper["ArticleTitle"],
        # some abstracts are split in an array, so join the parts into one string
        "text": " ".join(paper["Abstract"]["AbstractText"]),
    })

## Dataset

In [8]:
class PubMedDataset(Dataset):
    """Map-style dataset that yields the abstract text of each article record.

    Args:
        records: Optional sequence of dicts, each carrying a "text" entry. When
            omitted, the notebook-level ``new_data`` list is used, which is what
            the original no-argument constructor always did.
    """

    def __init__(self, records=None):
        # Look up the global only when no records are passed, so the class can be
        # used (and tested) with explicit data without relying on notebook state.
        self.data = new_data if records is None else records

    def __len__(self):
        """Return the number of article records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the "text" field of the record at position ``idx``."""
        return self.data[idx]["text"]

## Generate Embeddings for abstracts and queries

In [9]:
# One checkpoint id shared by tokenizer and model, so the two cannot drift apart
# when the model is swapped out.
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Place the model weights on the device the inputs will be moved to later.
model = AutoModel.from_pretrained(MODEL_NAME).to(device)


In [10]:
# with torch.no_grad():
#     inputs = tokenizer([document["text"][:512] for document in new_data], return_tensors="pt", padding=True)
#     outputs = model(**inputs)

In [11]:
# Batch the abstracts 64 at a time; shuffling is off, so batches follow the dataset order.
dataset = PubMedDataset()
dataloader = DataLoader(dataset=dataset, shuffle=False, batch_size=64)

In [12]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose first element holds the token
            embeddings.
        attention_mask: Mask with one entry per token; it is broadcast over the
            embedding dimension and used as a 0/1 weight.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask count, with the
        count clamped to at least 1e-9 so fully masked rows do not divide by zero.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(1)
    token_counts = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / token_counts

In [13]:
# Embed every abstract batch by batch. Gradients are not needed for inference, so the
# whole loop runs under no_grad().
outputs = []
with torch.no_grad():
    for batch_texts in dataloader:
        encoded = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True).to(device)
        batch_embeddings = mean_pooling(model(**encoded), encoded["attention_mask"])
        # Copy the batch to the CPU before storing it; extend() iterates over the
        # first dimension, adding one tensor per abstract to the list.
        outputs.extend(batch_embeddings.to("cpu"))

In [20]:
# NOTE(review): "Imact" looks like a typo for "Impact"; the string is left unchanged
# so the recorded result below stays reproducible — confirm before correcting.
query = "Imact of alcohol on intelligence of children."
inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(device)
# Inference only: outside no_grad() the forward pass would record an autograd graph
# and the resulting query embedding would require grad.
with torch.no_grad():
    query_outputs = mean_pooling(model(**inputs), inputs["attention_mask"]).to("cpu")

In [21]:
# Stack the collected per-abstract embeddings along a new leading dimension.
out = torch.stack(outputs, dim=0)

In [22]:
# Cosine similarity between the query embedding and every abstract embedding
# (dim=1 is the function's default, spelled out here).
sim = torch.cosine_similarity(out, query_outputs, dim=1)
# Indices ordered from most to least similar.
# NOTE(review): the name `sorted` shadows the Python builtin; it is kept because
# the following cell reads it.
sorted = torch.argsort(sim, dim=-1, descending=True)

In [23]:
# Show the record whose abstract ranks highest for the query.
new_data[int(sorted[0])]

{'id': '37974061',
 'title': 'Questioning cognitive heterogeneity and intellectual functioning in fetal alcohol spectrum disorders from the Wechsler intelligence scale for children.',
 'text': '<b>Introduction</b>: Fetal Alcohol Spectrum Disorders (FASD) are characterized by a variety of multiple cognitive and behavioral impairments, with intellectual, attentional, and executive impairments being the most commonly reported. In populations with multiple neurodevelopmental disorders, the Full Scale Intelligence Quotient (FSIQ) may not be a proper measure of intellectual abilities, rarely interpreted in FASD clinical practice because the heterogeneity of the cognitive profile is deemed too strong. We propose a quantitative characterization of this heterogeneity, of the strengths and weaknesses profile, and a differential analysis between global cognitive (FSIQ) and elementary reasoning abilities in a large retrospective monocentric FASD sample. <b>Methods</b>: Using clinical and cognitive