<a href="https://colab.research.google.com/github/J-Gann/QA-INLPT-WS2023/blob/main/INLPT_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from Bio import Entrez
import json
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel, BertForSequenceClassification, AutoModelForQuestionAnswering
import torch
import scipy
from torch.utils.data import DataLoader
import pickle
from transformers import pipeline
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from opensearchpy import OpenSearch
import requests

2024-01-22 11:58:17.954524: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-22 11:58:18.009852: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-22 11:58:18.295416: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-22 11:58:18.295523: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-22 11:58:18.346474: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [2]:
# Run on the first CUDA GPU when torch can see one; otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [3]:
# https://opensearch.org/docs/latest/clients/python-low-level/

# Connection settings. The defaults match the local development cluster used
# so far; set the OPENSEARCH_* environment variables to target another cluster
# without writing its credentials into the notebook.
host = os.environ.get('OPENSEARCH_HOST', 'localhost')
port = int(os.environ.get('OPENSEARCH_PORT', '9200'))
auth = (
    os.environ.get('OPENSEARCH_USER', 'admin'),
    os.environ.get('OPENSEARCH_PASSWORD', 'admin'),
)


# Create the client with SSL/TLS enabled, but with certificate verification
# (verify_certs) and hostname verification (ssl_assert_hostname) disabled.
# NOTE(review): this is only acceptable for a local cluster with a self-signed
# certificate; enable verification before talking to a remote cluster.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,  # silence the warning that verification is off
)

## Load Data
- Only 10,000 results can be retrieved this way.
- For more results, use the EDirect command-line tools (bash): https://www.nlm.nih.gov/dataguide/edirect/install.html#edirect-installation

In [4]:
index_name = 'pub_med_index'
# 'settings' and 'mappings' are sibling top-level keys of a create-index body;
# nesting 'mappings' inside 'settings' would not define any field mappings.
index_body = {
    'settings': {
        'index': {
            'number_of_shards': 4
        }
    },
    'mappings': {
        # Your index mappings here
    }
}

# Creating an index that already exists raises an error, so only create it
# when it is missing. `response` is always bound (None when nothing had to be
# created) because later cells read the name.
response = None
if not client.indices.exists(index=index_name):
    response = client.indices.create(index=index_name, body=index_body)

In [5]:
pubmed_data_path = "/home/chris/University/NLP_project/pubmed_data.json"
pubmed_data_preprocessed_path = "/home/chris/University/NLP_project/pubmed_data_preprocessed.json"


def _join_abstract(abstract_text):
    """Return the abstract as a single string.

    A plain string is returned unchanged (joining it would insert a space
    between every character); a list of sections is joined with single spaces.
    """
    if isinstance(abstract_text, str):
        return abstract_text
    return " ".join(abstract_text)


# Preprocess the raw export once and cache the result; later runs only read
# the cached file.
if not os.path.exists(pubmed_data_preprocessed_path):
    with open(pubmed_data_path, 'r') as f:
        records = json.load(f)

    records = records["PubmedArticle"]
    preprocessed_records = []
    for article in records:
        citation = article["MedlineCitation"]
        # Articles without an abstract have no text to embed, so skip them.
        if "Abstract" not in citation["Article"]:
            continue
        preprocessed_records.append({
            "_id": citation["PMID"],
            "title": citation["Article"]["ArticleTitle"],
            # some abstracts are split in an array
            "text": _join_abstract(citation["Article"]["Abstract"]["AbstractText"]),
        })
    with open(pubmed_data_preprocessed_path, 'w') as f:
        json.dump(preprocessed_records, f)
else:
    with open(pubmed_data_preprocessed_path, 'r') as f:
        preprocessed_records = json.load(f)
     

In [6]:
class PubMedDataset(Dataset):
    """Map-style dataset that yields the "text" entry of each stored record.

    The file at ``path`` is parsed as JSON once, in the constructor, and kept
    in memory. It is presumably a list of objects that each carry a "text"
    key (this is what the indexing below requires).
    """

    def __init__(self, path):
        with open(path, 'r') as handle:
            self.data = json.load(handle)

    def __len__(self):
        """Number of records loaded from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the "text" value of the record at position ``idx``."""
        record = self.data[idx]
        return record["text"]

## Embedding

In [11]:
# Tokenizer and encoder come from the same checkpoint; the encoder is moved to
# the selected device.
embedding_checkpoint = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(embedding_checkpoint)
model = AutoModel.from_pretrained(embedding_checkpoint)
model = model.to(device)


In [12]:
# Peek at the first two preprocessed records.
preprocessed_records[:2]

[{'_id': '38085539',
  'title': 'High Seebeck Coefficient Inorganic Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> Core/Polymer Cladding Fibers for Respiration and Body Temperature Monitoring.',
  'text': 'Wearable thermal sensors based on thermoelectric (TE) materials with high sensitivity and temperature resolution are extensively used in medical diagnosis, human-machine interfaces, and advanced artificial intelligence. However, their development is greatly limited by the lack of materials with both a high Seebeck coefficient and superior anticrystallization ability. Here, a new inorganic amorphous TE material, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub>, with a high Seebeck coefficient of 1109 μV/K is reported. Owing to the large difference between the glass-transition temperature and initial crystallization temperature, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> strongly inhibits crystallization during fiber fabrication by thermally codrawing a precast rod comprising a Ge<sub>1

In [7]:
# shuffle=False keeps the batches in file order; the bulk-indexing cell pairs
# embeddings with records by position.
dataset = PubMedDataset(pubmed_data_preprocessed_path)
dataloader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=False,
)

In [10]:
# Open question from the author: why not take the CLS token instead?
def mean_pooling(last_hidden_state, attention_mask):
    """Average the token vectors along dimension 1, ignoring masked positions.

    ``attention_mask`` is broadcast over the last dimension of
    ``last_hidden_state`` (presumably (batch, tokens, hidden) — confirm with
    the caller), so positions with mask 0 contribute nothing to the sum. The
    divisor is clamped to 1e-9 so a fully masked row yields zeros instead of
    a division by zero.
    """
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    masked_sum = (last_hidden_state * mask).sum(1)
    token_count = mask.sum(1).clamp(min=1e-9)
    return masked_sum / token_count

In [13]:
# Encode every abstract batch by batch. Gradient tracking is switched off
# because this is pure inference.
embeddings = []
with torch.no_grad():
    for batch_texts in dataloader:
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True).to(device)
        token_states = model(**inputs).last_hidden_state
        pooled = mean_pooling(token_states, inputs["attention_mask"]).to("cpu")
        # Iterating the pooled batch yields one vector per input text.
        embeddings += list(pooled)
# One row per abstract, in dataloader order.
embeddings_stacked = torch.stack(embeddings)


In [14]:
# Persist the stacked embedding matrix to disk.
embeddings_path = '/home/chris/University/NLP_project/pubmed_data_embeddings.bin'
torch.save(embeddings_stacked, embeddings_path)

In [37]:
# Index mapping: full-text fields for title and abstract (standard analyzer)
# plus a k-NN vector field that stores the sentence embedding.
index_mapping = {
  "settings": {
    "index.knn": True  # enable k-NN on this index
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "standard"
      },
      "text": {
        "type": "text",
        "analyzer": "standard"
      },
      "vector": {
        "type": "knn_vector",
        # Must equal the width of one embedding row.
        "dimension": int(embeddings_stacked.shape[1])
      }
    }
  }
}


# An index with this name may already exist (created by an earlier cell or a
# previous run). Creating it again would raise, and the existing index would
# keep its old mapping without the k-NN field, so drop it first. The
# bulk-indexing cells below repopulate it.
index_name = "pub_med_index"
if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)
client.indices.create(index=index_name, body=index_mapping)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'pub_med_index'}

In [47]:
def _bulk_pair(position, record):
    """Build the (action, document) pair that indexes one record.

    The action line names the target index and document id; the document line
    carries the title, the abstract text and the embedding stored at the same
    position in ``embeddings_stacked``.
    """
    action_line = {"index": {"_index": "pub_med_index", "_id": record["_id"]}}
    document_line = {
        "title": record["title"],
        "text": record["text"],
        "vector": embeddings_stacked[position].tolist(),
    }
    return (action_line, document_line)


# Only the first 1000 records are indexed.
actions = [
    _bulk_pair(position, record)
    for position, record in enumerate(preprocessed_records[:1000])
]

In [48]:
# Flatten the (action, document) pairs and serialise each dict as compact JSON,
# one object per line.
request = '\n'.join(
    json.dumps(item, indent=None, separators=(",", ":"))
    for tpl in actions
    for item in tpl
)

In [49]:
# Send the bulk request. A bulk call does not raise when individual documents
# are rejected; it reports them through the top-level "errors" flag and a
# per-item "error" object, so both have to be inspected before claiming success.
try:
    response = client.bulk(body=request, refresh=True)
except Exception as e:
    print(f"Failed to perform bulk request. Error: {e}")
else:
    if response.get("errors"):
        failed_items = [
            result
            for item in response.get("items", [])
            for result in item.values()
            if "error" in result
        ]
        first_error = failed_items[0]["error"] if failed_items else "unknown"
        print(
            f"Bulk request completed with {len(failed_items)} failed item(s). "
            f"First error: {first_error}"
        )
    else:
        print("Bulk request successful.")

Bulk request successful.


In [56]:
question = "What is the influence of alcohol on minors?"

# Embed the question the same way as the abstracts (same tokenizer, model and
# pooling). torch.no_grad() matches the corpus-embedding loop and avoids
# building an autograd graph for pure inference.
with torch.no_grad():
    inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True).to(device)
    query_outputs = mean_pooling(model(**inputs).last_hidden_state, inputs["attention_mask"]).to("cpu")
query_vector = query_outputs[0].tolist()
print(len(query_vector))

# Define the KNN search query; the same value is used for the number of
# neighbours ("k") and for the number of returned hits ("size").
top_k = 5
knn_query = {
    "size": top_k,
    "_source": ["title", "text"],
    "query": {
        "knn": {
            "vector": {
                "vector": query_vector,
                "k": top_k
            }
        }
    }
}

# Perform the KNN search
response = client.search(index=index_name, body=knn_query)

768


In [57]:
# Display the raw search response returned by the k-NN query.
response

{'took': 14,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 5, 'relation': 'eq'},
  'max_score': 0.097037144,
  'hits': [{'_index': 'pub_med_index',
    '_id': '38033004',
    '_score': 0.097037144,
    '_source': {'text': 'Alexithymia can be associated with worse addictive traits, while emotional intelligence is associated with better addictive outcomes. In Lebanon, the prevalence of cigarette and waterpipe smoking is on the rise, although people are aware of the associated harms. Also, around 11% of Lebanese adults have experienced alcohol use disorder (AUD). This study aimed to assess the association between alexithymia, emotional intelligence, smoking (cigarette and waterpipe), and AUD among a sample of Lebanese adults. A web-based cross-sectional study carried out between February and April 2020, during the lockdown period, enrolled 408 community-dwelling adults. The survey link was shared on social media to 

In [61]:
# Abstract text of the first returned hit.
first_hit = response['hits']['hits'][0]
first_hit['_source']['text']

'Alexithymia can be associated with worse addictive traits, while emotional intelligence is associated with better addictive outcomes. In Lebanon, the prevalence of cigarette and waterpipe smoking is on the rise, although people are aware of the associated harms. Also, around 11% of Lebanese adults have experienced alcohol use disorder (AUD). This study aimed to assess the association between alexithymia, emotional intelligence, smoking (cigarette and waterpipe), and AUD among a sample of Lebanese adults. A web-based cross-sectional study carried out between February and April 2020, during the lockdown period, enrolled 408 community-dwelling adults. The survey link was shared on social media to reach participants from all Lebanese districts/governorates. Taking antidepressants (Beta = 4.37) was significantly associated with more cigarette dependence, while female gender (Beta = -1.52) and having a high vs. low monthly income (Beta = 1.02) were significantly associated with less cigaret

In [62]:
# Abstract text of the second returned hit.
second_hit = response['hits']['hits'][1]
second_hit['_source']['text']

'To evaluate whether prenatal tobacco exposure (PTE) is related to poorer cognitive performance, abnormal brain morphometry, and whether poor cognitive performance is mediated by PTE-related structural brain differences. The Adolescent Brain Cognitive Development study dataset was used to compare structural MRI data and neurocognitive (NIH Toolbox<sup>®</sup>) scores in 9-to-10-year-old children with (n=620) and without PTE (n=10,989). We also evaluated whether PTE effects on brain morphometry mediated PTE effects on neurocognitive scores. Group effects were evaluated using Linear Mixed Models, covaried for socio-demographics and prenatal exposures to alcohol and/or marijuana, and corrected for multiple comparisons using the false-discovery rate (FDR). Compared to unexposed children, those with PTE had poorer performance (all p-values <0.05) on executive function, working memory, episodic memory, reading decoding, crystallized intelligence, fluid intelligence and overall cognition. Exp

In [63]:
# Abstract text of the third returned hit.
third_hit = response['hits']['hits'][2]
third_hit['_source']['text']

'The aims of this review are to provide a comprehensive overview of the definition and scope of pharmacoepidemiology, to summarize the study designs and methodologies used in the field, to discuss the future trends in the field and new methodologies to address bias and confounding, and finally to give some recommendations to clinicians interested in pharmacoepidemiologic research. Because drug efficacy and safety from randomized clinical trials do not reflect the real-world situation, pharmacoepidemiological studies on drug safety monitoring and drug effectiveness in large numbers of people are needed by healthcare professionals and regulatory institutions. We aim to highlight the importance of pharmacoepidemiologic research in informing evidence-based medicine and public health policy. The development of new designs and methodologies for the generation of valid evidence, as well as new initiatives to provide guidance and recommendations on how to incorporate real-world evidence into t

In [64]:
# Abstract text of the fourth returned hit.
fourth_hit = response['hits']['hits'][3]
fourth_hit['_source']['text']

"Compare by occurrence-era and age-group how opioid-related deaths (ORDs) and their counterpart evolved in Scotland versus England and Wales during 2006-2020. For Scotland, compare co-implication rates between ORDs and non-ORDs for any benzodiazepine; cocaine; gabapentin/pregabalin; and consider whether co-implication in ORDs depended on opioid-specificity. Cross-tabulations of drug misuse deaths (DMDs) obtained by 3-yearly occurrence-era (2006-2008 to 2018-2020) and age-group (under 25, 25-34, 35-44, 45-54, 55+ years) for England and Wales and subdivided by whether at least one opiate was mentioned on death certificate (DMD-Os or not); and of Scotland's opioid-related deaths (ORDs versus non-ORDs) together with i) co-implication by any benzodiazepine; or cocaine; gabapentin/pregabalin and ii) opioid-specificity of ORDs. ORD is defined by heroin/morphine (H) or methadone (M) or buprenorphine (B) being implicated in DMD. Per era between 2012-2014 and 2018-2020, Scotland's ORDs increased

In [65]:
# Abstract text of the fifth returned hit.
fifth_hit = response['hits']['hits'][4]
fifth_hit['_source']['text']

"The knowledge of the effects of organophosphate flame retardants on children's neurodevelopment is limited. The purpose of the present research is to evaluate the association between exposure to organophosphate flame retardants and children's neurodevelopment in two European cohorts involved in the Human Biomonitoring Initiative Aligned Studies. The participants were school-aged children belonging to the Odense Child Cohort (Denmark) and the PCB cohort (Slovakia). In each cohort, the children's neurodevelopment was assessed through the Full-Scale Intelligence Quotient score of the Wechsler Intelligence Scale for Children, using two different editions. The children's urine samples, collected at one point in time, were analyzed for several metabolites of organophosphate flame retardants. The association between neurodevelopment and each organophosphate flame retardant metabolite was explored by applying separate multiple linear regressions based on the approach of MM-estimation in each 

In [66]:

# Two Hugging Face pipelines: one for question answering over a context, one
# for summarising a retrieved abstract.
pipe_qa = pipeline(task="question-answering", model="deepset/roberta-base-squad2")
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [67]:
# Summarise the abstract of the first hit; the summary becomes the QA context.
text = response['hits']['hits'][0]['_source']['text']
summaries = summarizer(text, max_length=100, min_length=50, do_sample=False)
context = summaries[0]["summary_text"]


In [68]:
# Ask the question against the summarised context.
qa_input = {"context": context, "question": question}
pipe_qa(qa_input)

{'score': 0.0009504778427071869,
 'start': 35,
 'end': 57,
 'answer': 'worse addictive traits'}