<a href="https://colab.research.google.com/github/J-Gann/QA-INLPT-WS2023/blob/main/INLPT_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install biopython
%pip install sentencepiece
%pip install accelerate

Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/3.1 MB[0m [31m1.6 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/3.1 MB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m34.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from Bio import Entrez
import json
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel, BertForSequenceClassification, AutoModelForQuestionAnswering
import torch
import scipy
from torch.utils.data import DataLoader
import pickle
from transformers import pipeline
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
import sentencepiece
import accelerate

In [None]:
# Colab-only step: attach Google Drive so the data and model files that the
# following cells read and write under /content/drive are available.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Run on the first CUDA GPU when torch reports one, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device  # last expression of the cell, so the notebook displays the chosen device

'cuda:0'

## Load Data

In [None]:
# Raw PubMed download, cached on Google Drive so the Entrez queries only run
# when no cached copy exists yet.
pubmed_raw_path = '/content/drive/MyDrive/Colab Data/pubmed_data.json'

if os.path.exists(pubmed_raw_path):
  # Cache hit: reuse the previously downloaded records.
  with open(pubmed_raw_path, 'r') as f:
    records = json.load(f)
else:
  Entrez.email = "jonas.gann@gmail.com"
  # Step 1: search PubMed and collect the ids of matching articles.
  # More information about search field tags: https://pubmed.ncbi.nlm.nih.gov/help/#using-search-field-tags
  handle = Entrez.esearch(db="pubmed", term="intelligence[tiab]", retmax="10000")
  try:
    record = Entrez.read(handle)
  finally:
    handle.close()  # always release the response handle, even if parsing fails
  id_string = ",".join(record["IdList"])
  # Step 2: fetch the full records for those ids as XML.
  # info about rettype and retmode: https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly
  handle = Entrez.efetch(db="pubmed", id=id_string, retmode="xml")
  try:
    records = Entrez.read(handle)
  finally:
    handle.close()
  # Make sure the cache folder exists so the finished download is not lost
  # to a failing open() on a fresh Drive.
  os.makedirs(os.path.dirname(pubmed_raw_path), exist_ok=True)
  with open(pubmed_raw_path, 'w') as f:
    json.dump(records, f)

## Preprocessing

In [None]:
# Preprocessed articles (id, title, abstract text), cached on Google Drive.
preprocessed_path = '/content/drive/MyDrive/Colab Data/pubmed_data_preprocessed.json'

if os.path.exists(preprocessed_path):
  # Cache hit: reuse the previously preprocessed records.
  with open(preprocessed_path, 'r') as f:
    preprocessed_records = json.load(f)
else:
  # Bind the article list to its own name instead of overwriting `records`,
  # so this cell can be re-run without failing on an already-unwrapped list.
  pubmed_articles = records["PubmedArticle"]
  preprocessed_records = []
  for pubmed_article in pubmed_articles:
    citation = pubmed_article["MedlineCitation"]
    if "Abstract" not in citation["Article"]:
      continue  # articles without an abstract are skipped
    preprocessed_records.append({
        "id": citation["PMID"],
        "title": citation["Article"]["ArticleTitle"],
        # some abstracts are split in an array
        "text": " ".join(citation["Article"]["Abstract"]["AbstractText"]),
    })
  with open(preprocessed_path, 'w') as f:
    json.dump(preprocessed_records, f)

In [None]:
class PubMedDataset(Dataset):
    """Dataset over a JSON file holding a list of records with a "text" field.

    The file at ``path`` is parsed once on construction and kept in
    ``self.data``; indexing returns only the ``"text"`` value of a record.
    """

    def __init__(self, path):
        """Read and parse the JSON file at ``path``."""
        with open(path, 'r') as json_file:
            self.data = json.load(json_file)

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``"text"`` field of the record at position ``idx``."""
        record = self.data[idx]
        return record["text"]

## Embedding

In [None]:
# Copies of the tokenizer and encoder are kept on Google Drive: they are
# saved there the first time and loaded from there whenever the folder exists.
tokenizer_dir = '/content/drive/MyDrive/Colab Data/sentence-transformers/all-mpnet-base-v2_tokenizer'
model_dir = '/content/drive/MyDrive/Colab Data/sentence-transformers/all-mpnet-base-v2_model'

if os.path.exists(tokenizer_dir):
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
else:
  tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
  tokenizer.save_pretrained(tokenizer_dir)

if os.path.exists(model_dir):
  model = AutoModel.from_pretrained(model_dir).to(device)
else:
  model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2').to(device)
  model.save_pretrained(model_dir)

In [None]:
# Batches of 64 abstracts in stored order (shuffle=False): later cells index
# `preprocessed_records` by embedding position, so the order must match.
dataset = PubMedDataset(path='/content/drive/MyDrive/Colab Data/pubmed_data_preprocessed.json')
dataloader = DataLoader(dataset=dataset, batch_size=64, shuffle=False)

In [None]:
def mean_pooling(last_hidden_state, attention_mask):
    """Average the token vectors of each sequence, ignoring masked positions.

    The attention mask is broadcast over the hidden dimension, masked
    positions are zeroed out, and the sum along dimension 1 is divided by the
    number of unmasked positions (clamped to at least 1e-9 so an all-zero
    mask does not divide by zero).

    Open question from the original notebook: why not take the CLS token
    instead? (presumably mean pooling is what this encoder expects - TODO confirm)
    """
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts

In [None]:
# One pooled embedding per abstract, cached on Google Drive.
embeddings_path = '/content/drive/MyDrive/Colab Data/pubmed_data_embeddings.bin'

if os.path.exists(embeddings_path):
    # NOTE(review): torch.load unpickles the file - only load caches you wrote yourself.
    embeddings_stacked = torch.load(embeddings_path)
else:
    embeddings = []
    with torch.no_grad():  # inference only, no gradients needed
        for batch_texts in dataloader:
            inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True).to(device)
            out = model(**inputs)
            # Pool on the model's device, then move to CPU for collection.
            pooled = mean_pooling(out.last_hidden_state, inputs["attention_mask"]).to("cpu")
            embeddings.extend(pooled)  # appends one row tensor per abstract
    embeddings_stacked = torch.stack(embeddings)
    torch.save(embeddings_stacked, embeddings_path)

## Question Answering
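1. Retrieve the paper(s) most similar to the question and summarize their abstracts.
2. Answer the question with a question-answering model, using the summary as context.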

In [None]:
# Question-answering pipeline: called below with a context and a question.
pipe_qa = pipeline(task="question-answering", model="deepset/roberta-base-squad2")
# Summarization pipeline: used below to condense retrieved abstracts.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
question = "What is the influence of alcohol on minors?"
top_k_papers = 1  # number of best-matching abstracts summarized into the QA context

# Embed the question with the same tokenizer, encoder and pooling as the abstracts.
# Inference only, so run under no_grad to avoid building an autograd graph.
with torch.no_grad():
    inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True).to(device)
    query_outputs = mean_pooling(model(**inputs).last_hidden_state, inputs["attention_mask"]).to("cpu")

# Rank all abstracts by cosine similarity to the question, best match first.
sim = torch.cosine_similarity(embeddings_stacked, query_outputs)
# Deliberately not named `sorted`, which would shadow the Python builtin.
ranking = torch.argsort(sim, descending=True)

# Summarize the top abstracts; .tolist() yields plain ints for list indexing.
summaries = []
for index in ranking[:top_k_papers].tolist():
    text = preprocessed_records[index]["text"]
    summaries.append(summarizer(text, max_length=100, min_length=50, do_sample=False)[0]["summary_text"])
# Join with a space so consecutive summaries do not run together when top_k_papers > 1.
context = " ".join(summaries)

pipe_qa({"context": context, "question": question})

{'score': 0.2534666955471039,
 'start': 214,
 'end': 254,
 'answer': 'harms their health and academic progress'}

## Answer Extraction
1. Split the most relevant abstracts into sentences and find the sentences most similar to the question.
2. Summarize those sentences; the summary is the answer.


In [None]:
# Summarization pipeline used by the answer-extraction cell below.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

In [None]:
question = "What is the influence of alcohol on minors?"
top_k_papers = 3     # best-matching abstracts that are split into sentences
top_k_sentences = 3  # best-matching sentences handed to the summarizer

# Embed the question (inference only, so no autograd graph is needed).
with torch.no_grad():
    inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True).to(device)
    query_outputs = mean_pooling(model(**inputs).last_hidden_state, inputs["attention_mask"]).to("cpu")

# Rank all abstracts by cosine similarity to the question, best match first.
sim = torch.cosine_similarity(embeddings_stacked, query_outputs)
# Deliberately not named `sorted`, which would shadow the Python builtin.
paper_ranking = torch.argsort(sim, descending=True)

# Split the top abstracts into sentence fragments, dropping empty ones.
sentences = []
for index in paper_ranking[:top_k_papers].tolist():
    text = preprocessed_records[index]["text"]
    sentences.extend(fragment for fragment in text.split(". ") if fragment.strip())

# Embed every fragment with the same encoder and pooling as the question.
with torch.no_grad():
    sentences_tokens = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True).to(device)
    out = model(**sentences_tokens)
    sentence_embeddings = mean_pooling(out.last_hidden_state, sentences_tokens["attention_mask"]).to("cpu")

sentence_sim = torch.cosine_similarity(sentence_embeddings, query_outputs)
sentence_ranking = torch.argsort(sentence_sim, descending=True)

# Re-assemble the best fragments. Slicing (instead of a fixed range) copes
# with fewer than top_k_sentences fragments.
relevant_text = ""
for index in sentence_ranking[:top_k_sentences].tolist():
    sentence = sentences[index]
    # The last fragment of an abstract keeps its own terminator after the
    # split, so only add a period where one is missing (avoids "..").
    if not sentence.endswith((".", "!", "?")):
        sentence += "."
    relevant_text += sentence + " "

summary = summarizer(relevant_text, max_length=50, min_length=20, do_sample=False)[0]["summary_text"]
summary

'Adolescence is a peak period for substance use initiation and a critical time for preventing substance use problems. Currently, a proportion of adolescents use alcohol, tobacco, and illicit drugs, which inevitably harms their health and academic progress.'

## Natural Question Answering
- see: https://github.com/facebookresearch/atlas?tab=readme-ov-file#base-task
- NQ-finetuned-Atlas-base


In [None]:
#  TODO