In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 67.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 75.6 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 53.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  At

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import torch.nn.functional as F
import requests
from bs4 import BeautifulSoup
import requests
import json
import time

In [None]:
class QuestionAnswer():
  """Extractive question answering over a passage with a SQuAD-finetuned model.

  The tokenizer and model are loaded once in the constructor and reused by
  every call to `answer`.
  """

  def __init__(self, model_name="bert-large-uncased-whole-word-masking-finetuned-squad"):
    """Load the tokenizer and question-answering model named `model_name`."""
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)

  def answer(self, text, question, max_length=512):
    '''
    Find the span of `text` that best answers `question`.

    Args:
      text: passage to search (e.g. one paragraph of a research paper).
      question: natural-language question about the passage.
      max_length: maximum number of tokens fed to the model; only the
        passage is truncated to fit, never the question.

    Returns:
      (answer, score): the decoded answer string and a 0-dim tensor holding
      the heuristic confidence computed below.
    '''

    inputs = self.tokenizer.encode_plus(
        question,
        text,
        add_special_tokens=True,
        truncation="only_second",  # over-long passages are cut instead of overflowing the model
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"].tolist()[0]

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
      answer_start_scores, answer_end_scores = self.model(**inputs, return_dict=False)

    answer_start_scores = F.softmax(answer_start_scores, dim=1)
    answer_end_scores = F.softmax(answer_end_scores, dim=1)

    # Only passage tokens are valid answer positions. Without this the argmax can
    # land on the question / [CLS] and the "answer" is the question echoed back.
    # Skipped when the tokenizer supplies no segment ids (or they are all zero).
    token_type_ids = inputs["token_type_ids"] if "token_type_ids" in inputs else None
    if token_type_ids is not None and bool(token_type_ids.any()):
      outside_passage = token_type_ids == 0
      answer_start_scores = answer_start_scores.masked_fill(outside_passage, 0.0)
      answer_end_scores = answer_end_scores.masked_fill(outside_passage, 0.0)

    # Most likely first token of the answer, and one past the most likely last token.
    answer_start = int(torch.argmax(answer_start_scores))
    answer_end = int(torch.argmax(answer_end_scores)) + 1

    # Heuristic confidence: squared start/end probabilities scaled by sequence length.
    score = (torch.max(answer_start_scores)**2 + torch.max(answer_end_scores)**2) / (answer_start_scores.size(1) * 2)

    # An end before the start yields an empty slice and therefore an empty answer.
    answer = self.tokenizer.convert_tokens_to_string(self.tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer, score

In [None]:
##Web scraping and making API requests
def makeRequest(num_req):
  """Yield, per search-result link, the short section paragraphs of that article.

  Runs a fixed ASCO Publications search with `pageSize=num_req`, follows every
  `a.ref.nowrap` link on the result page and, for each linked page, collects
  the first `<p>` of every `div.NLM_sec.NLM_sec_level_1` section that has
  fewer than 512 space-separated words.

  Args:
    num_req: page size sent to the search endpoint.

  Yields:
    list[str]: paragraph texts for one linked page (possibly empty).
  """
  web_path = f'https://ascopubs.org/action/doSearch?AllField=radiotherapy%2C+follow-up%2C+survivors&target=default&pageSize={num_req}&startPage=0'

  # A timeout keeps a stalled connection from hanging the notebook indefinitely.
  res = requests.get(web_path, timeout=30)
  search_page = BeautifulSoup(res.content, "html.parser")
  for anchor in search_page.find_all("a", class_="ref nowrap"):
    link = 'https://ascopubs.org' + anchor['href']
    resp = requests.get(link, timeout=30)
    article = BeautifulSoup(resp.content, "html.parser")
    content = []
    for section in article.find_all("div", class_="NLM_sec NLM_sec_level_1"):
      paragraph = section.find("p")
      if paragraph is None:
        # Section without a paragraph: report it and move on, as before.
        print('Just minor error')
        continue
      if len(paragraph.text.split(' ')) < 512:
        content.append(paragraph.text)
    yield content



In [None]:
# NOTE(review): a credential-like string used to sit in a comment here and was removed;
# if it was a real key, rotate it.
def analyzeMedicalDoc(documents, api_key=None, max_tries=20, poll_interval=1.0):
  """Submit documents to the Text Analytics health-entities job API and poll for the result.

  Args:
    documents: iterable of texts; each is sent as one English document with
      ids "1", "2", ... in iteration order.
    api_key: subscription key. When omitted, the AZURE_TEXT_ANALYTICS_KEY
      environment variable is used.
    max_tries: maximum number of status polls (at least one is always made).
    poll_interval: seconds to wait between polls.

  Returns:
    The JSON body of the last status response. Callers must check its
    'status' field; it is not guaranteed to be 'succeeded'.

  Raises:
    ValueError: if no subscription key is available.
    requests.HTTPError: if the job submission is rejected.
  """
  import os  # local import so this cell does not depend on edits to the import cell

  subscription_key = (api_key or os.environ.get("AZURE_TEXT_ANALYTICS_KEY", "")).strip()
  if not subscription_key:
    raise ValueError(
        "No subscription key: pass api_key=... or set AZURE_TEXT_ANALYTICS_KEY")

  header = {"Ocp-Apim-Subscription-Key": subscription_key, "Content-Type": "application/json", "Accept":"application/json"}
  doc_file = [
      {"language": "en", "id": f"{i+1}", "text": f"{doc}"}
      for i, doc in enumerate(documents)
  ]
  param = json.dumps({"documents": doc_file}, indent = 4)

  resp = requests.post('https://jachanya.cognitiveservices.azure.com/text/analytics/v3.1/entities/health/jobs', headers = header, data = param)
  # Fail with the HTTP error instead of a KeyError on the missing header below.
  resp.raise_for_status()

  # The job status URL is returned in the 'operation-location' header.
  operation_location = resp.headers['operation-location']
  # NOTE(review): the original also sent the 'apim-request-id' header value as a
  # `jobId` query parameter; kept as-is — confirm whether the service needs it.
  jobId = resp.headers.get('apim-request-id')
  params = {"jobId": f"{jobId}"} if jobId is not None else {}
  headers = {"Ocp-Apim-Subscription-Key": subscription_key, "Content-Type": "application/json"}

  attempts = max(1, max_tries)
  for attempt in range(attempts):
    res = requests.get(f'{operation_location}', headers = headers, params = params)
    if res.json()['status'] == 'succeeded':
      break
    # Give the job time to progress; skip the wait after the final poll.
    if attempt < attempts - 1:
      time.sleep(poll_interval)

  # Remove the job on the service side once we are done polling (best effort, result ignored).
  requests.delete(f'{operation_location}', headers = headers, params = params)
  return res.json()

In [None]:
def getSymDiagTr(documents):
  """Extract symptom, diagnosis and treatment names from a paper's paragraphs.

  Sends `documents` through `analyzeMedicalDoc` and groups the returned
  entities by category. For each entity the normalised 'name' is used when
  present and non-empty, otherwise the raw 'text'.

  Args:
    documents: iterable of paragraph texts belonging to one paper.

  Returns:
    (symptoms_and_signs, diagnoses, treatments) as three sets of strings, or
    None when the analysis did not succeed or an error occurred (the error is
    printed, not raised).
  """
  try:
    resp = analyzeMedicalDoc(documents)
    if resp['status'] != 'succeeded':
      return None

    # One bucket per category of interest; entities of other categories are ignored.
    buckets = {'SymptomOrSign': set(), 'Diagnosis': set(), 'TreatmentName': set()}
    for document in resp["results"]["documents"]:
      for entity in document["entities"]:
        bucket = buckets.get(entity['category'])
        if bucket is not None:
          bucket.add(entity.get('name') or entity['text'])
    return buckets['SymptomOrSign'], buckets['Diagnosis'], buckets['TreatmentName']
  except Exception as exc:
    # Best effort per paper: report what went wrong and let the caller skip it.
    print(f'Some random error: {exc!r}')
  return None

In [None]:
# Build the question-answering helper; constructing it loads the tokenizer and
# model weights (hence the download progress bars in this cell's output).
researcher = QuestionAnswer()

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

In [None]:
def find_similarity(patient, nbr_papers):
  """Return the index of the search result that best matches a patient.

  Each paper yielded by `makeRequest(nbr_papers)` is analysed with
  `getSymDiagTr`. For symptoms, diagnoses and treatments separately, a paper
  scores (number of terms shared with the patient) * (number of terms found
  in the paper). Each of the three score rows is divided once by the largest
  term count seen for that row, and the rows are summed.

  Args:
    patient: object exposing `symptoms`, `diagnosis` and `treatment` iterables.
    nbr_papers: number of search results to request and compare.

  Returns:
    0-dim tensor: position, in `makeRequest` order, of the best paper. Papers
    with no extracted text or a failed analysis keep a score of 0 but still
    occupy their position, so the index lines up with the search results.
  """
  similarity = torch.zeros(3, nbr_papers)
  # Row order matches the tuple returned by getSymDiagTr.
  patient_terms = (set(patient.symptoms), set(patient.diagnosis), set(patient.treatment))
  largest = [0, 0, 0]

  for index, texts in enumerate(makeRequest(nbr_papers)):
    if index >= nbr_papers:
      # The result page produced more links than columns were allocated for.
      break
    if len(texts) == 0:
      continue
    result = getSymDiagTr(texts)
    if result is None:
      continue
    for row, (wanted, found) in enumerate(zip(patient_terms, result)):
      largest[row] = max(largest[row], len(found))
      similarity[row, index] = len(wanted.intersection(found)) * len(found)

  # Normalise once, after all papers are scored; a row with no terms at all is
  # left as zeros rather than divided by zero (which would produce NaN).
  for row, biggest in enumerate(largest):
    if biggest > 0:
      similarity[row, :] /= biggest

  return torch.argmax(torch.sum(similarity, dim = 0))

In [None]:
class Patient():
  """A patient record: identity plus lists of symptoms, diagnoses and treatments."""

  def __init__(self,name, id):
    """Create an empty record for the patient called `name` with identifier `id`."""
    # Keep the identity; the constructor previously accepted and then dropped it.
    self.name = name
    self.id = id
    self.symptoms = []
    self.diagnosis = []
    self.treatment = []

  def add_symptoms(self, symp):
    """Append one symptom term to the record."""
    self.symptoms.append(symp)

  def add_diagnosis(self, diagnosis):
    """Append one diagnosis term to the record."""
    self.diagnosis.append(diagnosis)

  def add_treatment(self, treatment):
    """Append one treatment term to the record."""
    self.treatment.append(treatment)


In [None]:
# Demo: describe one patient, then rank 30 search results against that description.
name, id = 'Musa', '1029r387ufhwks-33rt'

pa = Patient(name, id)
pa.add_symptoms('risk factors')
pa.add_diagnosis('Cessation of life')
pa.add_treatment('Therapeutic radiology procedure')

# Index of the best-matching paper; consumed by the query cell further down.
max_sc = find_similarity(pa, 30)

Some random error
Some random error
Some random error


In [None]:
def getMostRelatedPaper(num_req):
  """Return the short section paragraphs of the last paper on a search page.

  Runs the fixed ASCO Publications search with `pageSize=num_req`, follows the
  last `a.ref.nowrap` link on the result page and collects the first `<p>` of
  every `div.NLM_sec.NLM_sec_level_1` section that has fewer than 512
  space-separated words.

  Args:
    num_req: page size sent to the search endpoint; the last link on that
      page is the paper that gets fetched.

  Returns:
    list[str]: paragraph texts, or an empty list when the search page has no
    matching links.
  """
  web_path = f'https://ascopubs.org/action/doSearch?AllField=radiotherapy%2C+follow-up%2C+survivors&target=default&pageSize={num_req}&startPage=0'

  # A timeout keeps a stalled connection from hanging the notebook indefinitely.
  res = requests.get(web_path, timeout=30)
  search_page = BeautifulSoup(res.content, "html.parser")
  anchors = search_page.find_all("a", class_="ref nowrap")
  if not anchors:
    # Nothing to follow; previously this fell through to a request for an empty URL.
    return []

  link = 'https://ascopubs.org' + anchors[-1]['href']
  resp = requests.get(link, timeout=30)
  article = BeautifulSoup(resp.content, "html.parser")
  content = []
  for section in article.find_all("div", class_="NLM_sec NLM_sec_level_1"):
    paragraph = section.find("p")
    if paragraph is None:
      # Section without a paragraph: report it and move on, as before.
      print('Just minor error')
      continue
    if len(paragraph.text.split(' ')) < 512:
      content.append(paragraph.text)
  return content

In [None]:
def docQueryPaper(questions, max_sc, qa=None):
  """Ask each question of every paragraph of the selected paper and print the answers.

  Args:
    questions: iterable of question strings.
    max_sc: 0-based position of the paper to query, as returned by
      `find_similarity` (an int or a 0-dim tensor).
    qa: optional, already constructed `QuestionAnswer`; pass one to avoid
      loading the model again. A new instance is created when omitted.
  """
  if qa is None:
    qa = QuestionAnswer()

  # getMostRelatedPaper fetches the LAST paper of a page of the given size, so a
  # 0-based position needs a page size of position + 1.
  # NOTE(review): assumes one result link per paper on the search page — confirm.
  contents = getMostRelatedPaper(int(max_sc) + 1)
  for question in questions:
    for content in contents:
      answer, _ = qa.answer(content, question)
      print(question+ ' ' +answer)


In [None]:
# Query the paper selected above (max_sc); one line is printed per extracted paragraph.
docQueryPaper(['what is this paper about?'], max_sc)

what is this paper about? the volume of breast tissue exposed to radiation and the influence of gonadotoxic chemotherapy ( ct )
what is this paper about? we compared the incidence of bc with that in the general population
what is this paper about? [CLS] what is this paper about? [SEP]
what is this paper about? shorter duration of intact ovarian function after irradiation is associated with a significant reduction of the risk for bc
what is this paper about? the author ( s ) indicated no potential conflicts of interest
what is this paper about? conception and design
