In [None]:
!pip install sentencepiece
!pip install tokenizers
!pip install transformers

import torch.nn as nn
import json
import torch
import pandas as pd
import numpy as np
import transformers
from transformers import T5ForConditionalGeneration , T5Tokenizer  , AdamW
from pathlib import Path
from torch.utils.data import Dataset , DataLoader

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.2 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96
Collecting tokenizers
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.2 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.10.3
Collecting transformers
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.1 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |██████████████████

In [None]:
from google.colab import drive
drive.mount('/content/drive') #this part has to be commented out if used on cluster
!nvidia-smi


Mounted at /content/drive
Wed Sep  8 11:47:27 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P8    35W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------------------------------

In [None]:
# --- Run configuration -------------------------------------------------------
MODEL_NAME = "t5-small"        # pretrained checkpoint for both tokenizer and model
BATCH_SIZE = 8
accumulation_steps = 5         # batches accumulated before each optimizer step
N_EPOCHS = 25
NUM_EPOCHS = N_EPOCHS          # alias read by model_training

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

In [None]:
# Load the raw reranker records and show the first one to inspect its fields.
# NOTE: this path has to be adjusted to a local one if used on the cluster.
reranker_path = Path("/content/drive/MyDrive/reranker.json")
with reranker_path.open() as json_file:
  data = json.load(json_file)

data[0]

{'Answer': 'physician assistants are medical providers who are licensed to diagnose and treat illness and disease and to prescribe medication for patients',
 'Passage': 'The Physician Assistant Board or their representative may require proof or demonstration of competence from any physician assistant for any medical services performed. If a physician assistant determines a task, procedure or diagnostic problem exceeds his or her level of competence, then the physician assistant shall either consult with a physician or refer such cases to a physician. Click here to review a sample Delegation of Services Agreement. Question: What if a physician assistant works for more than one supervising physician at a hospital or clinic? Do we need to have separate DSAs for each supervising physician? Answer: The Board has had questions regarding how the DSA would be written if a physician assistant works for more than one supervising physician at a hospital or clinic. If the duties and medical servic

In [None]:
def extract_qa( path_1 : Path):
  """Read a JSON list of QA records and return them as a DataFrame.

  Each record must provide the keys "Query", "Passage", "Answer" and "id".

  Args:
    path_1: path of the JSON file to read.

  Returns:
    A DataFrame with the columns "question", "passage", "answer",
    "model_answer" (initialised to the empty string) and "id", one row per
    record, in file order.
  """
  with path_1.open() as handle:
    records = json.load(handle)

  rows = [
      {
          "question": record["Query"],
          "passage": record["Passage"],
          "answer": record["Answer"],
          "model_answer": '',  # filled in later with the model's generated answer
          "id": record["id"],
      }
      for record in records
  ]
  return pd.DataFrame(rows)
      

# Build the training frame, then preview question + passage for the first record.
train_df = extract_qa(Path("/content/drive/MyDrive/reranker.json"))

element = train_df.iloc[0]
" ".join([element["question"], element['passage']])

"What is a physician's assistant? The Physician Assistant Board or their representative may require proof or demonstration of competence from any physician assistant for any medical services performed. If a physician assistant determines a task, procedure or diagnostic problem exceeds his or her level of competence, then the physician assistant shall either consult with a physician or refer such cases to a physician. Click here to review a sample Delegation of Services Agreement. Question: What if a physician assistant works for more than one supervising physician at a hospital or clinic? Do we need to have separate DSAs for each supervising physician? Answer: The Board has had questions regarding how the DSA would be written if a physician assistant works for more than one supervising physician at a hospital or clinic. If the duties and medical services performed are consistent with each supervising physician, then one DSA can be written to include several supervising physicians. Each

In [None]:
class QADataset(Dataset):
  """Dataset that tokenizes (question, passage) pairs and their answers.

  Args:
    data: frame with the string columns "question", "passage" and "answer".
    tokenizer: callable tokenizer used for both source and target text; it
      must expose ``pad_token_id``.
    source_max_token_len: length the question/passage pair is padded or
      truncated to.
    target_max_token_len: length the answer is padded or truncated to.
  """

  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: "T5Tokenizer",
      source_max_token_len: int=396,
      target_max_token_len: int=32,
  ):
    # No trailing comma here: `self.tokenizer=tokenizer,` would store a 1-tuple.
    self.tokenizer=tokenizer
    self.data=data
    self.source_max_token_len=source_max_token_len
    self.target_max_token_len=target_max_token_len

  def __len__(self):
    """Return the number of rows in the underlying frame."""
    return len(self.data)

  def __getitem__(self,index:int):
    """Tokenize the row at position ``index``.

    Returns:
      A dict with "input_ids", "attention_mask" and "labels". Each tensor
      keeps the leading singleton dimension produced by ``return_tensors="pt"``
      for a single example, i.e. shape (1, max_len).
    """
    ##DATA ROW
    data_row=self.data.iloc[index]

    ##SOURCE ENCODING
    # Use the tokenizer given to this dataset, not a module-level global.
    source_encoding=self.tokenizer(
        data_row["question"],
        data_row["passage"],
        max_length=self.source_max_token_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt" )

    ##TARGET ENCODING
    target_encoding=self.tokenizer(
        data_row["answer"],
        max_length=self.target_max_token_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt" )

    # Replace padding positions with -100 so the loss skips them. The pad id is
    # taken from the tokenizer instead of being hard-coded as 0.
    labels=target_encoding["input_ids"]
    labels[labels==self.tokenizer.pad_token_id]=-100

    return dict(
        input_ids=source_encoding["input_ids"],
        attention_mask=source_encoding["attention_mask"],
        labels=labels,
    )


In [None]:
class QAmodel(nn.Module):
  """Wrapper module around the pretrained T5 model named by MODEL_NAME."""

  def __init__(self):
    super().__init__()
    pretrained = T5ForConditionalGeneration.from_pretrained(MODEL_NAME , return_dict=True)
    # Place the wrapped model on the globally selected device right away.
    self.model = pretrained.to(device)

  def forward(self , input_ids , attention_mask , labels ):
    """Run the wrapped model and return its output object unchanged."""
    return self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
    )

  def configure_optimizers(self):
    """Return an AdamW optimizer over all parameters with lr=0.0001."""
    optimizer = AdamW(self.parameters(),lr=0.0001)
    return optimizer



In [None]:

# Sanity check: build the dataset and loader, then inspect the shapes of one batch.
dataset=QADataset(train_df,tokenizer)
print(len(dataset))

# shuffle must be the boolean True; the string "True" only worked because any
# non-empty string is truthy.
data_module = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# One batch is enough to confirm the shapes; iterating the whole loader would
# tokenize every row just to print the same three lines repeatedly.
batch = next(iter(data_module))
print(torch.squeeze(batch["labels"]).shape)
print(batch["input_ids"].shape)
print(batch["attention_mask"].shape)

2020
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])
torch.Size([8, 1, 396])
torch.Size([8, 32])
torch.Size([8, 1, 396])

In [None]:
def model_training(data_module):
  """Fine-tune a fresh QAmodel with gradient accumulation and return it.

  Reads the module-level settings NUM_EPOCHS, accumulation_steps and device.

  Args:
    data_module: iterable of batches. Each batch is a mapping with the tensors
      "input_ids", "attention_mask" and "labels"; any leading or singleton
      dimensions are flattened so the model always receives (batch, seq_len).

  Returns:
    The trained QAmodel. The mean losses that are printed every 50 batches are
    also stored on the returned model as ``training_loss``.
  """
  T5=QAmodel().to(device)
  optimizer=T5.configure_optimizers()

  training_loss=[]

  def _as_2d(tensor):
    # torch.squeeze() without a dim would also drop the batch axis when a batch
    # holds a single example; reshape keeps exactly (batch, seq_len).
    return tensor.reshape(-1, tensor.size(-1))

  optimizer.zero_grad()
  for i in range (NUM_EPOCHS):
    print(f"epoch--{i}")
    loss_count=0
    k=0
    T5.train()
    for batch in data_module:

      input_ids=_as_2d(batch["input_ids"]).to(device)
      attention_mask=_as_2d(batch["attention_mask"]).to(device)
      labels=_as_2d(batch["labels"]).to(device)

      # Call the module itself (not .forward) so nn.Module hooks are honoured.
      output=T5(input_ids,attention_mask,labels)

      # Scale so the accumulated gradient is the mean over accumulation_steps batches.
      loss=output.loss/accumulation_steps
      loss.backward()
      k+=1

      if(k % accumulation_steps == 0):
        optimizer.step()
        optimizer.zero_grad()

      # Track the unscaled loss for reporting.
      loss_count+=output.loss.item()

      if(k % 50 ==0):
        mean_loss=loss_count/50
        print(f"average loss --:{mean_loss}")
        training_loss.append(mean_loss)
        loss_count=0

    # Apply gradients from the trailing batches of the epoch; otherwise they
    # would leak into the next epoch or be dropped after the final one.
    if(k % accumulation_steps != 0):
      optimizer.step()
      optimizer.zero_grad()

  T5.training_loss=training_loss
  return T5

In [None]:
# First training stage: fine-tune on the (question, passage) -> answer pairs.
model = model_training(data_module)


Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

epoch--0
average loss --:4.086854290962219
average loss --:3.849925547838211
average loss --:3.7359195411205293
average loss --:3.614553213119507
average loss --:3.5392633855342863
epoch--1
average loss --:3.4070584774017334
average loss --:3.4188349425792692
average loss --:3.304509735107422
average loss --:3.175950226187706
average loss --:3.1777968287467955
epoch--2
average loss --:3.056026482582092
average loss --:3.0732685208320616
average loss --:3.0503828167915343
average loss --:2.905544212460518
average loss --:2.912466874718666
epoch--3
average loss --:2.73279447555542
average loss --:2.8111242324113848
average loss --:2.7372071534395217
average loss --:2.6629266113042833
average loss --:2.657633674144745
epoch--4
average loss --:2.5187560260295867
average loss --:2.537011590600014
average loss --:2.4127118676900863
average loss --:2.3618388026952744
average loss --:2.4287737756967545
epoch--5
average loss --:2.272772526741028
average loss --:2.2624068766832353
average loss -

In [None]:
# Persist only the learned weights (state dict) to Drive.
checkpoint_path = Path('/content/drive/MyDrive/model.pt')
torch.save(model.state_dict(), checkpoint_path)


In [None]:
def generate_answer(data_row,model,source_max_token_len=80,max_answer_len=80):
  """Generate an answer string for one question/passage row.

  Uses the module-level ``tokenizer`` and ``device``.

  Args:
    data_row: mapping (e.g. a DataFrame row) with "question" and "passage".
    model: QAmodel whose wrapped ``model.model`` performs the generation.
    source_max_token_len: length the encoded question/passage pair is padded or
      truncated to; only the passage is truncated.
      NOTE(review): QADataset encodes training inputs with 396 tokens by
      default, so the default of 80 here shows the model much less of the
      passage than it saw during training. Confirm this is intended.
    max_answer_len: maximum length of the generated sequence.

  Returns:
    The decoded prediction(s) concatenated into a single string.
  """
  source_encoding=tokenizer(
    data_row["question"],
    data_row["passage"],
    max_length=source_max_token_len,
    padding="max_length",
    truncation="only_second",
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt" )
  source_encoding=source_encoding.to(device)

  # Generate in eval mode so dropout is disabled; model_training leaves the
  # model in train mode. The previous mode is restored afterwards.
  was_training=model.training
  model.eval()
  try:
    generated_ids = model.model.generate(
        input_ids=source_encoding["input_ids"],
        attention_mask=source_encoding["attention_mask"],
        num_beams=1,
        max_length=max_answer_len,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True
    )
  finally:
    if was_training:
      model.train()

  preds=[
         tokenizer.decode(generated_id , skip_special_tokens=True , clean_up_tokenization_spaces=True)
         for generated_id in generated_ids
  ]

  return "".join(preds)

In [None]:
# Pick one training row to compare the gold answer with the model's output.
sample = train_df.iloc[18]
sample.loc["question"]

"What are the educational requirements required to become a physician's assistant?"

In [None]:
# Gold answer for the selected row.
sample.loc["answer"]

"Complete your bachelor's degree (a science or healthcare related major is usually best); Gain experience either working or volunteering in a healthcare setting; Apply to ARC-PA accredited physician assistant programs; Complete a 2-3 year, master's level PA program;"

In [None]:
# Answer produced by the fine-tuned model for the same row.
generated_sample_answer = generate_answer(sample, model)
print(generated_sample_answer)


Complete your bachelor's degree (a science or occupational education) and become a licensed physician. You may apply to the state of New York State Department of Health


In [None]:
# Generate an answer for every training row and store it in "model_answer".
# Rows are iterated directly, so no positional lookup (iloc) is made with index
# labels; the two only coincide for a default RangeIndex.
train_df["model_answer"] = [
    generate_answer(row, model) for _, row in train_df.iterrows()
]


In [None]:
# Save the frame, including the generated answers, to Drive as JSON.
train_df_path = Path('/content/drive/MyDrive/train_df.json')
train_df.to_json(train_df_path)

In [None]:
def question_extractor( query_path : Path ):
  """Read a JSON list of conversation turns into a question DataFrame.

  Each entry must provide "Rewrite", "Answer", "Conversation_no" and "Turn_no".

  Args:
    query_path: path of the JSON file to read.

  Returns:
    A DataFrame with the columns "question" (the rewrite), "answer",
    "passage" (empty string) and "id", where id is
    "<Conversation_no>_<Turn_no>".
  """
  with query_path.open() as handle:
    entries = json.load(handle)

  def _to_row(entry):
    rewrite = entry["Rewrite"]
    gold_answer = entry["Answer"]
    turn_id = "_".join((str(entry['Conversation_no']), str(entry['Turn_no'])))
    return {
        "question": rewrite,
        "answer": gold_answer,
        "passage": '',  # not really a passage; later used to stack generated answers
        "id": turn_id,
    }

  return pd.DataFrame([_to_row(entry) for entry in entries])
      
def question_collector(question_df,train_df):
  """Stack generated answers into the "passage" column of question_df, in place.

  For every row of question_df, all "model_answer" values of train_df rows with
  the same "id" are appended to that row's "passage", each prefixed with
  " . ". Rows without a matching id are left untouched. Returns None.
  """
  for row_label in question_df.index:
    wanted_id = question_df.at[row_label, 'id']
    hits = train_df.loc[train_df['id'] == wanted_id, 'model_answer']
    if len(hits) == 0:
      continue
    # Append the matching answers in train_df order.
    stacked = str(question_df.at[row_label, "passage"])
    stacked += "".join(" . " + str(generated) for generated in hits)
    question_df.at[row_label, "passage"] = stacked








In [None]:
# Build the question frame and stack the generated answers into its "passage" column.
question_df = question_extractor(Path("/content/drive/MyDrive/qrecc_test.json"))
question_collector(question_df, train_df)

# Compare the gold answer with the stacked generated answers for the first question.
print(question_df.loc[0, 'answer'])
print(question_df.loc[0, 'passage'])

question_reduced = question_df.head(200)
question_reduced.head()

physician assistants are medical providers who are licensed to diagnose and treat illness and disease and to prescribe medication for patients
 . physician assistants are medical providers who may be assisting with diagnosing problems or illness, and should also consult the patient for consultation. . physician assistants are medical providers who are licensed to diagnose and treat illness, as well as other health professionals who may be involved in consultation or counseling. Physician . Licensed Nurses Assistant (AP) – An assistant who can be trained to diagnose and treat illness, disease or stroke. . Physician Assistants provide clinical research and treatment for patients who may be employed by the healthcare industry or local government. Physician assistants are licensed to diagnose, treat disease . physician assistants are medical providers who may prescribe medications for their patients, while physician associate programs generally have a better reputation as clinicians. Physi

Unnamed: 0,question,answer,passage,id
0,What is a physician's assistant?,physician assistants are medical providers who...,. physician assistants are medical providers ...,1_1
1,What are the educational requirements required...,Complete your bachelor's degree (a science or ...,. Complete your bachelor's degree (a science ...,1_2
2,What does it cost to become a physician's assi...,Average Cost Across all PA Schools for the 201...,. Average Cost Across all PA Schools for the ...,1_3
3,What's the average starting salary for a physi...,Typical starting salaries for physician associ...,. Typical starting salaries for physician ass...,1_4
4,What's the average starting salary for a physi...,An early career Physician Assistant (PA) with ...,. An early career Physician Assistant (PA) wi...,1_5


In [None]:
# Row counts: questions first, then training passages.
for frame in (question_df, train_df):
  print(frame.shape[0])

16451
2020


In [None]:
# model_training iterates over batches, so the dataset has to be wrapped in a
# DataLoader. Passing the bare QADataset yields single unbatched examples and
# makes the training call fail with a ValueError.
data_module2=DataLoader(
    QADataset(question_df,tokenizer),
    batch_size=BATCH_SIZE,
    shuffle=True,
)


In [None]:
# Second training stage on the stacked answers. Keep the returned model;
# otherwise the trained weights are discarded as soon as the cell finishes.
model_stage2 = model_training(data_module2)

epoch--0


ValueError: ignored