In [5]:
!pip install sentencepiece
!pip install tokenizers
!pip install transformers

import urllib
import torch.nn as nn
import json
import torch
import pandas as pd
import numpy as np
import transformers
from transformers import BertTokenizer, BertLMHeadModel, BertConfig  , AdamW 
from pathlib import Path
from torch.utils.data import Dataset , DataLoader



In [6]:

!nvidia-smi


Tue Nov 30 14:26:29 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8    28W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
N_EPOCHS=25
accumulation_steps=5
NUM_EPOCHS=N_EPOCHS
BATCH_SIZE=4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = BertConfig.from_pretrained("bert-base-cased")
MODEL_NAME= BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
tokenizer = tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [8]:
### DOWNLOAD DATA FROM MIKEDOES (A LOT)


url='https://raw.githubusercontent.com/MikeDoes/ETH_NLP_Project/main/fin_num_merged.json'
response = urllib.request.urlopen(url)

  
data=json.loads(response.read())  

with open('/content/data.json', 'w') as f:
    json.dump(data, f)


print(data[0])

{'paragraph': "In care delivery our clinical leaders are applying clinical physician support based on evidence-based guidelines that promote better health and ensure the right care at the right time in the right setting. Today 99% of OptumCare patients in our advanced form of Medicare value arrangements are in 4-star plans or better and OptumCare's average Net Promoter Score is nearly 80 evidence of outstanding clinical outcomes and patient experiences.", 'entities': [{'target_num': '80', 'category': 'other', 'offset_start': 373, 'offset_end': 375, 'claim': 0}]}


In [9]:
###BUILDING PD DATAFRAME

def extract_qa( path_1 : Path):
  with path_1.open() as json_file:
    data=json.load(json_file)
  data_rows=[]
  for element in data:
    paragraph=element["paragraph"]
    target=element["entities"][0]["target_num"]
    category=element["entities"][0]["category"]
    offset_start=element["entities"][0]["offset_start"]
    offset_end=element["entities"][0]["offset_end"]

    data_rows.append({
        "paragraph":paragraph,
        "target":target,
        "category":category,
        "offset_start":offset_start,
        "offset_end":offset_end,
        "model_prediction_category":'',
        "model_prediction_entity":''

    })

  return pd.DataFrame(data_rows)
      

train_df=extract_qa(Path("/content/data.json"))

element=train_df.iloc[0]
print(element)

paragraph                    In care delivery our clinical leaders are appl...
target                                                                      80
category                                                                 other
offset_start                                                               373
offset_end                                                                 375
model_prediction_category                                                     
model_prediction_entity                                                       
Name: 0, dtype: object


In [10]:
class QADataset(Dataset):
  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: tokenizer,
      source_max_token_len: int=396,
      target_max_token_len: int=8,
  ):

      self.tokenizer=tokenizer,
      self.data=data
      self.source_max_token_len=source_max_token_len
      self.target_max_token_len=target_max_token_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self,index:int):

    ##DATA ROW
    data_row=self.data.iloc[index]


    ##SOURCE ENCODING
    source_encoding=tokenizer(
    data_row["paragraph"],
    max_length=self.source_max_token_len,
    padding="max_length",
    truncation=True, 
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt" )

    ##TARGET ENCODING
    target_encoding=tokenizer(
    data_row["target"],
    max_length=self.target_max_token_len,
    padding="max_length",
    truncation=True, 
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt" )

    labels=target_encoding["input_ids"]
    labels[labels==0]=-100


    return dict(
        input_ids=source_encoding["input_ids"],
        attention_mask=source_encoding["attention_mask"],
        labels=labels,
    )


In [11]:
class QAmodel(nn.Module):

  def __init__(self):
    super().__init__()
    self.model = MODEL_NAME
    

  def forward(self , input_ids , attention_mask , labels ):

    output=self.model(
      input_ids=input_ids,
      attention_mask=attention_mask,
      labels=labels
    )   

    
    return output


  def configure_optimizers(self):
    return AdamW(self.parameters(),lr=0.0001)



In [12]:

dataset=QADataset(train_df,tokenizer,)
print(dataset.__len__())
data_module = DataLoader(dataset, batch_size=BATCH_SIZE,shuffle="True")
for batch in data_module:
    print(batch["labels"].shape)
    print(batch["input_ids"].shape)
    print(batch["attention_mask"].shape)

409
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1, 8])
torch.Size([4, 1, 396])
torch.Size([4, 1, 396])
torch.Size([4, 1

In [13]:
def model_training(data_module):
  BERT=QAmodel()


  BERT=BERT.to(device)
  optimizer=BERT.configure_optimizers()

  training_loss=[]
  _validation_loss=[]
  
  optimizer.zero_grad()
  for i in range (NUM_EPOCHS):
    print(f"epoch--{i}")
    loss_count=0
    k=0
    BERT.train()
    for batch in data_module:

      input_ids=torch.squeeze(batch["input_ids"])
      attention_mask=torch.squeeze(batch["attention_mask"])
      labels=torch.squeeze(batch["labels"])

      print(input_ids.shape)
      print(attention_mask.shape)
      print(labels.shape)
      
      input_ids=input_ids.to(device)
      attention_mask=attention_mask.to(device)
      labels=labels.to(device)
      

      output=BERT.forward(input_ids,attention_mask,labels)
      loss=output.loss/accumulation_steps
      loss.backward()
      k+=1

      if(k % accumulation_steps == 0):
        optimizer.step()
        optimizer.zero_grad()

      with torch.no_grad():
        loss_count+=loss.item()*accumulation_steps


        if(k % 50 ==0):
          mean_loss=loss_count/50
          print(f"average loss --:{mean_loss}")
          training_loss.append(mean_loss)
          loss_count=0

  return BERT

In [14]:
model=model_training(data_module)


epoch--0
torch.Size([4, 396])
torch.Size([4, 396])
torch.Size([4, 8])


ValueError: ignored

In [None]:
torch.save(model.state_dict(), Path('/content/drive/MyDrive/model.pt'))


In [None]:
def generate_answer(data_row,model):
  source_encoding=tokenizer(
    data_row["question"],
    data_row["passage"],
    max_length=80,
    padding="max_length",
    truncation="only_second", 
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt" )
  source_encoding=source_encoding.to(device)

  generated_ids = model.model.generate(
      input_ids=source_encoding["input_ids"],
      attention_mask=source_encoding["attention_mask"],
      num_beams=1,
      max_length=80,
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True,
      use_cache=True
  )

  preds=[
         tokenizer.decode(generated_id , skip_special_tokens=True , clean_up_tokenization_spaces=True)
         for generated_id in generated_ids
  ]

  return "".join(preds)

In [None]:
sample=train_df.iloc[18]
sample["question"]

In [None]:
sample["answer"]

In [None]:
print(generate_answer(sample,model))


In [None]:
for i in train_df.index:
  train_df.at[i,"model_answer"]=generate_answer(train_df.iloc[i],model)


In [None]:
train_df.to_json(Path('/content/drive/MyDrive/train_df.json'))

In [None]:
def question_extractor( query_path : Path ):
  with query_path.open() as json_file:
    questions=json.load(json_file)


  question_rows=[]
  for Q in questions:
    question=Q["Rewrite"]
    answer=Q["Answer"]
    id=str(Q['Conversation_no'])+"_"+str(Q['Turn_no'])



    question_rows.append({
        "question":question,
        "answer":answer,
        "passage":'', #not really a passage , will be used to stack previous answers
        "id":id

    })

  return pd.DataFrame(question_rows)
      
def question_collector(question_df,train_df):
  #dataframes of questions and passages. passages also
  #contain the answers generated by the model. collects all the answer corresponding
  #to a certain id and stucks them into question_df "passage" key. Which will be 
  #used for model training
  for i in question_df.index:
    matching_answers=train_df.loc[train_df['id']==question_df.at[i,'id']]
    for item in matching_answers['model_answer']:
      question_df.at[i,"passage"]=str(question_df.at[i,"passage"]) +" . "+ str(item) #assembles the answers when they match the question








In [None]:
question_df=question_extractor(Path("/content/drive/MyDrive/qrecc_test.json"))
question_collector(question_df,train_df)
print(question_df.at[0,'answer'])
print(question_df.at[0,'passage'])
question_reduced=question_df.iloc[0:200]
question_reduced.head()

In [None]:
print(len(question_df))
print(len(train_df))

In [None]:
data_module2=QADataset(question_df,tokenizer,)


In [None]:
model_training(data_module2)