<a href="https://colab.research.google.com/github/MikeDoes/ETH_NLP_Project/blob/main/NLP_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
!pip install sentencepiece
!pip install tokenizers
!pip install transformers

import urllib
import torch.nn as nn
import json
import torch
import pandas as pd
import numpy as np
import transformers
from transformers import BertTokenizer, BertForPreTraining, BertConfig  , AdamW  
from pathlib import Path
from torch.utils.data import Dataset , DataLoader



In [38]:

!nvidia-smi


NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [39]:
N_EPOCHS=25
accumulation_steps=5
NUM_EPOCHS=N_EPOCHS
BATCH_SIZE=4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# config = BertConfig.from_pretrained("bert-base-cased")
MODEL_NAME= BertForPreTraining.from_pretrained('bert-base-cased') #config=config
tokenizer= BertTokenizer.from_pretrained('bert-base-cased')

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
### DOWNLOAD DATA FROM MIKEDOES (A LOT)


url='https://raw.githubusercontent.com/MikeDoes/ETH_NLP_Project/main/fin_num_merged.json'
response = urllib.request.urlopen(url)

  
data=json.loads(response.read())  

print(len( data ))

with open('/content/data.json', 'w') as f:
    json.dump(data, f)


print(data[0])

409
{'paragraph': "In care delivery our clinical leaders are applying clinical physician support based on evidence-based guidelines that promote better health and ensure the right care at the right time in the right setting. Today 99% of OptumCare patients in our advanced form of Medicare value arrangements are in 4-star plans or better and OptumCare's average Net Promoter Score is nearly 80 evidence of outstanding clinical outcomes and patient experiences.", 'entities': [{'target_num': '80', 'category': 'other', 'offset_start': 373, 'offset_end': 375, 'claim': 0}]}


In [41]:
###BUILDING PD DATAFRAME

def extract_qa( path_1 : Path):
  with path_1.open() as json_file:
    data=json.load(json_file)
  data_rows=[]
  for element in data:
    paragraph=element["paragraph"]
    target=element["entities"][0]["target_num"]
    category=element["entities"][0]["category"]
    offset_start=element["entities"][0]["offset_start"]
    offset_end=element["entities"][0]["offset_end"]

    data_rows.append({
        "paragraph":paragraph,
        "target":target,
        "category":category,
        "offset_start":offset_start,
        "offset_end":offset_end,
        "model_prediction_category":'',
        "model_prediction_entity":''

    })

  return pd.DataFrame(data_rows)
      

train_df=extract_qa(Path("/content/data.json"))

element=train_df.iloc[0]
print(element)

paragraph                    In care delivery our clinical leaders are appl...
target                                                                      80
category                                                                 other
offset_start                                                               373
offset_end                                                                 375
model_prediction_category                                                     
model_prediction_entity                                                       
Name: 0, dtype: object


In [42]:
class QADataset(Dataset):
  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: tokenizer,
      source_max_token_len: int=512,
      target_max_token_len: int=8,
  ):

      self.tokenizer=tokenizer,
      self.data=data
      self.source_max_token_len=source_max_token_len
      self.target_max_token_len=target_max_token_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self,index:int):

    ##DATA ROW
    data_row=self.data.iloc[index]


    ##SOURCE ENCODING
    source_encoding=tokenizer(
    data_row["paragraph"],
    max_length=self.source_max_token_len,
    padding="max_length",
    truncation=True, 
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt" )

    ##TARGET ENCODING
    target_encoding=tokenizer(
    data_row["target"],
    max_length=self.target_max_token_len,
    padding="max_length",
    truncation=True, 
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt" )

    labels=target_encoding["input_ids"]
    labels[labels==0]=-100


    return dict(
        input_ids=source_encoding["input_ids"],
        attention_mask=source_encoding["attention_mask"],
        labels=labels,
    )


In [43]:
class QAmodel(nn.Module):

  def __init__(self):
    super().__init__()
    self.model = MODEL_NAME
    

  def forward(self , input_ids , attention_mask , labels ):

    output=self.model(
      input_ids=input_ids,
      attention_mask=attention_mask,
      labels=labels
    )   

    
    return output


  def configure_optimizers(self):
    return AdamW(self.parameters(),lr=0.0001)



In [44]:

dataset=QADataset(train_df,tokenizer,)
print(dataset.__len__())
data_module = DataLoader(dataset, batch_size=BATCH_SIZE,shuffle="True")


409


In [47]:
def model_training(data_module):
  BERT=QAmodel()


  BERT=BERT.to(device)
  optimizer=BERT.configure_optimizers()

  training_loss=[]
  _validation_loss=[]
  
  optimizer.zero_grad()
  for i in range (NUM_EPOCHS):
    print(f"epoch--{i}")
    loss_count=0
    k=0
    BERT.train()
    for batch in data_module:

      input_ids=torch.squeeze(batch["input_ids"])
      attention_mask=torch.squeeze(batch["attention_mask"])
      labels=torch.squeeze(batch["labels"])

      print(input_ids.shape)
      print(attention_mask.shape)
      print(labels.shape)
      
      input_ids=input_ids.to(device)
      attention_mask=attention_mask.to(device)
      labels=labels.to(device)
      

      output=BERT.forward(input_ids,attention_mask,labels)
      loss=output.loss/accumulation_steps
      loss.backward()
      k+=1

      if(k % accumulation_steps == 0):
        optimizer.step()
        optimizer.zero_grad()

      with torch.no_grad():
        loss_count+=loss.item()*accumulation_steps


        if(k % 50 ==0):
          mean_loss=loss_count/50
          print(f"average loss --:{mean_loss}")
          training_loss.append(mean_loss)
          loss_count=0

  return BERT

In [48]:
model_training(data_module)

epoch--0
torch.Size([4, 512])
torch.Size([4, 512])
torch.Size([4, 8])


TypeError: ignored