<a href="https://colab.research.google.com/github/MikeDoes/ETH_NLP_Project/blob/main/NLP_T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
!pip install sentencepiece
!pip install tokenizers
!pip install transformers

import json
import urllib
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from torch.utils.data import Dataset , DataLoader
from transformers import T5ForConditionalGeneration , T5Tokenizer  , AdamW



In [24]:

!nvidia-smi


Mon Dec 13 08:26:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    57W / 149W |   5209MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [25]:
# --- Run configuration -------------------------------------------------------
# Pretrained checkpoint shared by the tokenizer and the model.
MODEL_NAME = "t5-small"
BATCH_SIZE = 8

# Both spellings of the epoch count are kept in sync.
N_EPOCHS = 25
NUM_EPOCHS = N_EPOCHS

# Number of batches whose gradients are summed before each optimizer step.
accumulation_steps = 1

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [26]:
# Download and preprocess the TRAINING dataset.
# Every paragraph becomes one example whose target string lists each entity
# as "<target_num>-<category> $ ".

url='https://raw.githubusercontent.com/MikeDoes/ETH_NLP_Project/main/fin_num_train.json'
# The context manager closes the HTTP response once the payload has been read.
with urllib.request.urlopen(url) as response:
  unprocessed_traindataset=json.loads(response.read())

processed_traindataset=[]

for element in unprocessed_traindataset:
  # Named `paragraph` rather than `input`: assigning to `input` at notebook
  # level would shadow the builtin input() in every later cell.
  paragraph=element['paragraph']
  target=''.join(
      entity['target_num'] + '-' + entity['category'] + ' $ '
      for entity in element['entities']
  )
  processed_traindataset.append({
      'input': paragraph,
      'target': target
  })

with open('/content/train_dataset.json', 'w') as f:
    json.dump(processed_traindataset, f)


In [27]:
# Download and preprocess the VALIDATION dataset.
# Same transformation as the training split, but stored under its own variable
# names so re-running this cell does not overwrite the training variables.

url='https://raw.githubusercontent.com/MikeDoes/ETH_NLP_Project/main/fin_num_validate.json'
# The context manager closes the HTTP response once the payload has been read.
with urllib.request.urlopen(url) as response:
  unprocessed_validatedataset=json.loads(response.read())

processed_validatedataset=[]

for element in unprocessed_validatedataset:
  # Named `paragraph` rather than `input`: assigning to `input` at notebook
  # level would shadow the builtin input() in every later cell.
  paragraph=element['paragraph']
  target=''.join(
      entity['target_num'] + '-' + entity['category'] + ' $ '
      for entity in element['entities']
  )
  processed_validatedataset.append({
      'input': paragraph,
      'target': target
  })

with open('/content/validate_dataset.json', 'w') as f:
    json.dump(processed_validatedataset, f)

In [28]:
###BUILDING PD DATAFRAME

def extract_dataframe( path_1 : Path):
  """Load a JSON list of examples from `path_1` into a DataFrame.

  Only the 'input' and 'target' fields of each example are kept, in that
  column order; any other keys are ignored.
  """
  records = json.loads(path_1.read_text())
  return pd.DataFrame([
      {'input': record['input'], 'target': record['target']}
      for record in records
  ])
      

# Read the two preprocessed splits back from disk as DataFrames.
train_df, validate_df = (
    extract_dataframe(Path("/content/train_dataset.json")),
    extract_dataframe(Path("/content/validate_dataset.json")),
)


In [29]:
class QADataset(Dataset):
  """Torch dataset that tokenizes (input, target) text pairs for T5 training.

  Each item is a dict with ``input_ids`` and ``attention_mask`` for the source
  text and ``labels`` for the target text. The tensors keep the leading
  dimension of size 1 that the tokenizer produces with ``return_tensors="pt"``.
  """

  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: "T5Tokenizer",
      source_max_token_len: int=396,
      target_max_token_len: int=32,
  ):
    """
    Args:
      data: frame with an 'input' and a 'target' text column.
      tokenizer: tokenizer used to encode both columns.
      source_max_token_len: pad/truncate length for the 'input' text.
      target_max_token_len: pad/truncate length for the 'target' text.
    """
    # No trailing comma: `self.tokenizer=tokenizer,` would store a 1-tuple.
    self.tokenizer=tokenizer
    self.data=data
    self.source_max_token_len=source_max_token_len
    self.target_max_token_len=target_max_token_len

  def __len__(self):
    """Return the number of rows in the underlying frame."""
    return len(self.data)

  def _encode(self, text, max_length: int):
    """Tokenize `text`, padded and truncated to exactly `max_length` tokens."""
    return self.tokenizer(
        text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

  def __getitem__(self,index:int):
    """Return the encoded source and label tensors for row `index`."""
    data_row=self.data.iloc[index]

    # Use the tokenizer handed to this dataset, not a notebook-level global.
    source_encoding=self._encode(data_row["input"], self.source_max_token_len)
    target_encoding=self._encode(data_row["target"], self.target_max_token_len)

    # Padding positions are set to -100 so the loss ignores them.
    pad_token_id=self.tokenizer.pad_token_id
    if pad_token_id is None:
      pad_token_id=0
    labels=target_encoding["input_ids"].clone()
    labels[labels==pad_token_id]=-100

    return dict(
        input_ids=source_encoding["input_ids"],
        attention_mask=source_encoding["attention_mask"],
        labels=labels,
    )


In [30]:
class QAmodel(nn.Module):
  """nn.Module wrapper around a pretrained T5ForConditionalGeneration."""

  def __init__(self):
    super().__init__()
    # Load the MODEL_NAME checkpoint and place it on the configured device.
    pretrained = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
    self.model = pretrained.to(device)

  def forward(self, input_ids, attention_mask, labels):
    """Run the wrapped model and return its output object unchanged."""
    return self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
    )

  def configure_optimizers(self):
    """Build an AdamW optimizer over all parameters with lr=1e-4."""
    learning_rate = 0.0001
    return AdamW(self.parameters(), lr=learning_rate)



In [31]:

# Wrap both splits in tokenizing datasets.
train_dataset=QADataset(train_df,tokenizer)
validate_dataset=QADataset(validate_df,tokenizer)

# `shuffle` expects a bool. The string "True" only worked because any
# non-empty string is truthy (so "False" would have shuffled as well).
# drop_last=True keeps every batch at exactly BATCH_SIZE rows.
train_module = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

# No shuffling for validation: combined with drop_last=True, a shuffled loader
# would drop a different tail each epoch, so per-epoch losses would be
# computed on different examples.
validate_module = DataLoader(validate_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)

In [32]:
def _prepare_batch(batch):
  """Flatten each tensor in `batch` to (batch, seq_len) and move it to `device`."""
  prepared={}
  for key in ("input_ids", "attention_mask", "labels"):
    tensor=batch[key]
    # reshape instead of torch.squeeze(): squeeze() without a dim would also
    # drop the batch dimension whenever a batch holds a single example.
    prepared[key]=tensor.reshape(tensor.size(0), -1).to(device)
  return prepared


def model_training(data_module, validation_module=None):
  """Fine-tune a fresh QAmodel and print the validation loss after each epoch.

  Args:
    data_module: iterable of training batches; each batch is a dict with
      'input_ids', 'attention_mask' and 'labels' tensors.
    validation_module: iterable of validation batches in the same format.
      Defaults to the notebook-level `validate_module`.

  Returns:
    The trained QAmodel instance.
  """
  if validation_module is None:
    validation_module=validate_module

  T5=QAmodel().to(device)
  optimizer=T5.configure_optimizers()

  optimizer.zero_grad()
  for i in range (NUM_EPOCHS):
    print(f"epoch--{i}")

    # ---- training pass ----
    loss_count=0
    k=0
    T5.train()
    for batch in data_module:
      output=T5(**_prepare_batch(batch))
      # Scale so gradients summed over `accumulation_steps` batches average out.
      loss=output.loss/accumulation_steps
      loss.backward()
      k+=1

      if(k % accumulation_steps == 0):
        optimizer.step()
        optimizer.zero_grad()

      # Undo the scaling so the reported value is the per-batch loss.
      loss_count+=loss.item()*accumulation_steps
      if(k % 10 ==0):
        mean_loss=loss_count/10
        print(f"average loss --:{mean_loss}")
        loss_count=0

    # ---- validation pass ----
    validation_loss=0
    k=0
    T5.eval()
    with torch.no_grad():
      for batch in validation_module:
        output=T5(**_prepare_batch(batch))
        # Accumulate this batch's own loss, not the last training `loss`.
        validation_loss+=output.loss.item()
        k+=1

    if k:
      print(f"validation loss: {validation_loss/k} ")
    else:
      print("validation loss: n/a (no validation batches)")

  return T5

In [None]:
# Fine-tune on the training loader; model_training returns the trained QAmodel.
model=model_training(train_module)

In [47]:
def generate_prediction(data_row,model):
  """Generate the model's output text for a single dataset row.

  Args:
    data_row: mapping or Series with an "input" text field.
    model: wrapper exposing the generation model as `.model` (e.g. a QAmodel).

  Returns:
    The decoded generated sequences joined into one string.
  """
  source_encoding=tokenizer(
    data_row["input"],
    max_length=512,
    padding="max_length",
    # A single sequence is encoded here. "only_second" targets the second
    # sequence of a pair, so it would leave an over-long input untruncated.
    truncation=True,
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt" )
  source_encoding=source_encoding.to(device)

  # Greedy decoding (num_beams=1, no sampling). top_p, length_penalty and
  # early_stopping only apply to sampling or beam search, so they are omitted.
  generated_ids = model.model.generate(
      input_ids=source_encoding["input_ids"],
      attention_mask=source_encoding["attention_mask"],
      num_beams=1,
      max_length=128,
      min_length=5,
      repetition_penalty=2.5,
      use_cache=True
  )

  preds=[
         tokenizer.decode(generated_id , skip_special_tokens=True , clean_up_tokenization_spaces=True)
         for generated_id in generated_ids
  ]

  return "".join(preds)

In [49]:
# Compare the model's prediction with the reference target for one training row,
# then show the source paragraph.
sample = train_df.iloc[1]
for text in (generate_prediction(sample, model), sample["target"], sample["input"]):
  print(text)

2018-date $ 10-change $ 9-change $ 1.5-change $ 8.2-money $ 2019.-date $ 23-relative $ 2018.-
2018-date $ 10-change $ 9-change $ 1.5-change $ 8.2-money $ 2019.-date $ 23-relative $ 2017.-date $ 11-relative $ 100-money $ 
Turning to Optum's financial results. Full year 2018 revenues surpassed $100 billion for the first time. Revenue growth of over $10 billion for the year accelerated to 11% from 9% in 2017. And likewise our operating margins once again strengthened across the Optum portfolio with our overall operating earnings growing more than $1.5 billion or 23% to $8.2 billion reflecting the leverage of Optum's scale businesses and putting us in a strong baseline earnings position entering 2019.
