## Data Process

In [13]:
! pip3 install -q -U datasets==2.17.0

In [14]:
import json
import pandas as pd
from datasets import load_dataset

def data_process(filename):
  lines = []
  if filename == 'train':
    lines = load_dataset("nyu-mll/multi_nli", split="train[:20000]")
    lines.to_json("train.json")
    with open(r'/content/train.json') as f:
      lines = f.read().splitlines()
    line_dicts = [json.loads(line) for line in lines]
    print(line_dicts[0])

    df = pd.DataFrame(columns=["prompt", "answer"])

    for line_dict in line_dicts:
      # prompt = "Giving a premise: '" + line_dict["premise"] + "' Giving a hypothesis: '" \
      #       + line_dict["hypothesis"] + "' Predict whether the hypothesis is entailed" \
      #       + ", contradicted, or neutral given the premise."
      prompt = "Giving a premise: '" + line_dict["premise"] + "' The parse of the premise is:'" \
              + line_dict["premise_binary_parse"] + "' Giving a hypothesis: '" \
              + line_dict["hypothesis"] + "' The parse of the hypothesis is:'" +\
              line_dict["hypothesis_binary_parse"] + "' Predict whether the hypothesis is entailed" \
              + ", contradicted, or neutral given the premise."
      label = line_dict["label"]
      if label == 0:
        answer = "entailment"
      elif label == 1:
        answer = "neutral"
      elif label == 2:
        answer = "contradiction"
      # entailment (0), neutral (1), contradiction (2)
      df.loc[len(df)] = [prompt, answer]

    print(df)
    return df
  elif filename == 'evaluation_match':
    with open(r'/content/dev_matched_sampled-1.jsonl') as f:
      lines = f.read().splitlines()
  elif filename == 'evaluation_mismatch':
    with open(r'/content/dev_mismatched_sampled-1.jsonl') as f:
      lines = f.read().splitlines()

  line_dicts = [json.loads(line) for line in lines]
  print(line_dicts[0])

  df = pd.DataFrame(columns=["prompt", "answer"])

  for line_dict in line_dicts:
    # prompt = "Giving a premise: '" + line_dict["sentence1"] + "' Giving a hypothesis: '" \
    #         + line_dict["sentence2"] + "' Predict whether the hypothesis is entailed" \
    #         + ", contradicted, or neutral given the premise."
    prompt = "Giving a premise: '" + line_dict["sentence1"] + "' The parse of the premise is:'" \
              + line_dict["sentence1_binary_parse"] + "' Giving a hypothesis: '" \
              + line_dict["sentence2"] + "' The parse of the hypothesis is:'" +\
              line_dict["sentence2_binary_parse"] + "' Predict whether the hypothesis is entailed" \
              + ", contradicted, or neutral given the premise."
    answer = line_dict["gold_label"]
    df.loc[len(df)] = [prompt, answer]

  print(df)
  return df

In [15]:
train_df = data_process('train')
evaluation_match_df = data_process('evaluation_match')
evaluation_mismatch_df = data_process('evaluation_mismatch')

Creating json from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

{'promptID': 31193, 'pairID': '31193n', 'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.', 'premise_binary_parse': '( ( Conceptually ( cream skimming ) ) ( ( has ( ( ( two ( basic dimensions ) ) - ) ( ( product and ) geography ) ) ) . ) )', 'premise_parse': '(ROOT (S (NP (JJ Conceptually) (NN cream) (NN skimming)) (VP (VBZ has) (NP (NP (CD two) (JJ basic) (NNS dimensions)) (: -) (NP (NN product) (CC and) (NN geography)))) (. .)))', 'hypothesis': 'Product and geography are what make cream skimming work. ', 'hypothesis_binary_parse': '( ( ( Product and ) geography ) ( ( are ( what ( make ( cream ( skimming work ) ) ) ) ) . ) )', 'hypothesis_parse': '(ROOT (S (NP (NN Product) (CC and) (NN geography)) (VP (VBP are) (SBAR (WHNP (WP what)) (S (VP (VBP make) (NP (NP (NN cream)) (VP (VBG skimming) (NP (NN work)))))))) (. .)))', 'genre': 'government', 'label': 1}
                                                  prompt         answer
0      Giving a prem

## Seq-to-seq model: T5

## T5

In [16]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

from rich.table import Column, Table
from rich import box
from rich.console import Console

# define a rich console logger
console=Console(record=True)

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

def display_df(df):
  """display dataframe in ASCII format"""

  console=Console()
  table = Table(Column("source_text", justify="center" ), Column("target_text", justify="center"), title="Sample Data",pad_edge=False, box=box.ASCII)

  for i, row in enumerate(df.values.tolist()):
    table.add_row(row[0], row[1])

  console.print(table)

training_logger = Table(Column("Epoch", justify="center" ),
                        Column("Steps", justify="center"),
                        Column("Loss", justify="center"),
                        title="Training Status",pad_edge=False, box=box.ASCII)

In [17]:
class YourDataSetClass(Dataset):
  """
  Creating a custom dataset for reading the dataset and
  loading it into the dataloader to pass it to the neural network for finetuning the model

  """

  def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.summ_len = target_len
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]

  def __len__(self):
    return len(self.target_text)

  def __getitem__(self, index):
    source_text = str(self.source_text[index])
    target_text = str(self.target_text[index])

    #cleaning data so as to ensure data is in string type
    source_text = ' '.join(source_text.split())
    target_text = ' '.join(target_text.split())

    source = self.tokenizer.batch_encode_plus([source_text], max_length= self.source_len, pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt')
    target = self.tokenizer.batch_encode_plus([target_text], max_length= self.summ_len, pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt')

    source_ids = source['input_ids'].squeeze()
    source_mask = source['attention_mask'].squeeze()
    target_ids = target['input_ids'].squeeze()
    target_mask = target['attention_mask'].squeeze()

    return {
        'source_ids': source_ids.to(dtype=torch.long),
        'source_mask': source_mask.to(dtype=torch.long),
        'target_ids': target_ids.to(dtype=torch.long),
        'target_ids_y': target_ids.to(dtype=torch.long)
    }


In [18]:
def train(epoch, tokenizer, model, device, loader, optimizer):

  """
  Function to be called for training with the parameters passed from main function

  """

  model.train()
  for _,data in enumerate(loader, 0):
    y = data['target_ids'].to(device, dtype = torch.long)
    y_ids = y[:, :-1].contiguous()
    lm_labels = y[:, 1:].clone().detach()
    lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
    ids = data['source_ids'].to(device, dtype = torch.long)
    mask = data['source_mask'].to(device, dtype = torch.long)

    outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
    loss = outputs[0]

    if _%10==0:
      training_logger.add_row(str(epoch), str(_), str(loss))
      # console.print(training_logger)
      console.print(str(epoch) + str(_) + str(loss))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [19]:
def validate(epoch, tokenizer, model, device, loader):

  """
  Function to evaluate model for predictions

  """
  model.eval()
  predictions = []
  actuals = []
  with torch.no_grad():
      for _, data in enumerate(loader, 0):
          y = data['target_ids'].to(device, dtype = torch.long)
          ids = data['source_ids'].to(device, dtype = torch.long)
          mask = data['source_mask'].to(device, dtype = torch.long)

          generated_ids = model.generate(
              input_ids = ids,
              attention_mask = mask,
              max_length=150,
              num_beams=2,
              repetition_penalty=2.5,
              length_penalty=1.0,
              early_stopping=True
              )
          preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
          target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
          if _%10==0:
              console.print(f'Completed {_}')

          predictions.extend(preds)
          actuals.extend(target)
  return predictions, actuals

In [20]:
def T5Trainer(dataframe, source_text, target_text, model_params, output_dir="./outputs/" ):

  """
  T5 trainer

  """

  # Set random seeds and deterministic pytorch for reproducibility
  torch.manual_seed(model_params["SEED"]) # pytorch random seed
  np.random.seed(model_params["SEED"]) # numpy random seed
  torch.backends.cudnn.deterministic = True

  # logging
  console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

  # tokenzier for encoding the text
  tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

  # Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary.
  # Further this model is sent to device (GPU/TPU) for using the hardware.
  model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
  model = model.to(device)

  # logging
  console.log(f"[Data]: Reading data...\n")

  # Importing the raw dataset
  dataframe = dataframe[[source_text,target_text]]
  display_df(dataframe.head(2))


  # Creation of Dataset and Dataloader
  # train_size = 0.9
  #train_dataset=dataframe.sample(frac=train_size,random_state = model_params["SEED"])
  #val_dataset=dataframe.drop(train_dataset.index).reset_index(drop=True)
  #train_dataset = train_dataset.reset_index(drop=True)
  train_dataset = train_df
  val_dataset_match = evaluation_match_df
  val_dataset_mismatch = evaluation_mismatch_df

  console.print(f"FULL Dataset: {dataframe.shape}")
  console.print(f"TRAIN Dataset: {train_dataset.shape}")
  console.print(f"TEST Dataset Match: {val_dataset_match.shape}\n")
  console.print(f"TEST Dataset Mismatch: {val_dataset_mismatch.shape}\n")


  # Creating the Training and Validation dataset for further creation of Dataloader
  training_set = YourDataSetClass(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)
  val_set_match = YourDataSetClass(val_dataset_match, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)
  val_set_mismatch = YourDataSetClass(val_dataset_mismatch, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)

  # Defining the parameters for creation of dataloaders
  train_params = {
      'batch_size': model_params["TRAIN_BATCH_SIZE"],
      'shuffle': True,
      'num_workers': 0
      }


  val_params = {
      'batch_size': model_params["VALID_BATCH_SIZE"],
      'shuffle': False,
      'num_workers': 0
      }


  # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
  training_loader = DataLoader(training_set, **train_params)
  val_loader_match = DataLoader(val_set_match, **val_params)
  val_loader_mismatch = DataLoader(val_set_mismatch, **val_params)


  # Defining the optimizer that will be used to tune the weights of the network in the training session.
  optimizer = torch.optim.Adam(params =  model.parameters(), lr=model_params["LEARNING_RATE"])


  # Training loop
  console.log(f'[Initiating Fine Tuning]...\n')

  for epoch in range(model_params["TRAIN_EPOCHS"]):
      train(epoch, tokenizer, model, device, training_loader, optimizer)

  console.log(f"[Saving Model]...\n")
  #Saving the model after training
  path = os.path.join(output_dir, "model_files")
  model.save_pretrained(path)
  tokenizer.save_pretrained(path)


  # evaluating test dataset 1
  console.log(f"[Initiating Validation Match]...\n")
  for epoch in range(model_params["VAL_EPOCHS"]):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader_match)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv(os.path.join(output_dir,'predictions_match.csv'))

  console.save_text(os.path.join(output_dir,'logs_match.txt'))

  console.log(f"[Validation Completed.]\n")
  console.print(f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n""")
  console.print(f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions_match.csv')}\n""")
  console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs_match.txt')}\n""")

  # evaluating test dataset 2
  console.log(f"[Initiating Validation Mismatch]...\n")
  for epoch in range(model_params["VAL_EPOCHS"]):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader_mismatch)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv(os.path.join(output_dir,'predictions_mismatch.csv'))

  console.save_text(os.path.join(output_dir,'logs_mismatch.txt'))

  console.log(f"[Validation Completed.]\n")
  console.print(f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n""")
  console.print(f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions_mismatch.csv')}\n""")
  console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs_mismatch.txt')}\n""")

In [21]:
model_params={
    "MODEL":"t5-base",             # model_type: t5-base/t5-large
    "TRAIN_BATCH_SIZE":16,          # training batch size
    "VALID_BATCH_SIZE":16,          # validation batch size
    "TRAIN_EPOCHS":3,              # number of training epochs
    "VAL_EPOCHS":1,                # number of validation epochs
    "LEARNING_RATE":1e-4,          # learning rate
    "MAX_SOURCE_TEXT_LENGTH":512,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH":50,   # max length of target text
    "SEED": 42                     # set seed for reproducibility

}


In [22]:
T5Trainer(dataframe=train_df, source_text="prompt", target_text="answer", model_params=model_params, output_dir="outputs")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:

! cp /content/outputs /content/drive/MyDrive/Cap-T5-newp2  -r

In [24]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Cap-T5-newp2/outputs/predictions_match.csv')
success_rate = []
for index, row in df.iterrows():
    if row['Generated Text'] == row['Actual Text']:
      success_rate.append(1)
    else:
      success_rate.append(0)
accuracy = sum(success_rate)/len(success_rate)
print(f"Accuracy: {accuracy*100:.2f}%")

df = pd.read_csv('/content/drive/MyDrive/Cap-T5-newp2/outputs/predictions_mismatch.csv')
success_rate = []
for index, row in df.iterrows():
    if row['Generated Text'] == row['Actual Text']:
      success_rate.append(1)
    else:
      success_rate.append(0)
accuracy = sum(success_rate)/len(success_rate)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 38.80%
Accuracy: 38.68%
