In [None]:
!pip install transformers
!pip install evaluate
!pip install rouge

import re
import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import spacy
import string
import evaluate  # Bleu
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast

import warnings
warnings.filterwarnings("ignore")

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m90.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
# --- Pretrained checkpoint -------------------------------------------------
# Tokenizer and seq2seq model both come from the "t5-base" checkpoint.
TOKENIZER = T5TokenizerFast.from_pretrained("t5-base")
MODEL = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)

# --- Optimisation ----------------------------------------------------------
# Adam over every model parameter with a fixed learning rate.
OPTIMIZER = Adam(MODEL.parameters(), lr=1e-5)
BATCH_SIZE = 4

# --- Sequence budgets (in tokens) ------------------------------------------
Q_LEN = 256   # max length of the encoded (question, context) input
T_LEN = 32    # max length of the encoded target answer

# --- Hardware --------------------------------------------------------------
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# NOTE: only the first manual question/answer pair of each event is appended for now.
def data_prepare(path='train.json', verbose=True):
    """Build question-answering examples from an annotated JSON file.

    The file is read as a mapping ``doc_id -> record`` where each record has a
    ``"text"`` string and an ``"events"`` list. Every event provides a
    ``"trigger"`` string and a ``"manual"`` list of dicts with ``"question"``
    and ``"answer"`` entries.

    For each event that has at least one manual pair, every whole-word,
    case-insensitive occurrence of the trigger in the document text is
    replaced by ``<b>{trigger}</b>`` and one example is emitted from the
    FIRST manual pair only. Events without manual pairs are skipped entirely
    (their trigger is not marked either).

    Args:
        path: Location of the JSON file. Defaults to ``'train.json'``.
        verbose: When true (default), print each document id as it is
            processed.

    Returns:
        A list of dicts with the keys ``"context"``, ``"question"`` and
        ``"answer"``, in file order.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        corpus = json.load(f)

    articles = []
    for key, value in corpus.items():
        doc = value["text"]
        if verbose:
            print(key)
        for event in value["events"]:
            manual = event.get("manual")
            if not manual:
                # Covers a missing key, None and the empty list.
                continue

            trigger = event["trigger"]
            if trigger:
                # re.escape: the trigger is literal text, not a pattern. Without
                # it "3.5" would also match "3x5" and "(" would raise re.error.
                pattern = rf"\b{re.escape(trigger)}\b"
                marked = f"<b>{trigger}</b>"
                # A callable replacement is inserted verbatim, so backslashes in
                # the trigger are not interpreted as regex template escapes.
                # NOTE(review): matches take the trigger's own casing, and the
                # markup accumulates in `doc` across the events of a document
                # (a repeated trigger gets wrapped again) - confirm intended.
                doc = re.sub(pattern, lambda _m: marked, doc, flags=re.IGNORECASE)

            first_pair = manual[0]
            articles.append({
                "context": doc,
                "question": first_pair["question"],
                "answer": first_pair["answer"],
            })
    return articles

In [None]:
# Turn the prepared example records into a DataFrame (one row per example)
# in a single step, so `data` is only ever bound to the final frame.
data = pd.DataFrame(data_prepare())

wiki_ied_bombings_0_news_7
scenario_en_122
scenario_en_kairos_13
scenario_en_kairos_35
scenario_en_kairos_14
scenario_en_kairos_65
scenario_en_kairos_44
scenario_en_kairos_86
scenario_en_130
scenario_en_kairos_18
scenario_en_121
scenario_en_67
scenario_en_kairos_0
scenario_en_kairos_43
50_VOA_EN_NW_2015.06.25.2836393
8_VOA_EN_NW_2014.05.30.1925959
scenario_en_kairos_37
suicide_ied_110
scenario_en_kairos_46
K0C041NHW
49_VOA_EN_NW_2015.04.15.2720171
road_ied_7
wiki_mass_car_bombings_1_news_9
55_VOA_EN_NW_2014.06.30.1947653
7_VOA_EN_NW_2017.03.07.3753687
road_ied_13
backpack_ied_16
27_VOA_EN_NW_2015.04.06.2708308
backpack_ied_18
scenario_en_kairos_38
66_VOA_EN_NW_2013.05.30.1671716
suicide_ied_12
suicide_ied_106
suicide_ied_105
scenario_en_123
wiki_ied_bombings_5_news_1
31_VOA_EN_NW_2015.05.15.2768933
scenario_en_37
road_ied_4
scenario_en_kairos_41
wiki_ied_bombings_3_news_5
wiki_ied_bombings_1_news_8
JC002YBVY
scenario_en_131
wiki_mass_car_bombings_0_news_10
wiki_mass_car_bombings_1_news

In [None]:
class QA_Dataset(Dataset):
    """Torch dataset that tokenizes (question, context) -> answer examples.

    Args:
        tokenizer: Callable tokenizer returning ``"input_ids"`` and
            ``"attention_mask"`` lists. Its ``pad_token_id`` attribute, when
            present, identifies padding in the targets.
        dataframe: Frame with ``"question"``, ``"context"`` and ``"answer"``
            columns. Any index is accepted; rows are addressed by position.
        q_len: Fixed token length of the encoded (question, context) input.
        t_len: Fixed token length of the encoded answer.
    """

    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len = q_len
        self.t_len = t_len
        self.data = dataframe
        self.questions = self.data["question"]
        self.context = self.data["context"]
        self.answer = self.data['answer']

    def __len__(self):
        """Number of examples (rows) in the dataset."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Return the tokenized example at POSITION ``idx``.

        Returns:
            Dict of ``torch.long`` tensors: ``input_ids`` and
            ``attention_mask`` for the encoder input, ``labels`` (answer ids
            with padding replaced by -100) and ``decoder_attention_mask``.
        """
        # Samplers hand out positions 0..len-1. Label-based `series[idx]`
        # raises KeyError (or silently picks another row) when the frame keeps
        # a non-default index, e.g. after train_test_split; .iloc is positional.
        idx = int(idx)
        question = self.questions.iloc[idx]
        context = self.context.iloc[idx]
        answer = self.answer.iloc[idx]

        # padding="max_length" already pads to the fixed size; the deprecated
        # `pad_to_max_length` flag was redundant and has been dropped.
        question_tokenized = self.tokenizer(question, context, max_length=self.q_len, padding="max_length",
                                            truncation=True, add_special_tokens=True)
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length",
                                          truncation=True, add_special_tokens=True)

        # Ask the tokenizer for its pad id instead of assuming 0.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0

        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        # -100 is the label value the loss ignores, so padding adds no loss.
        labels[labels == pad_id] = -100

        return {
            "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
        }

In [None]:
# Hold out 20% of the examples for validation (fixed seed for repeatability).
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Each split gets its own dataset. RandomSampler only uses len(data_source)
# and yields positions 0..len-1, so sampling `train_data.index` /
# `val_data.index` over one shared dataset made both loaders read the FIRST
# rows of `data`: the split was ignored and every validation row was also a
# training row. reset_index gives each split a clean 0..n-1 index so that
# positions and labels agree.
train_dataset = QA_Dataset(TOKENIZER, train_data.reset_index(drop=True), Q_LEN, T_LEN)
val_dataset = QA_Dataset(TOKENIZER, val_data.reset_index(drop=True), Q_LEN, T_LEN)

# Dataset over all examples, kept under its original name for any later cell
# that still refers to it; the loaders below no longer use it.
qa_dataset = QA_Dataset(TOKENIZER, data, Q_LEN, T_LEN)

train_sampler = RandomSampler(train_dataset)
val_sampler = RandomSampler(val_dataset)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

In [None]:
NUM_EPOCHS = 10

# Loss accumulators; reset at the start of every epoch so that the printed
# figures are per-epoch averages rather than a running mean over all epochs.
train_loss = 0
val_loss = 0
train_batch_count = 0
val_batch_count = 0

# Moving the model is loop-invariant: do it once, not once per epoch.
MODEL.to(DEVICE)

for epoch in range(NUM_EPOCHS):
    train_loss = 0
    val_loss = 0
    train_batch_count = 0
    val_batch_count = 0

    # ---- Training pass ----------------------------------------------------
    MODEL.train()
    for batch in tqdm(train_loader, desc="Training batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        train_loss += outputs.loss.item()
        train_batch_count += 1

    # ---- Validation pass --------------------------------------------------
    # eval() belongs AFTER the training loop: inside it, the model would be
    # switched to eval mode (dropout off) after the very first batch.
    MODEL.eval()
    # Measurement only: no gradients and no optimizer step, otherwise the
    # model is fitted on the validation data it is being scored on.
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation batches"):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)
            decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

            outputs = MODEL(
                              input_ids=input_ids,
                              attention_mask=attention_mask,
                              labels=labels,
                              decoder_attention_mask=decoder_attention_mask
                            )

            val_loss += outputs.loss.item()
            val_batch_count += 1

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    mean_train_loss = train_loss / max(train_batch_count, 1)
    mean_val_loss = val_loss / max(val_batch_count, 1)
    print(f"{epoch+1}/{NUM_EPOCHS} -> Train loss: {mean_train_loss}\tValidation loss: {mean_val_loss}")

Training batches:   1%|▏         | 2/158 [00:32<41:42, 16.04s/it]

In [None]:
# Persist the current MODEL weights/config under "qa_model" and the tokenizer
# files under "qa_tokenizer" (paths are relative to the working directory).
MODEL.save_pretrained("qa_model")
TOKENIZER.save_pretrained("qa_tokenizer")

# Saved files: the tokenizer paths are recorded below as a bare string
# literal. It is the last expression of the cell, so the notebook also
# displays it as the cell's output.
"""('qa_tokenizer/tokenizer_config.json',
 'qa_tokenizer/special_tokens_map.json',
 'qa_tokenizer/spiece.model',
'qa_tokenizer/added_tokens.json',
'qa_tokenizer/tokenizer.json')"""

"('qa_tokenizer/tokenizer_config.json',\n 'qa_tokenizer/special_tokens_map.json',\n 'qa_tokenizer/spiece.model',\n'qa_tokenizer/added_tokens.json',\n'qa_tokenizer/tokenizer.json')"

In [None]:
_GOOGLE_BLEU = None  # lazily loaded metric, shared by all predict_answer calls


def _google_bleu_metric():
    """Load the ``google_bleu`` metric on first use and reuse it afterwards."""
    global _GOOGLE_BLEU
    if _GOOGLE_BLEU is None:
        _GOOGLE_BLEU = evaluate.load("google_bleu")
    return _GOOGLE_BLEU


def predict_answer(context, question, ref_answer=None, max_new_tokens=None):
    """Generate an answer to ``question`` from ``context`` with MODEL.

    Args:
        context: Passage text, encoded as the second segment of the input.
        question: Question text, encoded as the first segment of the input.
        ref_answer: Optional reference answer. When given (and non-empty),
            the context and question are printed and a google_bleu score is
            computed against the prediction.
        max_new_tokens: Cap on the number of generated tokens. Defaults to
            ``T_LEN``, the target length used for the training labels.

    Returns:
        The decoded prediction string when no reference is given, otherwise a
        dict with the reference answer, the predicted answer and the score.
    """
    if max_new_tokens is None:
        # The library's default generation cap can be shorter than the
        # targets the model was trained on, which would truncate answers.
        max_new_tokens = T_LEN

    inputs = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length",
                       truncation=True, add_special_tokens=True, return_tensors="pt")

    # Follow the model's actual device so inference also works when the
    # training cell (which moves MODEL to DEVICE) has not been run.
    device = MODEL.device
    input_ids = inputs["input_ids"].to(device)            # shape (1, Q_LEN)
    attention_mask = inputs["attention_mask"].to(device)

    MODEL.eval()  # make sure dropout is off during generation
    with torch.no_grad():
        outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask,
                                 max_new_tokens=max_new_tokens)

    predicted_answer = TOKENIZER.decode(outputs[0], skip_special_tokens=True)

    if ref_answer:
        # Reuse the cached metric instead of reloading it on every call.
        bleu = _google_bleu_metric()
        score = bleu.compute(predictions=[predicted_answer],
                            references=[ref_answer])

        print("Context: \n", context)
        print("\n")
        print("Question: \n", question)
        return {
            "Reference Answer: ": ref_answer,
            "Predicted Answer: ": predicted_answer,
            "BLEU Score: ": score
        }
    else:
        return predicted_answer

In [None]:
# Load the test annotations under their own name so that `data` (the training
# DataFrame) is not clobbered and earlier cells stay re-runnable.
with open('test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Same object, annotated in place below; the name is kept for later cells.
data_copy = test_data

for key, value in test_data.items():
    doc = value["text"]
    print(key)
    for event in value["events"]:
        trigger = event["trigger"]
        if trigger:
            # Escape the trigger (it is literal text, not a pattern) and insert
            # the replacement verbatim via a callable. The markup itself is
            # unchanged: <b>{trigger}</b>, accumulated across the document's events.
            marked = f"<b>{trigger}</b>"
            doc = re.sub(rf"\b{re.escape(trigger)}\b", lambda _m: marked, doc, flags=re.IGNORECASE)
        for qa_pair in event["manual"]:
            question = qa_pair["question"]
            answer = qa_pair['answer']
            pred = predict_answer(doc, question)
            # Update this pair directly. The old code re-scanned the whole
            # (aliased) structure for pairs with the same question text, which
            # was quadratic and, when a question repeated within a document,
            # overwrote gold answers with earlier predictions.
            qa_pair['answer'] = pred
            qa_pair['gold_ans'] = answer

# Write the annotated structure (prediction in "answer", original answer in
# "gold_ans") as indented JSON.
pretty_json = json.dumps(data_copy, indent=4)
output_file_path = 'ans_id_test_pretty_nodemo.json'
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.write(pretty_json)