# CS546 Assignment 2: Prompt Learning for Event Detection

### Google Colab Setup

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the local helper.py) are re-imported before each cell execution and
# edits to them take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Mount Google Drive at /content/drive (Colab only); the assignment files
# (helper.py, data/, checkpoints) are read from and written to this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os

# Folder, relative to "MyDrive", that holds this assignment's files.
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'CS546_A2'

# Anchor the path at the absolute mount point passed to drive.mount(...) so it
# (and the sys.path entry derived from it later) stays valid even if the
# working directory is no longer /content.
GOOGLE_DRIVE_PATH = os.path.join('/content', 'drive', 'MyDrive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)

# Sanity check: list the folder so a wrong path fails loudly here.
print(os.listdir(GOOGLE_DRIVE_PATH))

['helper.py', 'data', '__pycache__', 'checkpoint_best.pt', 'Assignment2.ipynb']


In [4]:
import os
import sys
import time

# Make modules stored in the assignment folder (helper.py) importable.
# Guarded so that re-running this cell does not keep appending duplicates.
if GOOGLE_DRIVE_PATH not in sys.path:
    sys.path.append(GOOGLE_DRIVE_PATH)

# Use US Central time for timestamps; time.tzset() makes the process re-read
# the TZ environment variable (available on Unix only).
os.environ["TZ"] = "US/Central"
time.tzset()

In [5]:
!pip install openprompt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openprompt
  Downloading openprompt-1.0.1-py3-none-any.whl (146 kB)
[K     |████████████████████████████████| 146 kB 8.5 MB/s 
Collecting datasets
  Downloading datasets-2.5.2-py3-none-any.whl (432 kB)
[K     |████████████████████████████████| 432 kB 43.8 MB/s 
[?25hCollecting tensorboardX
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 56.8 MB/s 
[?25hCollecting sentencepiece==0.1.96
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 46.1 MB/s 
[?25hCollecting transformers>=4.10.0
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 48.9 MB/s 
[?25hCollecting rouge==1.0.0
  Downloading rouge-1.0.0-py3-none-any.whl (14 kB)
Collecting yacs
  Downloading 

### Data Processing

In [6]:
import tqdm
import json
from helper import get_vocab, process_data, get_plm, get_template, my_collate_fn, get_verbalizer, to_device, convert_labels_to_list, loss_func, evaluation, predict

In [7]:
from openprompt import PromptForClassification
from openprompt.prompts import ManualTemplate, SoftTemplate
from openprompt import PromptDataLoader
from openprompt.prompts import ManualVerbalizer, SoftVerbalizer
import torch
from transformers import  AdamW, get_linear_schedule_with_warmup

# --- File names ------------------------------------------------------------
# Default splits. To use the dataset with the full set of event types, swap in
# the commented-out "*_full.json" names below.
train_file = "train.json"
valid_file = "valid.json"
test_file = "test.json"
# train_file = "train_full.json"
# valid_file = "valid_full.json"
# test_file = "test_full.json"

# Checkpoint file for the best model found during training.
model_file = "checkpoint_best.pt"

# --- Resolved paths (note: the *_dir names hold file paths) -----------------
data_root = os.path.join(GOOGLE_DRIVE_PATH, "data")
train_dir = os.path.join(data_root, train_file)
valid_dir = os.path.join(data_root, valid_file)
test_dir = os.path.join(data_root, test_file)
model_save_dir = os.path.join(GOOGLE_DRIVE_PATH, model_file)

# --- Vocabulary and processed splits ---------------------------------------
# The vocabulary is built from the train and validation files only and then
# shared by every split.
vocabulary = get_vocab(train_dir, valid_dir)
dataset = {
    split_name: process_data(split_path, vocabulary)
    for split_name, split_path in (
        ("train", train_dir),
        ("validation", valid_dir),
        ("test", test_dir),
    )
}
print(vocabulary)

# Reverse lookup (vocabulary value -> vocabulary key), used when dumping predictions.
inv_vocabulary = dict((value, key) for key, value in vocabulary.items())




### Parameters

In [9]:
# Values explored while tuning (kept for reference only, not executed):
#   BATCH_SIZE    : 5, 10, 20
#   LEARNING_RATE : 5e-5, 1e-4, 2e-4, 1e-3
#   WEIGHT_DECAY  : 0.01, 0.005, 0.02
#   PLM_MODEL     : ("albert", "albert-base-v2"), ("roberta", "roberta-base"),
#                   ("t5", "t5-base"), ("t5", "google/t5-v1_1-base"),
#                   ("bert", "bert-base-cased")

# --- Optimisation -----------------------------------------------------------
BATCH_SIZE = 25
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 0.02

# --- Model ------------------------------------------------------------------
# Pair passed to get_plm(); the visible options above are (family, checkpoint) pairs.
PLM_MODEL = ("t5", "t5-base")
# True  -> SoftTemplate + SoftVerbalizer
# False -> ManualTemplate + ManualVerbalizer
SOFT = True

# --- Hardware ---------------------------------------------------------------
# When True, the model and batches are moved to the GPU.
use_cuda = True

### DataLoader, Verbalizer, Template, Get PLM


In [None]:
# Load the pre-trained language model plus its tokenizer, config and the
# tokenizer wrapper class that PromptDataLoader needs.
plm, tokenizer, model_config, WrapperClass = get_plm(PLM_MODEL)
template_text = get_template()

# Soft (trainable) or manual (fixed-text) prompt template, depending on SOFT.
if SOFT:
    mytemplate = SoftTemplate(model=plm, tokenizer=tokenizer, text=template_text)
else:
    mytemplate = ManualTemplate(tokenizer=tokenizer, text=template_text)

# Sequence-length settings shared by every split.
MAX_SEQ_LENGTH = 256
DECODER_MAX_LENGTH = 3


def build_dataloader(split, shuffle):
    """Create the PromptDataLoader for one split of `dataset`.

    Every split uses the same template, tokenizer and length settings; only
    the data and the shuffling differ. The loader's collate function is
    replaced with `my_collate_fn` from helper.py.

    Args:
        split: key into `dataset` ("train", "validation" or "test").
        shuffle: whether the loader shuffles its examples.

    Returns:
        The configured PromptDataLoader.
    """
    loader = PromptDataLoader(
        dataset=dataset[split],
        template=mytemplate,
        tokenizer=tokenizer,
        tokenizer_wrapper_class=WrapperClass,
        max_seq_length=MAX_SEQ_LENGTH,
        decoder_max_length=DECODER_MAX_LENGTH,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        teacher_forcing=False,
        predict_eos_token=False,
        truncate_method="head",
    )
    loader.dataloader.collate_fn = my_collate_fn
    return loader


# Only the training data is shuffled; validation/test keep the file order so
# that predictions can be written out in the original sentence order.
train_dataloader = build_dataloader("train", shuffle=True)
validation_dataloader = build_dataloader("validation", shuffle=False)
test_dataloader = build_dataloader("test", shuffle=False)

# Label words are only consumed by the manual verbalizer below.
label_words = get_verbalizer(vocabulary)

if SOFT:
    myverbalizer = SoftVerbalizer(tokenizer, plm, vocabulary)
else:
    myverbalizer = ManualVerbalizer(
        tokenizer,
        num_classes=len(vocabulary),
        label_words=label_words,
        post_log_softmax=False,
    )



### Train

In [None]:
# Build the prompt model; freeze_plm=False means the PLM itself is fine-tuned too.
Prompt_Model = PromptForClassification(plm=plm, template=mytemplate, verbalizer=myverbalizer, freeze_plm=False)
if use_cuda:
    Prompt_Model = Prompt_Model.cuda()

# Parameters whose names contain one of these substrings get no weight decay.
# NOTE(review): T5 appears to name its normalisation layers `layer_norm`, so
# 'LayerNorm.weight' may match nothing for t5-base -- confirm against
# Prompt_Model.named_parameters().
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in Prompt_Model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': WEIGHT_DECAY},
    {'params': [p for n, p in Prompt_Model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]


EPOCH = 5
# Run validation every EVAL_EVERY training steps.
EVAL_EVERY = 100
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)

device = "cuda" if torch.cuda.is_available() else "cpu"
max_f1 = 0.0
# Early stopping: stop after more than `max_patience` consecutive evaluations
# without a new best validation F1.
max_patience, current_patience, patience_break = 7, 0, False
# One entry is appended to each history per evaluation checkpoint.
train_f1_history = []
valid_f1_history = []
train_loss_history = []
valid_loss_history = []
label_loss_history = []  # (epoch, training step) of each checkpoint, used as plot tick labels

for epoch in range(EPOCH):
    tot_loss = 0.0
    progress = tqdm.tqdm(total=len(train_dataloader), ncols=150, position=0, leave=True, desc="Epoch: "+str(epoch))
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = to_device(inputs, device)
        logits = Prompt_Model(inputs)
        labels = inputs['label']
        label_list = convert_labels_to_list(labels)
        loss = loss_func(logits, label_list)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()

        if (step + 1) % EVAL_EVERY == 0:
            print("\nStep {}, average loss: {}".format(step, tot_loss/(step+1)), flush=True)
            train_loss_history.append(tot_loss/(step+1))
            # Training F1 is computed on the current mini-batch only.
            train_pred_labels = predict(logits)
            _, _, train_f1, _ = evaluation(label_list, train_pred_labels, vocabulary)
            train_f1_history.append(train_f1)

            # --- Validation ---
            # The validation loop uses its own variable names so it does not
            # overwrite the training loop's `step`, `inputs`, `logits` or
            # `label_list`; `step` is recorded in label_loss_history below.
            allpreds, alllabels = [], []
            Prompt_Model.eval()
            valid_tot_loss = 0.0
            with torch.no_grad():
                for valid_inputs in validation_dataloader:
                    if use_cuda:
                        valid_inputs = to_device(valid_inputs, device)
                    valid_logits = Prompt_Model(valid_inputs)
                    valid_label_list = convert_labels_to_list(valid_inputs['label'])
                    valid_loss = loss_func(valid_logits, valid_label_list)
                    valid_tot_loss += valid_loss.item()
                    alllabels.extend(valid_label_list)
                    allpreds.extend(predict(valid_logits))
            Prompt_Model.train()

            valid_loss_history.append(valid_tot_loss/len(validation_dataloader))
            label_loss_history.append((epoch, step))

            p, r, f, total = evaluation(alllabels, allpreds, vocabulary)
            print("F1-Score: " + str(f))
            valid_f1_history.append(f)
            # Per-label metrics of the most recent evaluation (overwritten each time).
            with open("results.json", 'w', encoding='utf-8') as f_out:
                f_out.write(json.dumps(total, indent=4))
            if f > max_f1:
                # New best validation F1: checkpoint the model and reset patience.
                max_f1 = f
                torch.save(Prompt_Model.state_dict(), model_save_dir)
                current_patience = 0
            else:
                current_patience += 1
                if current_patience > max_patience:
                    patience_break = True
                    break
        progress.update(1)
    progress.close()
    if patience_break:
        break

print("F1 score", max_f1)


In [None]:
# 0.7115806258720351  lr=2*1e-4 batch size = 20 weight_decay = 0.01 t5-base   This text describes a {"mask"} event
# 0.7244003467874001  lr=1e-4 batch size = 10 weight_decay = 0.01 t5-large   The previous text describes a {"mask"} event.
# 0.7244364120612574  lr=1e-4 batch size = 10  weight_decay = 0.05 t5-large max_length = 512  The previous text describes the {"mask"} event.



#### Loss & F1-score Plot

In [None]:
# Every history gets exactly one entry per evaluation checkpoint in the
# training loop, and label_loss_history supplies the tick labels for both
# plots, so all five lists must have the same length.
assert len(train_loss_history) == len(valid_loss_history) == len(label_loss_history), \
    "loss histories and checkpoint labels are out of sync"
assert len(train_f1_history) == len(valid_f1_history) == len(label_loss_history), \
    "F1 histories and checkpoint labels are out of sync"

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss at each evaluation checkpoint; ticks are
# labelled with the (epoch, step) tuples stored in label_loss_history.
xi = list(range(len(train_loss_history)))
plt.plot(xi, train_loss_history, label="train_loss")
plt.plot(xi, valid_loss_history, label="valid_loss")
plt.xticks(xi, label_loss_history)
plt.xlabel('(epoch, step)')
plt.ylabel('loss')
plt.legend()
# The original ended with a bare `plt.savefig` (an attribute access that does
# nothing); render the figure explicitly, as the F1 plot cell does.
plt.show()

In [None]:
# Training vs. validation F1 at each evaluation checkpoint, drawn on an
# explicit Axes; ticks carry the (epoch, step) tuples from label_loss_history.
xi = list(range(len(train_f1_history)))
fig, ax = plt.subplots()
for f1_series, series_label in ((train_f1_history, "train_f1"), (valid_f1_history, "valid_f1")):
    ax.plot(xi, f1_series, label=series_label)
ax.set_xticks(xi)
ax.set_xticklabels(label_loss_history)
ax.set_ylabel('f1')
ax.legend()
plt.show()

### Dump Results for Test Dataset

In [None]:
### Dump the results for the test dataset.

    ### You need to write your code here to dump out the dataset using "test_dataloader".
    ### you need to write out all your model predictions into a file "output.json".
    ### Each line of the "output.json" is the model prediction for the sentence.

    ### You may find "inv_vocabulary" useful here.

    ### Each line should be in the following format:
    ###    {"predictions": ["Catastrophe", "Conquering"]}
    ###    {"predictions": ["Social_event"]}
    ###    {"predictions": []}

    ### Note that the sentence order in your output file should be the same as in the original file!

In [None]:
test_output_dir = os.path.join(GOOGLE_DRIVE_PATH, "output.json")

use_cuda = True
# Single source of truth for where the model runs. The original moved the
# model to CUDA unconditionally, which contradicted its own CPU fallback.
test_device = "cuda" if use_cuda else "cpu"

# Rebuild the prompt model and restore the best checkpoint saved during training.
Prompt_Model = PromptForClassification(plm=plm, template=mytemplate, verbalizer=myverbalizer, freeze_plm=False)
Prompt_Model.load_state_dict(torch.load(model_save_dir, map_location=test_device))
Prompt_Model.to(test_device)
Prompt_Model.eval()

result = []          # one {"predictions": [...]} record per test sentence
all_pred_test = []   # raw predictions in the same order
with torch.no_grad():
    # test_dataloader was built with shuffle=False, so the records follow the
    # order of the original test file.
    for test_input in test_dataloader:
        if use_cuda:
            test_input = to_device(test_input, test_device)
        logits = Prompt_Model(test_input)
        pred_labels = predict(logits)
        all_pred_test.extend(pred_labels)
        for pred in pred_labels:
            # Translate each predicted label back through inv_vocabulary.
            result.append({"predictions": [inv_vocabulary[label] for label in pred]})


# One JSON object per line, as required by the assignment.
with open(test_output_dir, "w", encoding="utf-8") as f_out:
    for res in result:
        json.dump(res, f_out)
        f_out.write('\n')

#### Check on Validation Set

In [12]:
# Re-evaluate the saved best checkpoint on the validation set.
allpreds, alllabels = [], []

# Keep model placement consistent with the `use_cuda` flag. The original moved
# the model to CUDA unconditionally but moved batches only when use_cuda was set.
check_device = "cuda" if use_cuda else "cpu"

Prompt_Model = PromptForClassification(plm=plm, template=mytemplate, verbalizer=myverbalizer, freeze_plm=False)
Prompt_Model.load_state_dict(torch.load(model_save_dir, map_location=check_device))
Prompt_Model.to(check_device)
Prompt_Model.eval()

with torch.no_grad():
    for inputs in validation_dataloader:
        if use_cuda:
            inputs = to_device(inputs, check_device)
        logits = Prompt_Model(inputs)
        labels = inputs['label']
        label_list = convert_labels_to_list(labels)
        pred_labels = predict(logits)
        alllabels.extend(label_list)
        allpreds.extend(pred_labels)


# Overall precision / recall / F1, followed by the per-label breakdown.
p, r, f, total = evaluation(alllabels, allpreds, vocabulary)
print(p, r, f)
print(total)

0.6804308797127468 0.7631896898912606 0.7194381169324222
{'Catastrophe': {'prec': 0.6222222222222222, 'rec': 0.7647951441578149, 'f1': 0.6861810755616066}, 'Causation': {'prec': 0.7677419354838709, 'rec': 0.7689822294022617, 'f1': 0.7683615819209039}, 'Motion': {'prec': 0.6056782334384858, 'rec': 0.46601941747572817, 'f1': 0.5267489711934157}, 'Hostile_encounter': {'prec': 0.6307692307692307, 'rec': 0.721830985915493, 'f1': 0.6732348111658456}, 'Process_start': {'prec': 0.8866666666666667, 'rec': 0.8636363636363636, 'f1': 0.875}, 'Attack': {'prec': 0.695859872611465, 'rec': 0.7959927140255009, 'f1': 0.7425658453695836}, 'Killing': {'prec': 0.8134328358208955, 'rec': 0.8983516483516484, 'f1': 0.8537859007832899}, 'Conquering': {'prec': 0.6445623342175066, 'rec': 0.8073089700996677, 'f1': 0.7168141592920354}, 'Social_event': {'prec': 0.5152173913043478, 'rec': 0.6440217391304348, 'f1': 0.5724637681159421}, 'Competition': {'prec': 0.6118980169971672, 'rec': 0.8470588235294118, 'f1': 0.710

In [None]:
# Print the per-label metrics as LaTeX table rows: "name & prec & rec & f1 \\".
for event_type, scores in total.items():
    # Underscores must be escaped in LaTeX text (e.g. Social_event -> Social\_event).
    row_label = event_type.replace("_", "\\_")
    # A LaTeX row ends with two backslashes, which is "\\\\" in Python source;
    # the original "\\" printed only a single backslash.
    print(f"{row_label} & {round(scores['prec'], 4)} & {round(scores['rec'], 4)} & {round(scores['f1'], 4)} \\\\")

Catastrophe & 0.6367 & 0.7527 & 0.6898 \
Causation & 0.8097 & 0.6737 & 0.7354 \
Motion & 0.7115 & 0.3592 & 0.4774 \
Hostile_encounter & 0.6602 & 0.6021 & 0.6298 \
Process_start & 0.8585 & 0.8766 & 0.8675 \
Attack & 0.6955 & 0.8197 & 0.7525 \
Killing & 0.8228 & 0.8929 & 0.8564 \
Conquering & 0.6618 & 0.7608 & 0.7079 \
Social_event & 0.5993 & 0.4674 & 0.5252 \
Competition & 0.6451 & 0.7804 & 0.7063 \
