# AI CUP 2022: Argument Detection (Models)
Meng-Chieh, Liu  
2022/11/28

## Model History



|version | Descriptions  | train, valid, test | main task weight | model threshold | valid score | test score | public score|
|-------- |------- |-----  | --------|-------- |-------- |-------- |-------- |
|v3|LCS/sentence>0.7|8:1:1|2|0.2|0.6999|0.7031|0.7941|
|v4|LCS/sentence>0.7, maximum LCS/sentence, return short|8:1:1|2.5|0.25|0.7007|0.7027|0.8059|
|v5|ranked by LCS/r(q), cumulated to 0.8|8:1:1|2.5|0.28|0.6977|0.7000|0.7943|
|v6|ranked by LCS/r(q), cumulated to 0.9|8:1:1|2.5|0.3|0.6987|0.6992|X|
|v7|LCS/sentence > 0.75, maximum LCS/r(q)|8:1:1|2.5|0.29|0.7000|0.7049|X|
|v8 (tokens>25)|LCS/sentence>0.7, maximum LCS/sentence, return short|9:0:1|2.5|0.29|0.5608|X|0.8158|

## Import

In [None]:
# Version tag; it selects which preprocessed data directory (data_<version>) is loaded below.
version = 'v8'

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q torch pytorch-lightning
!pip install -q transformers
!pip install -q nltk==3.7

[K     |████████████████████████████████| 798 kB 12.0 MB/s 
[K     |████████████████████████████████| 512 kB 9.3 MB/s 
[K     |████████████████████████████████| 125 kB 72.5 MB/s 
[K     |████████████████████████████████| 87 kB 6.4 MB/s 
[?25h  Building wheel for fire (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 5.8 MB 31.7 MB/s 
[K     |████████████████████████████████| 7.6 MB 50.0 MB/s 
[K     |████████████████████████████████| 182 kB 340 kB/s 
[?25h

In [None]:
# general purpose
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

# Huggingface transformers
import transformers
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

# pytorch
import torch
from torch import nn, cuda
from torch.utils.data import DataLoader, Dataset

# pytorch lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# nltk
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Characters treated as punctuation when filtering single-character tokens in LCS_Score.
# Raw string: the backslash is a literal character, not the start of an invalid "\]" escape.
punctuations = r'''!"#$%&'()*+, -./:;<=>?@[\]^_`{|}~'''


# Seed the NumPy and PyTorch RNGs with one fixed value (also reused for the data split/shuffle).
RANDOM_SEED = 666
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# LCS-based overlap score used for evaluation
def LCS_Score(text1: str, text2: str) -> float:
    """Return the token-level LCS overlap ratio between two texts.

    Both texts are tokenized with ``word_tokenize``; single-character tokens
    that occur in ``punctuations`` are dropped. The score is::

        LCS / (len(tokens1) + len(tokens2) - LCS)

    where ``LCS`` is the length of the longest common subsequence of the two
    filtered token lists.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The overlap ratio as a float in [0, 1]. When neither text has any
        token left after filtering the ratio is undefined (0/0); 0.0 is
        returned in that case instead of raising ``ZeroDivisionError``.
    """

    def _tokens(text):
        # Keep multi-character tokens and any single character that is not punctuation.
        return [tok for tok in word_tokenize(text) if len(tok) > 1 or tok not in punctuations]

    longer, shorter = _tokens(text1), _tokens(text2)
    if len(shorter) > len(longer):
        longer, shorter = shorter, longer

    if not longer:
        # Both token lists are empty: the denominator below would be zero.
        return 0.0

    # Rolling two-row DP: `prev` is the finished row, `curr` the row being filled.
    prev = [0] * (len(shorter) + 1)
    for tok_a in longer:
        curr = [0] * (len(shorter) + 1)
        for j, tok_b in enumerate(shorter, start=1):
            if tok_a == tok_b:
                curr[j] = prev[j - 1] + 1
            else:
                curr[j] = max(prev[j], curr[j - 1])
        prev = curr

    lcs = prev[len(shorter)]
    return lcs / (len(longer) + len(shorter) - lcs)

## Preprocessing (mainly in other notebook)
Remember to update the data path below before running.


### load preprocessed data

In [None]:
# Load the preprocessed DataFrame (preprocessing itself is mainly done in another notebook).
# NOTE(review): pickle.load can execute arbitrary code from the file; only load pickles from a trusted source.
with open(f'/content/drive/Shareddrives/AI_CUP_NLP/data_{version}/new_df.pickle', 'rb') as f:
    new_df = pickle.load(f)

### split train, test, val

In [None]:
# Split on unique index values rather than on rows, so all rows sharing an index value land in the same split.
index = new_df.index.unique()
# Hold out 10% of the index values; the remaining 90% are used for training.
train_index, test_index = train_test_split(index, test_size=0.1, random_state=RANDOM_SEED, shuffle=True)

In [None]:
def x_y_split(df_index, new_df, train=False):
    """Select the rows for ``df_index`` and split them into inputs and targets.

    Args:
        df_index: Index values of ``new_df`` to select.
        new_df: Preprocessed DataFrame holding both feature and target columns.
        train: When true, the selected rows are shuffled with ``RANDOM_SEED``.

    Returns:
        A ``(X, y)`` pair of DataFrames: ``X`` holds the text and numeric
        feature columns, ``y`` holds the ``label`` and ``s`` target columns.
    """
    feature_columns = ['sentence', 'q', 'r', 'q_length', 'r_length', 'is_q']
    target_columns = ['label', 's']

    subset = new_df.loc[df_index]
    # Only the training rows are shuffled; other splits keep their original order.
    subset = shuffle(subset, random_state=RANDOM_SEED) if train else subset
    return subset[feature_columns], subset[target_columns]

In [None]:
# Training rows are shuffled; validation and test rows keep their original order.
X_train, y_train = x_y_split(train_index, new_df, train=True)
# NOTE: validation and test are both built from test_index, so they contain exactly the same rows.
X_val, y_val = x_y_split(test_index, new_df)
X_test, y_test = x_y_split(test_index, new_df)

In [None]:
X_train.shape, X_val.shape, X_test.shape

((49345, 6), (5471, 6), (5471, 6))

## Model

### parameters

In [None]:
# Initialize the parameters that will be used for training
N_EPOCHS = 20  # maximum number of epochs given to the Trainer; also sizes the LR schedule
BATCH_SIZE = 6  # batch size used by every DataLoader
STEPS_PER_EPOCH = len(X_train)//BATCH_SIZE  # number of full training batches per epoch (floor division)
MAX_LEN = 512  # token length every encoded text pair is padded/truncated to
LR = 2e-4  # learning rate handed to the AdamW optimizer
DROPOUT_RATE = 0.1  # dropout probability inside the classification heads

In [None]:
BERT_MODEL_NAME = "bert-base-uncased" # we will use the BERT base model(the smaller one)
# Tokenizer loaded from the same pretrained checkpoint name as the encoders; shared by all datasets.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

### model class

In [None]:
class bertDataset (Dataset):
    """Dataset that encodes each row as a (sentence, q) and a (sentence, r) pair.

    Text is tokenized lazily in ``__getitem__``; the numeric features and both
    targets are converted to tensors once in ``__init__``.

    Args:
        X: DataFrame with ``sentence``, ``q``, ``r``, ``q_length``,
            ``r_length`` and ``is_q`` columns.
        y: DataFrame with ``label`` and ``s`` columns.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, X, y, tokenizer, max_len=MAX_LEN):
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Raw text columns are kept as plain Python lists.
        self.sentence = list(X["sentence"])
        self.q = list(X["q"])
        self.r = list(X["r"])
        self.length = len(self.sentence)

        # Three numeric features per row, stored as float32.
        numeric_features = np.array(X[['q_length', 'r_length', 'is_q']], dtype=np.float32)
        self.features = torch.FloatTensor(numeric_features)

        # Each target is stored as a column of shape (length, 1).
        label_column = np.array(y['label'], dtype=np.int16).reshape(self.length, 1)
        s_column = np.array(y['s'], dtype=np.int16).reshape(self.length, 1)
        self.label = torch.LongTensor(label_column)
        self.s = torch.LongTensor(s_column)

    def __len__(self):
        return self.length

    def _encode_with(self, item_idx, paired_texts):
        """Encode ``sentence[item_idx]`` together with ``paired_texts[item_idx]``.

        Returns a tuple ``(input_ids, attention_mask, token_type_ids)`` of
        flattened tensors.
        """
        encoded = self.tokenizer.encode_plus(
            self.sentence[item_idx],
            paired_texts[item_idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )
        return (
            encoded['input_ids'].flatten(),
            encoded['attention_mask'].flatten(),
            encoded['token_type_ids'].flatten(),
        )

    def __getitem__(self, item_idx):
        return {
            'sentence_q': self._encode_with(item_idx, self.q),
            'sentence_r': self._encode_with(item_idx, self.r),
            'features': self.features[item_idx],
            'label': self.label[item_idx],
            's': self.s[item_idx],
        }

In [None]:
class bertDataModule (pl.LightningDataModule):
    """Lightning data module wrapping the train/val/test ``bertDataset`` splits.

    The defaults are bound to the notebook-level split frames, tokenizer,
    batch size and maximum token length at class-definition time.
    """

    def __init__(self, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test, tokenizer=tokenizer, batch_size=BATCH_SIZE, max_token_len=MAX_LEN):
        super().__init__()
        self.X_train, self.y_train = X_train, y_train
        self.X_val, self.y_val = X_val, y_val
        self.X_test, self.y_test = X_test, y_test
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len

    def _build_dataset(self, X, y):
        """Wrap one split in a ``bertDataset`` using this module's tokenizer settings."""
        return bertDataset(X=X, y=y, tokenizer=self.tokenizer, max_len=self.max_token_len)

    def setup(self, stage=None):
        # All three datasets are built regardless of `stage`.
        self.train_dataset = self._build_dataset(self.X_train, self.y_train)
        self.val_dataset = self._build_dataset(self.X_val, self.y_val)
        self.test_dataset = self._build_dataset(self.X_test, self.y_test)

    def train_dataloader(self):
        # Only the training loader shuffles.
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=4)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4)

In [None]:
class bertClassifier(pl.LightningModule):
    """Two-encoder BERT model with two 2-way classification heads.

    ``bert1`` encodes the (sentence, q) pair and ``bert2`` the (sentence, r)
    pair. Their pooled outputs, the element-wise product of the two, and the
    extra numeric features are concatenated and fed to two independent heads:
    ``fc_task1`` (trained against ``label``, the main task) and ``fc_task2``
    (trained against ``s``).

    Args:
        lr: Learning rate given to AdamW (the peak of the warmup/decay schedule).
        dropout_rate: Dropout probability inside both heads.
        maxLength: Unused; kept so existing callers keep working.
        steps_per_epoch: Optimizer steps per epoch, used to size the LR schedule.
        n_epochs: Planned number of epochs, used to size the LR schedule.
        main_task_weight: Multiplier applied to the ``label`` loss before the
            ``s`` loss is added. Defaults to the previously hard-coded 2.5.
    """

    def __init__(self, lr=LR, dropout_rate=DROPOUT_RATE, maxLength=MAX_LEN, steps_per_epoch=STEPS_PER_EPOCH, n_epochs=N_EPOCHS, main_task_weight=2.5):
        super().__init__()

        self.bert1 = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.bert2 = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.lr = lr
        # Head input: three 768-wide blocks (q encoding, r encoding, their
        # product) plus the 3 numeric features concatenated in forward().
        self.fc_task1 = self._make_head(dropout_rate)
        self.fc_task2 = self._make_head(dropout_rate)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.main_task_weight = main_task_weight
        self.criterion = nn.CrossEntropyLoss()

    @staticmethod
    def _make_head(dropout_rate):
        """Build one 2-way classification head (layer layout matches saved checkpoints)."""
        return nn.Sequential(
            nn.Linear(768*3+3, 512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 2)
        )

    def forward(self, input_ids1, attention_mask1, token_type_ids1, input_ids2, attention_mask2, token_type_ids2, features):
        """Return ``(logits1, logits2)``: the ``label`` head and ``s`` head logits."""
        sentence_q = self.bert1(input_ids=input_ids1, attention_mask=attention_mask1, token_type_ids=token_type_ids1).pooler_output
        sentence_r = self.bert2(input_ids=input_ids2, attention_mask=attention_mask2, token_type_ids=token_type_ids2).pooler_output
        combined = torch.cat([sentence_q, sentence_r, sentence_q*sentence_r, features], 1)
        return self.fc_task1(combined), self.fc_task2(combined)

    def _shared_step(self, batch, stage):
        """Compute the weighted two-task loss for one batch and log it as ``<stage>_loss``."""
        input_ids1, attention_mask1, token_type_ids1 = batch['sentence_q']
        input_ids2, attention_mask2, token_type_ids2 = batch['sentence_r']

        features = batch['features']
        # Targets arrive with a trailing dimension of size 1; CrossEntropyLoss wants it removed.
        label = batch['label'].squeeze(1)
        s = batch['s'].squeeze(1)

        logits1, logits2 = self(input_ids1, attention_mask1, token_type_ids1, input_ids2, attention_mask2, token_type_ids2, features)
        loss = self.criterion(logits1, label)*self.main_task_weight + self.criterion(logits2, s)
        self.log(f'{stage}_loss', loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._shared_step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch, 'val')

    def test_step(self, batch, batch_idx):
        return self._shared_step(batch, 'test')

    def configure_optimizers(self):
        """Return AdamW with a linear warmup/decay schedule stepped every batch.

        The schedule is sized in optimizer steps, so it must be stepped per
        batch. Returning the bare scheduler makes Lightning step it once per
        epoch, which leaves the learning rate near zero for the whole run.
        ``total_steps`` is the full number of training steps, warmup included.

        NOTE: checkpoints saved while the scheduler was stepped per epoch
        carry a scheduler counter measured in epochs, not steps.
        """
        # optimizer: AdamW
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)

        # scheduler: get_linear_schedule_with_warmup
        warmup_steps = self.steps_per_epoch//3
        total_steps = self.steps_per_epoch * self.n_epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step", "frequency": 1},
        }

## Train
Remember to update the checkpoint path and the default root directory before running.

In [None]:
# to ignore warnings
transformers.logging.set_verbosity_error()

In [None]:
# Checkpoint file handed to the Trainer below to resume training from.
resume_from_checkpoint = "/content/drive/Shareddrives/AI_CUP_NLP/lightning_logs/version_2/checkpoints/epoch=07-val_loss=1.844.ckpt"

In [None]:
# Instantiate and set up the data_module
# (all constructor arguments fall back to the notebook-level splits, tokenizer and batch size)
bert_data_module = bertDataModule()
# setup() is called explicitly; the datasets it builds also back the prediction dataloaders used later.
bert_data_module.setup()

In [None]:
model = bertClassifier()

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# freeze bert layers
# For both encoders: the embeddings and the first 10 encoder layers stop receiving gradients.
for encoder in (model.bert1, model.bert2):
    for frozen_part in (encoder.embeddings, encoder.encoder.layer[:10]):
        for param in frozen_part.parameters():
            param.requires_grad = False

In [None]:
# Keep the 3 checkpoints with the lowest logged 'val_loss'; file names embed the epoch and val_loss.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    filename='{epoch:02d}-{val_loss:.3f}',
    save_top_k=3, 
    mode='min'
)

# Stop training early when 'val_loss' stops improving (patience of 3).
earlyStopping_callback = pl.callbacks.early_stopping.EarlyStopping(
    monitor="val_loss", 
    mode="min", 
    patience=3)

In [None]:
# Instantiate the Model Trainer
# NOTE(review): the recorded cell output shows a deprecation warning; presumably it refers to
# `resume_from_checkpoint`, which newer Lightning releases replace with
# `trainer.fit(..., ckpt_path=...)` — confirm against the installed version.
trainer = pl.Trainer(
    max_epochs=N_EPOCHS, 
    accelerator='gpu', 
    devices=1, 
    callbacks=[checkpoint_callback, earlyStopping_callback], 
    default_root_dir='/content/drive/Shareddrives/AI_CUP_NLP',
    resume_from_checkpoint=resume_from_checkpoint)

  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model, bert_data_module)

  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/drive/Shareddrives/AI_CUP_NLP/lightning_logs/version_2/checkpoints/epoch=07-val_loss=1.844.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | bert1     | BertModel        | 109 M 
1 | bert2     | BertModel        | 109 M 
2 | fc_task1  | Sequential       | 1.2 M 
3 | fc_task2  | Sequential       | 1.2 M 
4 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
31.9 M    Trainable params
189 M     Non-trainable params
221 M     Total params
885.320   Total estimated model params size (MB)
INFO:pytorch_lightning.utilities.rank_zero:Restored all states from the checkpoint file at /content/drive/Shareddrives/AI_CUP_NLP/lightning_logs/version_2/checkpoints/epoch=

Sanity Checking: 0it [00:00, ?it/s]

Training: 8225it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### Visualize

In [None]:
# import tensorboard
# %load_ext tensorboard
# %tensorboard --logdir /content/drive/Shareddrives/AI_CUP_NLP/lightning_logs

## Valid/Test
Remember to update the model path before running.

### load model

In [None]:
model_path = "/content/drive/Shareddrives/AI_CUP_NLP/lightning_logs/version_3/checkpoints/epoch=13-val_loss=1.761.ckpt"

In [None]:
# load_from_checkpoint is a classmethod that builds a fresh model, so call it on
# the class; newer Lightning releases reject calling it on an existing instance.
model = bertClassifier.load_from_checkpoint(model_path)
# Inference mode (disables dropout), then move the weights to the selected device.
model.eval()
model.to(device)

### predict function

In [None]:
def predict(df, dataloader):
    """Run the notebook-level ``model`` over ``dataloader`` and attach probabilities to ``df``.

    Predictions are written positionally, so the rows of ``df`` must be in the
    same order as the samples yielded by ``dataloader`` (i.e. an unshuffled
    loader built from the same rows).

    Args:
        df: DataFrame to receive the prediction columns; it is modified in place.
        dataloader: DataLoader yielding batches in the ``bertDataset`` format.

    Returns:
        ``df`` with four added columns: ``label_0``/``label_1`` (softmax output
        of the ``label`` head) and ``s_0``/``s_1`` (softmax output of the ``s`` head).

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    # Explicit dim=1: normalize over the two classes of each row.
    softmax = nn.Softmax(dim=1)

    label_chunks = []
    s_chunks = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids1, attention_mask1, token_type_ids1 = batch['sentence_q']
            input_ids2, attention_mask2, token_type_ids2 = batch['sentence_r']
            features = batch['features']

            logits1, logits2 = model(
                input_ids1.to(device), attention_mask1.to(device), token_type_ids1.to(device),
                input_ids2.to(device), attention_mask2.to(device), token_type_ids2.to(device),
                features.to(device),
            )

            # Collect per-batch results and join once at the end instead of
            # re-concatenating the growing tensor on every iteration.
            label_chunks.append(softmax(logits1).to('cpu'))
            s_chunks.append(softmax(logits2).to('cpu'))

    if not label_chunks:
        raise ValueError("predict: dataloader yielded no batches")

    label_predict_np = torch.concat(label_chunks).numpy()
    s_predict_np = torch.concat(s_chunks).numpy()

    df['label_0'] = label_predict_np[:, 0]
    df['label_1'] = label_predict_np[:, 1]  # we only evaluate on this
    df['s_0'] = s_predict_np[:, 0]
    df['s_1'] = s_predict_np[:, 1]

    return df

### val

In [None]:
# Select the same rows, in the same order, that back the (unshuffled) validation dataset,
# so predict() can attach its outputs to them positionally.
val_df = new_df.loc[test_index]
val_dataloader = bert_data_module.val_dataloader()
val_result = predict(val_df, val_dataloader)

### find threshold

In [None]:
# evaluate according to LCS overlap rate

def _side_score(rows, answer_col, text_col, threshold):
    """Score one side (q or r) of a single example with LCS_Score."""
    rows = rows.reset_index()
    answer = rows[answer_col][0]
    if len(rows) == 1:
        # Only one candidate sentence: predict the whole text, ignoring the threshold.
        predicted = rows[text_col][0]
    else:
        kept = rows[rows['label_1'] >= threshold]
        predicted = " ".join(kept['sentence'])
    return LCS_Score(answer, predicted)


def evaluate(df, threshold=0.5):
    """Return the mean LCS overlap score of the thresholded predictions in ``df``.

    Rows are grouped by index value. Within each group the ``is_q == 1`` rows
    form the q side and the ``is_q == 0`` rows the r side. For each side, the
    sentences whose ``label_1`` is at least ``threshold`` are joined with
    spaces and compared with the reference text (column ``q'`` / ``r'``)
    using ``LCS_Score``. An example's score is the mean of its two side scores.

    Examples that cannot be scored (for instance a missing q or r side) are
    skipped, as before, but are now counted and reported in a warning instead
    of being swallowed silently by a bare ``except``.

    Args:
        df: DataFrame with ``is_q``, ``sentence``, ``label_1``, ``q``, ``r``,
            ``q'`` and ``r'`` columns.
        threshold: Minimum ``label_1`` value for a sentence to be kept.

    Returns:
        The mean example score, or 0.0 when no example could be scored.
    """
    import warnings

    score_list = []
    skipped = 0

    for example_id in df.index.unique():
        try:
            # List indexer: always a DataFrame, even for a single-row example.
            data = df.loc[[example_id]]
            q_score = _side_score(data[data['is_q'] == 1], "q'", "q", threshold)
            r_score = _side_score(data[data['is_q'] == 0], "r'", "r", threshold)
        except (KeyError, IndexError, TypeError, ValueError, ZeroDivisionError):
            skipped += 1
            continue
        score_list.append((q_score + r_score) / 2)

    if skipped:
        warnings.warn(
            f"evaluate: skipped {skipped} of {skipped + len(score_list)} examples that could not be scored"
        )
    if not score_list:
        return 0.0
    return sum(score_list) / len(score_list)

In [None]:
# find the best threshold with highest score

best_score = 0
best_threshold = 0

# Candidate thresholds 0.28, 0.29, ..., 0.32.
for percent in tqdm(range(28, 33, 1)):
    threshold = percent / 100
    try:
        final_score = evaluate(val_result, threshold)
    except ZeroDivisionError:
        # No example could be scored at this threshold; skip the candidate.
        # Any other error is a real problem and is allowed to surface.
        continue
    if final_score > best_score:
        best_score = final_score
        best_threshold = threshold

100%|██████████| 5/5 [00:20<00:00,  4.02s/it]


In [None]:
best_score, best_threshold

(0.5607916137004386, 0.29)

### test

In [None]:
# Select the same rows, in the same order, that back the (unshuffled) test dataset,
# so predict() can attach its outputs to them positionally.
test_df = new_df.loc[test_index]
test_dataloader = bert_data_module.test_dataloader()
test_result = predict(test_df, test_dataloader)

In [None]:
evaluate(test_result, best_threshold)