## Machine Reading Comprehension

In [1]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
from transformers import AutoModelForMultipleChoice, Trainer, TrainingArguments
from transformers import AutoTokenizer, DataCollatorWithPadding
import json
import torch.nn as nn
from torch.optim import Adam
import transformers
from torch.utils.data import Dataset, DataLoader
import time
import numpy as np
import pandas as pd

transformers.logging.set_verbosity_error()

In [3]:
seed = 27
# A single call seeds Python's `random`, NumPy and PyTorch (CPU and every CUDA
# device), instead of seeding NumPy/PyTorch by hand and leaving `random` and
# any additional GPUs unseeded.
transformers.set_seed(seed)

### Data Preprocessing


In [15]:
def _pad_choices(rows, num_choices=4):
    """Pad every question's choice list in place with "" up to `num_choices`.

    `rows` is a list of examples laid out as [passages, question_sets]; each
    question set is a dict with a "choice" list. Lists that already have
    `num_choices` or more entries are left untouched.
    """
    for row in rows:
        for qset in row[1]:
            # A non-positive difference multiplies to an empty list (no-op).
            qset["choice"].extend([""] * (num_choices - len(qset["choice"])))
    return rows


def _prepare_split(src_path, dst_path, sort_by_length):
    """Load one raw split, optionally sort it, pad its choices and save it.

    Args:
        src_path: path of the raw JSON split to read.
        dst_path: path the processed split is written to (UTF-8, non-ASCII
            characters kept as-is).
        sort_by_length: when True, sort the examples in ascending order of
            the length of their first passage string.

    Returns:
        The processed list of examples.
    """
    # `with` closes the handle even if parsing fails.
    with open(src_path, encoding="utf-8") as f:
        rows = json.load(f)
    if sort_by_length:
        rows.sort(key=lambda x: len(x[0][0]))
    _pad_choices(rows)
    with open(dst_path, "w", encoding="utf-8") as jsonfile:
        json.dump(rows, jsonfile, ensure_ascii=False)
    return rows


# train and valid are sorted by passage length; test keeps its original order
# so the predictions line up with the submission index.
train = _prepare_split("train_HW3dataset.json", "train_sorted.json", sort_by_length=True)
valid = _prepare_split("dev_HW3dataset.json", "valid_sorted.json", sort_by_length=True)
test = _prepare_split("test_HW3dataset.json", "test.json", sort_by_length=False)

In [None]:
# Load the pretrained `bert-base-chinese` tokenizer, requesting the fast
# implementation. Later cells (e.g. `preprocess_function`) read it through the
# global name `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese', use_fast=True)

In [6]:
def preprocess_function(data):
    """Tokenize every question of every example into multiple-choice features.

    Uses the notebook-level `tokenizer`.

    Args:
        data: iterable of examples laid out as ``[passages, question_sets]``.
            ``passages`` is a list (presumably of passage strings) and each
            question set is a dict with the keys ``"question"``, ``"choice"``
            and ``"answer"``.

    Returns:
        A list with one dict per question. Each dict holds the tokenizer
        outputs as lists with one entry per choice, plus ``"label"``: the
        position of the answer inside the question's choice list.

    Raises:
        ValueError: if a question's answer is not one of its choices.
    """
    results = []

    for example in data:
        passages = example[0]
        for question_set in example[1]:
            choices = question_set["choice"]
            num_choice = len(choices)
            question = question_set["question"]

            # Pair the passage with each "question[SEP]choice" string.
            # NOTE(review): the two lists only have equal length when
            # `passages` holds exactly one string -- confirm for the dataset.
            article = passages * num_choice
            question_with_option = [f"{question}[SEP]{choice}" for choice in choices]

            # When the pair is too long, only the passage is truncated.
            encoded = tokenizer(article, question_with_option, truncation='only_first')

            # The encoding is already one entry per choice; copy it into a
            # plain dict so the label can sit next to the tokenizer outputs.
            result = {k: list(v) for k, v in encoded.items()}
            result["label"] = choices.index(question_set["answer"])
            results.append(result)
    return results

In [7]:
class QuestionDataset(Dataset):
    """Map-style dataset that serves pre-built question features by position."""

    def __init__(self, data):
        """Keep a reference to `data` and record its length once."""
        super().__init__()
        self.length = len(data)
        self.data = data

    def __len__(self):
        """Return the number of items counted at construction time."""
        return self.length

    def __getitem__(self, index):
        """Return the stored item at `index` (the object itself, not a copy)."""
        return self.data[index]


In [8]:
# Tokenize the processed train/valid splits. The files were written as UTF-8,
# so they are read back with the same encoding, and `with` closes each handle.
with open("train_sorted.json", encoding="utf-8") as f:
    data = json.load(f)

train_set = preprocess_function(data)
train_dataset = QuestionDataset(train_set)

with open("valid_sorted.json", encoding="utf-8") as f:
    data = json.load(f)

valid_set = preprocess_function(data)
valid_dataset = QuestionDataset(valid_set)



In [None]:
# Number of encoded choices for the first training question (the output shows 4).
print(len(train_dataset[0]["input_ids"])) # input_ids: one token-id list per choice; label: a single int

4


### Training

In [9]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Each incoming feature is a dict whose values (other than the label) are
    lists with one entry per choice. The choices of all features are flattened,
    padded together by the tokenizer, and reshaped to
    ``(batch_size, num_choices, -1)``. The input features are not modified.
    """

    # Tokenizer whose `pad` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # The next three fields are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of feature dicts into a batch of tensors.

        Args:
            features: non-empty list of dicts carrying "input_ids" (one entry
                per choice) and a "label" or "labels" entry.

        Returns:
            A dict of tensors shaped ``(batch_size, num_choices, -1)`` plus
            "labels", an int64 tensor with one entry per feature.

        Raises:
            KeyError: if the features carry neither "label" nor "labels".
        """
        label_name = "label" if "label" in features[0] else "labels"
        # Read the label instead of popping it: the feature dicts may be the
        # very objects stored in the dataset, and removing the key would break
        # the next pass over the same items.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (feature, choice) pair, feature-major, without the label.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
# Smoke-test the collator on the first ten training questions, feeding it
# fresh dicts restricted to the keys listed in `accepted_keys`.
accepted_keys = ["input_ids", "attention_mask", "label"]
features = [
    {key: example[key] for key in example if key in accepted_keys}
    for example in (train_dataset[idx] for idx in range(10))
]
batch = DataCollatorForMultipleChoice(tokenizer=tokenizer)(features)

In [None]:
# Decode the four choice sequences of the second question in the batch to
# inspect the text layout and the padding that the collator added.
[tokenizer.decode(batch["input_ids"][1][i].tolist()) for i in range(4)]

['[CLS] 王 红 在 给 李 老 师 打 电 话 。 [SEP] 王 红 在 做 什 么? [SEP] 买 东 西 [SEP] [PAD] [PAD]',
 '[CLS] 王 红 在 给 李 老 师 打 电 话 。 [SEP] 王 红 在 做 什 么? [SEP] 看 电 视 [SEP] [PAD] [PAD]',
 '[CLS] 王 红 在 给 李 老 师 打 电 话 。 [SEP] 王 红 在 做 什 么? [SEP] 打 电 话 [SEP] [PAD] [PAD]',
 '[CLS] 王 红 在 给 李 老 师 打 电 话 。 [SEP] 王 红 在 做 什 么? [SEP] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]']

### Hyperparameters

In [10]:
batch_size = 2  # per-device batch size, used for both training and evaluation
epochs = 4  # number of training epochs
learning_rate = 1e-5  # learning rate passed to TrainingArguments

### Build Model

In [11]:
model = AutoModelForMultipleChoice.from_pretrained("bert-base-chinese", num_labels=4)

# Evaluate and save a checkpoint under `MRC_MODEL` at the end of every epoch.
training_args = TrainingArguments(
    output_dir="MRC_MODEL",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    # Trainer re-seeds the RNGs from this field (default 42) when it is built,
    # so the notebook-level seed must be passed on to govern training.
    seed=seed,
)

Downloading:   0%|          | 0.00/412M [00:00<?, ?B/s]

In [12]:
def compute_metrics(eval_predictions):
    """Compute accuracy from a ``(predictions, label_ids)`` pair.

    The predicted class of each row is the arg-max over axis 1 of
    `predictions`. Returns ``{"accuracy": value}`` with the fraction of rows
    whose predicted class equals the label, as a Python float.
    """
    scores, gold = eval_predictions
    chosen = np.argmax(scores, axis=1)
    hits = (chosen == gold).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [13]:
# Assemble the training loop: the model and its arguments, the tokenized
# train/valid datasets, the multiple-choice collator that pads each batch, and
# the accuracy metric reported at evaluation time.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; with the per-epoch strategies configured above this
# also evaluates and writes a checkpoint under MRC_MODEL after every epoch.
trainer.train()

### Inference

In [None]:
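# Colab only: uncomment to mount Google Drive. The next cell loads the
# fine-tuned checkpoint from /content/drive/MyDrive/MRC_MODEL, so Drive must be
# mounted (or the path changed) before running it.
# from google.colab import drive
# drive.mount('/content/drive')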

Mounted at /content/drive


In [None]:
# Restore the fine-tuned model and its tokenizer from the saved checkpoint and
# place the model on the GPU when one is available.
checkpoint_dir = '/content/drive/MyDrive/MRC_MODEL/checkpoint-12028'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir, use_fast=True)
model = AutoModelForMultipleChoice.from_pretrained(checkpoint_dir).to(device)

In [None]:
# Predict one answer per test question and write them to submission.csv.
# test.json was written as UTF-8; read it back the same way and close the handle.
with open("test.json", encoding="utf-8") as f:
    test = json.load(f)

prediction = {"answer": []}
model.eval()
with torch.no_grad():
    for example in test:
        passage = example[0][0]
        for question_set in example[1]:
            question = question_set["question"]
            choices = question_set["choice"]
            inputs = tokenizer(
                [[passage, f"{question}[SEP]{choice}"] for choice in choices],
                padding=True,
                return_tensors="pt",
                truncation='only_first',
            ).to(device)

            # Add the leading batch dimension the multiple-choice model expects.
            # No labels are passed: only the logits are needed, so there is no
            # reason to make the model compute a loss against a dummy target.
            outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()})
            logits = outputs.logits.squeeze(0)

            # Empty strings are padding choices added during preprocessing;
            # never let one of them be selected as the answer.
            is_real = torch.tensor([choice != "" for choice in choices], device=device)
            if is_real.any():
                logits = logits.masked_fill(~is_real, float("-inf"))

            predict_class = logits.argmax().item()
            # Submitted answers are 1-based.
            prediction["answer"].append(int(predict_class) + 1)

df = pd.DataFrame(prediction)
df.to_csv("submission.csv", index_label="index")

In [None]:
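# Colab only: uncomment to copy the trained checkpoints to Google Drive
# (requires Drive to be mounted) so they survive the end of the session.
# !cp -r /content/MRC_MODEL/ /content/drive/MyDrive/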