In [1]:
!pip install transformers==4.5.0

Collecting transformers==4.5.0
  Downloading transformers-4.5.0-py3-none-any.whl (2.1 MB)
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
Collecting regex!=2019.12.17
  Downloading regex-2022.1.18-cp36-cp36m-win_amd64.whl (272 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp36-cp36m-win_amd64.whl (2.0 MB)
Collecting filelock
  Downloading filelock-3.4.1-py3-none-any.whl (9.9 kB)
Collecting joblib
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Installing collected packages: regex, joblib, tokenizers, sacremoses, filelock, transformers
Successfully installed filelock-3.4.1 joblib-1.1.0 regex-2022.1.18 sacremoses-0.0.47 tokenizers-0.10.3 transformers-4.5.0


In [21]:
import torch
import numpy as np
import random
import json
from torch.utils.data import Dataset, DataLoader
from transformers import BertForQuestionAnswering, BertTokenizerFast, AdamW
from tqdm.auto import tqdm

# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

def same_seeds(seed):
    """Seed every random number generator this notebook uses.

    Seeds NumPy, Python's ``random`` module and PyTorch (CPU, plus all CUDA
    devices when CUDA is available), then switches cuDNN autotuning off and
    its deterministic mode on.

    Args:
        seed: Integer seed handed to each generator.
    """
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

# Fix all random number generators before any model or data code runs.
same_seeds(seed = 0)

In [41]:
fp16_training = True

if fp16_training:
    !pip install accelerate==0.2.0
    from accelerate import Accelerator
    accelerator = Accelerator(fp16 = True)
    device = accelerator.device

In [1]:
# Load the pretrained question-answering weights, move the model to the
# selected device, and load the tokenizer published under the same name.
model = BertForQuestionAnswering.from_pretrained('luhua/chinese_pretrain_mrc_roberta_wwm_ext_large')
model = model.to(device)
tokenizer = BertTokenizerFast.from_pretrained('luhua/chinese_pretrain_mrc_roberta_wwm_ext_large')

NameError: name 'BertForQuestionAnswering' is not defined

In [32]:
import os

def read_data(file):
    """Load one dataset JSON file.

    Args:
        file: Path of a UTF-8 encoded JSON file whose top-level object has
            the keys 'questions' and 'paragraphs'.

    Returns:
        Tuple ``(questions, paragraphs)`` holding the values stored under
        those two keys.
    """
    with open(file, mode = 'r', encoding = 'utf-8') as handle:
        payload = json.loads(handle.read())
    questions = payload['questions']
    paragraphs = payload['paragraphs']
    return questions, paragraphs

# Directory holding the three dataset files; each file yields (questions, paragraphs).
filename = './ml2021-spring-hw7'
(
    (train_questions, train_paragraphs),
    (dev_questions, dev_paragraphs),
    (test_questions, test_paragraphs),
) = (
    read_data(os.path.join(filename, json_name))
    for json_name in ('hw7_train.json', 'hw7_dev.json', 'hw7_test.json')
)

In [33]:
# Questions and paragraphs are tokenized separately and without special
# tokens; QADataset adds the special tokens itself when it assembles
# each input sequence.
train_questions_tokenized, dev_questions_tokenized, test_questions_tokenized = (
    tokenizer([entry['question_text'] for entry in split_questions], add_special_tokens = False)
    for split_questions in (train_questions, dev_questions, test_questions)
)

train_paragraphs_tokenized, dev_paragraphs_tokenized, test_paragraphs_tokenized = (
    tokenizer(split_paragraphs, add_special_tokens = False)
    for split_paragraphs in (train_paragraphs, dev_paragraphs, test_paragraphs)
)

In [34]:
class QADataset(Dataset):
    """Serve (question, paragraph window) pairs as fixed-length model inputs.

    Every sequence is laid out as ``<cls> question <sep> paragraph window <sep>``
    and right-padded with ``pad_token_id`` up to ``max_seq_len``.

    * ``split == 'train'``: one randomly positioned paragraph window that
      contains the answer is returned, together with the answer's start and
      end positions inside the assembled sequence.
    * any other split: the paragraph is cut into windows that start every
      ``doc_stride`` tokens, and all windows are returned stacked.

    Args:
        split: 'train' selects the training behaviour, anything else the
            windowed behaviour.
        questions: Sequence of dicts. 'paragraph_id' is read for every item;
            'answer_start' and 'answer_end' (character offsets passed to
            ``char_to_token``) are read for training items.
        tokenized_questions: Indexable by question index; items expose ``.ids``.
        tokenized_paragraphs: Indexable by paragraph id; items expose ``.ids``,
            ``len()`` and ``char_to_token``.
        max_question_len: Maximum number of question tokens kept.
        max_paragraph_len: Maximum number of paragraph tokens per window.
        doc_stride: Distance in tokens between consecutive window starts for
            non-training splits.
    """

    # Special token ids placed around the question and paragraph.
    # NOTE(review): these are the usual BERT vocabulary ids for [CLS], [SEP]
    # and [PAD]; confirm they match the tokenizer in use.
    cls_token_id = 101
    sep_token_id = 102
    pad_token_id = 0

    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs,
                 max_question_len = 40, max_paragraph_len = 150, doc_stride = 30):
        self.split = split
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = max_question_len
        self.max_paragraph_len = max_paragraph_len
        self.doc_stride = doc_stride
        # <cls> + question + <sep> + paragraph window + <sep>
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1

    def __len__(self):
        """Number of questions in this split."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Build the model inputs for question ``idx``.

        Returns:
            For 'train': ``(input_ids, token_type_ids, attention_mask,
            answer_start, answer_end)`` where the first three are 1-D tensors
            of length ``max_seq_len`` and the last two are integer positions.
            Otherwise: ``(input_ids, token_type_ids, attention_mask)`` as
            tensors with one row per paragraph window.
        """
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question['paragraph_id']]

        # The question prefix is identical for every window of this item.
        input_ids_question = [self.cls_token_id] + tokenized_question.ids[:self.max_question_len] + [self.sep_token_id]

        if self.split == 'train':
            # NOTE(review): char_to_token can return None when an offset does
            # not map to a token — confirm the dataset offsets always do.
            answer_start_token = tokenized_paragraph.char_to_token(question['answer_start'])
            answer_end_token = tokenized_paragraph.char_to_token(question['answer_end'])

            # Any window start in [lowest_start, answer_start_token] keeps the
            # whole answer inside the window. If the answer is longer than the
            # window that range is empty and randint would raise ValueError,
            # so fall back to starting the window at the answer start.
            lowest_start = min(answer_end_token - self.max_paragraph_len + 1, answer_start_token)
            # Clip so the window neither runs past the paragraph end nor starts before 0.
            paragraph_start = max(0, min(random.randint(lowest_start, answer_start_token), len(tokenized_paragraph) - self.max_paragraph_len))
            paragraph_end = paragraph_start + self.max_paragraph_len

            window_ids = tokenized_paragraph.ids[paragraph_start:paragraph_end]
            input_ids_paragraph = window_ids + [self.sep_token_id]

            # Convert paragraph token positions into positions in the assembled sequence.
            offset = len(input_ids_question) - paragraph_start
            answer_start = answer_start_token + offset
            # Only differs from the raw position when the answer was longer
            # than the window: keep the label on the last paragraph token present.
            answer_end = min(answer_end_token + offset, len(input_ids_question) + len(window_ids) - 1)

            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)

            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start, answer_end

        else:
            input_ids_list, token_type_ids_list, attention_mask_list = [], [], []

            # Slide a window over the paragraph, one start every doc_stride tokens.
            for i in range(0, len(tokenized_paragraph), self.doc_stride):
                input_ids_paragraph = tokenized_paragraph.ids[i:i+self.max_paragraph_len] + [self.sep_token_id]
                input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type_ids)
                attention_mask_list.append(attention_mask)

            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list)

    def padding(self, input_ids_question, input_ids_paragraph):
        """Concatenate question and paragraph ids and pad to ``max_seq_len``.

        Returns:
            ``(input_ids, token_type_ids, attention_mask)`` as Python lists.
            token_type_ids are 0 for the question part, 1 for the paragraph
            part and 0 for padding; attention_mask is 1 on real tokens and 0
            on padding.
        """
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        input_ids = input_ids_question + input_ids_paragraph + [self.pad_token_id] * padding_len
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len
        return input_ids, token_type_ids, attention_mask

In [35]:
# One QADataset per split, built from that split's questions and tokenizations.
train_set, dev_set, test_set = (
    QADataset(split, questions, tokenized_questions, tokenized_paragraphs)
    for split, questions, tokenized_questions, tokenized_paragraphs in (
        ('train', train_questions, train_questions_tokenized, train_paragraphs_tokenized),
        ('dev', dev_questions, dev_questions_tokenized, dev_paragraphs_tokenized),
        ('test', test_questions, test_questions_tokenized, test_paragraphs_tokenized),
    )
)

train_batch_size = 4
train_loader = DataLoader(dataset = train_set, batch_size = train_batch_size, shuffle = True, pin_memory = True)
# Dev/test items are stacks of paragraph windows whose count can differ from
# item to item, so they are loaded one question at a time and in order.
dev_loader, test_loader = (
    DataLoader(dataset = eval_set, batch_size = 1, shuffle = False, pin_memory = True)
    for eval_set in (dev_set, test_set)
)

In [36]:
def evaluate(data, output):
    """Pick the best-scoring valid answer span over all windows of one question.

    For every window the highest start logit and the highest end logit are
    taken independently. Windows whose best end lies before their best start
    do not describe a span and are skipped, so they can no longer replace a
    valid answer found in another window with an empty string.

    Args:
        data: One batch from the dev/test loader; ``data[0]`` holds the input
            ids, indexed here as ``[0][window][token]``.
        output: Model output exposing ``start_logits`` and ``end_logits``,
            indexed here as ``[window][token]``.

    Returns:
        The decoded answer with spaces removed, or '' when no window yields
        a span with start <= end.
    """
    answer = ''
    max_prob = float('-inf')
    num_windows = data[0].shape[1]

    for k in range(num_windows):
        start_prob, start_index = torch.max(output.start_logits[k], dim=0)
        end_prob, end_index = torch.max(output.end_logits[k], dim=0)

        # Not a span: ignore this window entirely instead of letting its
        # score win and wipe out an earlier valid answer.
        if start_index > end_index:
            continue

        # Score of the span = sum of its start and end logits.
        prob = start_prob + end_prob

        if prob > max_prob:
            max_prob = prob
            # Uses the notebook-level `tokenizer` to turn the token ids back into text.
            answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])

    # Remove the spaces that appear between tokens in the decoded text.
    return answer.replace(' ', '')

In [37]:
from torch.optim.lr_scheduler import LambdaLR

def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
    """
    Build a learning-rate schedule with linear warmup followed by linear decay.

    The multiplier applied to the optimizer's initial lr rises linearly from 0 to 1
    over the first `num_warmup_steps` steps, then falls linearly, reaching 0 at
    `num_training_steps` and staying there.
    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer whose learning rate is scheduled.
        num_warmup_steps (`int`):
            Length of the warmup phase, in steps.
        num_training_steps (`int`):
            Total number of training steps.
        last_epoch (`int`, *optional*, defaults to -1):
            Index of the last epoch when resuming training.
    Return:
        `torch.optim.lr_scheduler.LambdaLR` applying this multiplier.
    """
    # Both denominators are floored at 1 so neither phase can divide by zero.
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def lr_lambda(current_step: int):
        still_warming_up = current_step < num_warmup_steps
        if still_warming_up:
            return float(current_step) / warmup_span
        steps_left = float(num_training_steps - current_step)
        return max(0.0, steps_left / decay_span)

    return LambdaLR(optimizer, lr_lambda, last_epoch)

In [42]:
# Training configuration.
num_epochs = 1
logging_step = 100
validation = True
learning_rate = 1e-4
optimizer = AdamW(model.parameters(), lr = learning_rate)
# Gradients from `accum_iter` consecutive batches are summed before each optimizer update.
accum_iter = 4
# The loop below updates the optimizer once per `accum_iter` batches plus once
# for a trailing partial group, i.e. ceil(len(train_loader) / accum_iter) times
# per epoch. Derive the decay horizon from that instead of a hard-coded count
# so it stays right when the batch size, accumulation or epoch count changes.
updates_per_epoch = (len(train_loader) + accum_iter - 1) // accum_iter
scheduler = get_linear_schedule_with_warmup(optimizer, 0, num_epochs * updates_per_epoch)

if fp16_training:
    model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)

model.train()
print("Start Training ...")

for epoch in range(num_epochs):
    step = 1
    train_loss = train_acc = 0.0
    for batch_idx, data in enumerate(tqdm(train_loader)):
        # data = (input_ids, token_type_ids, attention_mask, answer_start, answer_end)
        data = [i.to(device) for i in data]
        output = model(input_ids = data[0], token_type_ids = data[1], attention_mask = data[2], start_positions = data[3], end_positions = data[4])
        start_index = torch.argmax(output.start_logits, dim=1)
        end_index = torch.argmax(output.end_logits, dim=1)
        # A prediction counts as correct only when both start and end match.
        # .item() keeps the running sum a plain float, like train_loss.
        train_acc += ((start_index == data[3]) & (end_index == data[4])).float().mean().item()
        train_loss += output.loss.item()

        # Scale the loss so the accumulated gradient is the mean over the group of batches.
        loss = output.loss / accum_iter

        if fp16_training:
            accelerator.backward(loss)
        else:
            loss.backward()

        # Update after every full group of batches and after the final batch of the epoch.
        if ((batch_idx + 1) % accum_iter == 0) or (batch_idx + 1 == len(train_loader)):
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

        if step % logging_step == 0:
            print(f'Epoch {epoch + 1} | Step {step} | loss = {train_loss / logging_step : .3f}, acc = {train_acc / logging_step : .3f}')
            train_acc = train_loss = 0.0

        step += 1

    if validation:
        print("Evaluating Dev Set ...")
        model.eval()
        dev_acc = 0.0
        with torch.no_grad():
            for i, data in enumerate(tqdm(dev_loader)):
                # The dev loader uses batch_size 1: drop that dimension so the
                # windows of the question form the batch.
                output = model(input_ids = data[0].squeeze(dim=0).to(device), token_type_ids = data[1].squeeze(dim=0).to(device), attention_mask = data[2].squeeze(dim=0).to(device))
                # Exact string match against the reference answer.
                dev_acc += evaluate(data, output) == dev_questions[i]['answer_text']
            print(f'Validation | Epoch {epoch + 1} | acc = {dev_acc / len(dev_loader) : .3f}')
        model.train()

print("Saving Model ...")
model_save_dir = 'saved_model'
model.save_pretrained(model_save_dir)

Start Training ...


Widget Javascript not detected.  It may not be installed or enabled properly.


RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:76] data. DefaultCPUAllocator: not enough memory: you tried to allocate 3162112 bytes.

In [13]:
print("Evaluating Test Set ...")
model.eval()

result = []

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for data in tqdm(test_loader):
        # Drop the leading batch dimension of each field and move it to the
        # device; the windows of the question then form the model batch.
        input_ids, token_type_ids, attention_mask = (field.squeeze(dim=0).to(device) for field in data)
        output = model(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)
        result.append(evaluate(data, output))

result_file = 'result.csv'
with open(result_file, 'w', encoding = 'utf-8') as f:
    # Commas are stripped from each answer so every row keeps exactly two columns.
    rows = [f"{test_questions[i]['id']},{result[i].replace(',', '')}\n" for i in range(len(result))]
    f.writelines(['Id,Answer\n'] + rows)

print(f"Completed! Result is in {result_file}")

Evaluating Test Set ...


Widget Javascript not detected.  It may not be installed or enabled properly.



Completed! Result is in result.csv


In [18]:
# Scratch check: a 3x3 integer matrix holding 1..9 in row-major order.
a = torch.arange(1, 10).reshape(3, 3)
print(a)

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])


In [20]:
# Scratch check: view the matrix as a single dimension covering all of its elements.
b = a.view(a.numel())
print(b.shape)

torch.Size([9])


In [15]:
# Scratch check: draw one float from [0.0, 1.0) using NumPy's global generator.
x = np.random.random_sample()

In [16]:
# Show the value drawn in the previous cell.
print(x)

0.5488135039273248


In [20]:
# NOTE(review): this rebinds the notebook-wide `device` that earlier cells set
# from CUDA availability / the accelerator. Any cell run after this one that
# calls `.to(device)` will use the CPU — confirm this override is intended or
# remove the cell.
device = 'cpu'