In [None]:
import torch
import os
import random
import numpy as np

# Central experiment configuration; every later cell reads its settings from here.
config = {
    # JSON-lines data splits (one JSON object per line, see parse_data).
    'train_file_path': '/content/drive/MyDrive/train.json',
    'dev_file_path': '/content/drive/MyDrive/dev.json',
    'test_file_path': '/content/drive/MyDrive/test.json',
    # Directory that receives the checkpoints written by train().
    'output_path': '/content/drive/MyDrive/output',
    # Directory holding both the pretrained model and its tokenizer.
    'model_path': '/content/drive/MyDrive/BERT_model',
    # Batch size; multiplied by the GPU count further down.
    'batch_size': 64,
    'num_epoches': 1,
    # Upper bound on the padded/truncated sequence length used by the collator.
    'max_seq_len': 64,
    'learning_rate': 2e-5,
    # Passed as `epsilon` to the FGM / PGD attack.
    'eps': 0.1,
    # Passed as `alpha` to the PGD attack.
    'alpha': 0.3,
    # 'fgm' selects FGM adversarial training; any other value selects PGD.
    'adv': 'fgm',
    # Fraction of the total training steps used as warmup steps.
    'warmup_ratio': 0.05,
    # Weight decay for parameters that are not biases or LayerNorm weights.
    'weight_decay': 0.01,
    # Whether the training loader batches through BucketBatchSampler.
    'use_bucket': True,
    'bucket_multiplier': 200,
    # Overwritten with 'cpu' below when CUDA is not available.
    'device': 'cuda',
    # Filled in below with torch.cuda.device_count() when CUDA is available.
    'n_gpus': 0,
    # Enables the GradScaler / autocast code paths in train().
    'use_amp': True,
    # Run evaluation every this many optimizer steps.
    'logging_step': 300,
    # EMA is switched on at the first evaluation at or after this step.
    'ema_start_step': 500,
    # Runtime flag: train() sets it to True once EMA has been started.
    'ema_start': False,
    'seed': 2021
}

# Fall back to CPU when CUDA is missing; otherwise record the GPU count and
# scale the batch size by it.
if not torch.cuda.is_available():
  config['device'] = 'cpu' 
else:
  config['n_gpus'] = torch.cuda.device_count()
  config['batch_size'] *= config['n_gpus']

# Make sure the checkpoint directory exists before training writes into it.
if not os.path.exists(config['output_path']):
  os.makedirs((config['output_path']))

def seed_everything(seed):
  """Seed every random number generator this notebook relies on.

  Args:
    seed: Value handed, in order, to ``random.seed``, ``np.random.seed``,
      ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``.
  """
  seeders = (
      random.seed,
      np.random.seed,
      torch.manual_seed,
      torch.cuda.manual_seed_all,
  )
  for apply_seed in seeders:
    apply_seed(seed)

# Seed all RNGs once, up front, with the configured seed.
seed_everything(config['seed'])


In [None]:
from tqdm import tqdm
import json
import pandas as pd

def parse_data(path, data_type='train'):
  """Load a JSON-lines sentence-pair file into a DataFrame.

  Every line of ``path`` is decoded as one JSON object whose ``sentence1``
  and ``sentence2`` entries become the two text columns.

  Args:
    path: Location of the UTF-8 encoded JSON-lines file.
    data_type: Name of the split. It labels the progress bar, and when it
      equals ``'test'`` the ``label`` entry is not read and every row gets
      the placeholder label ``0``.

  Returns:
    A DataFrame with the columns ``text_a``, ``text_b`` and ``labels``.
  """
  is_test = data_type == 'test'
  records = []
  with open(path, 'r', encoding='utf8') as f:
    for raw_line in tqdm(f.readlines(), desc=f'Reading {data_type} data'):
      example = json.loads(raw_line)
      first, second = example['sentence1'], example['sentence2']
      target = 0 if is_test else int(example['label'])
      records.append((first, second, target))

  return pd.DataFrame(records, columns=['text_a', 'text_b', 'labels'])

In [None]:
def build_bert_inputs(inputs, label, sentence_a, sentence_b, tokenizer):
  """Tokenize one sentence pair and append its features to ``inputs``.

  Args:
    inputs: Mapping of feature name to list; it is mutated in place and must
      supply a list for ``input_ids``, ``token_type_ids``,
      ``attention_mask`` and ``labels`` on lookup.
    label: Label stored alongside the encoded pair.
    sentence_a: First sentence of the pair.
    sentence_b: Second sentence of the pair.
    tokenizer: Object exposing ``encode_plus``; its result is indexed by the
      three feature names above.
  """
  encoded = tokenizer.encode_plus(
      sentence_a,
      sentence_b,
      add_special_tokens=True,
      return_token_type_ids=True,
      return_attention_mask=True,
  )
  for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    inputs[field].append(encoded[field])
  inputs['labels'].append(label)


In [None]:
from collections import defaultdict
def read_data(config, tokenizer):
  """Parse and tokenize the train, dev and test splits.

  Args:
    config: Mapping providing ``train_file_path``, ``dev_file_path`` and
      ``test_file_path``.
    tokenizer: Tokenizer forwarded to ``build_bert_inputs`` for every pair.

  Returns:
    A dict keyed by ``'train'``, ``'dev'`` and ``'test'``. Each value is a
    ``defaultdict(list)`` filled by ``build_bert_inputs`` with one entry per
    example.
  """
  train_df = parse_data(config['train_file_path'], data_type='train')
  dev_df = parse_data(config['dev_file_path'], data_type='dev')
  test_df = parse_data(config['test_file_path'], data_type='test')

  data_df = {'train': train_df, 'dev': dev_df, 'test': test_df}

  processed_data = {}

  for data_type, df in data_df.items():
    inputs = defaultdict(list)
    # Address the columns by name. Integer keys such as row[2] on a Series
    # indexed by column labels are deprecated positional lookups in pandas.
    rows = zip(df['text_a'], df['text_b'], df['labels'])
    for sentence_a, sentence_b, label in tqdm(rows, desc=f'Preprocessing {data_type} data', total=len(df)):
      build_bert_inputs(inputs, label, sentence_a, sentence_b, tokenizer)

    processed_data[data_type] = inputs

  return processed_data

In [None]:
from transformers import BertTokenizer
# Load the tokenizer from the same directory as the pretrained model.
tokenizer = BertTokenizer.from_pretrained(config['model_path'])

# Parse and tokenize all three splits; dt maps split name -> feature lists.
dt = read_data(config, tokenizer)

Reading train data: 100%|██████████| 34334/34334 [00:00<00:00, 225174.91it/s]
Reading dev data: 100%|██████████| 4316/4316 [00:00<00:00, 192829.24it/s]
Reading test data: 100%|██████████| 3861/3861 [00:00<00:00, 226695.33it/s]
Preprocessing train data: 100%|██████████| 34334/34334 [00:24<00:00, 1424.02it/s]
Preprocessing dev data: 100%|██████████| 4316/4316 [00:03<00:00, 1432.20it/s]
Preprocessing test data: 100%|██████████| 3861/3861 [00:02<00:00, 1415.03it/s]


In [None]:
# Sanity check: show the token ids of the first encoded training pair.
print(dt['train']['input_ids'][0])

[101, 6010, 6009, 955, 1446, 5023, 7583, 6820, 3621, 1377, 809, 2940, 2768, 1044, 2622, 1400, 3315, 1408, 102, 955, 1446, 3300, 1044, 2622, 1168, 3309, 6820, 3315, 1408, 102]


In [None]:
from torch.utils.data import Dataset
class AFQMCDataset(Dataset):
  """Map-style dataset over pre-tokenized sentence-pair features.

  ``data_dict`` must provide four index-aligned sequences under the keys
  ``input_ids``, ``token_type_ids``, ``attention_mask`` and ``labels``.
  """

  def __init__(self, data_dict):
    super().__init__()
    self.data_dict = data_dict

  def __len__(self):
    """Number of examples, measured on the ``input_ids`` sequence."""
    return len(self.data_dict['input_ids'])

  def __getitem__(self, idx):
    """Return ``(input_ids, token_type_ids, attention_mask, label)`` at ``idx``."""
    features = self.data_dict
    return (
        features['input_ids'][idx],
        features['token_type_ids'][idx],
        features['attention_mask'][idx],
        features['labels'][idx],
    )

In [None]:
class Collator():
  """Collate function that pads or truncates a batch into ``torch.long`` tensors.

  Args:
    max_seq_len: Hard cap on the sequence length of any produced batch.
    tokenizer: Object whose ``sep_token_id`` is written as the last token of
      every truncated sequence.
  """

  def __init__(self, max_seq_len, tokenizer):
    self.max_seq_len = max_seq_len
    self.tokenizer = tokenizer

  def pad_and_truncate(self, input_ids_list, token_type_ids_list, attention_mask_list, labels_list, max_seq_len):
    """Build zero-padded ``(batch, max_seq_len)`` tensors from ragged lists.

    Sequences longer than ``max_seq_len`` are cut to that length, and the
    final kept input id is replaced by the tokenizer's ``sep_token_id``.

    Returns:
      ``(input_ids, token_type_ids, attention_mask, labels)`` tensors.
    """
    batch_size = len(input_ids_list)
    input_ids = torch.zeros((batch_size, max_seq_len), dtype=torch.long)
    token_type_ids = torch.zeros_like(input_ids)
    attention_mask = torch.zeros_like(input_ids)

    for i, ids in enumerate(input_ids_list):
      type_ids = token_type_ids_list[i]
      mask = attention_mask_list[i]
      keep = len(ids)

      if keep > max_seq_len:
        # Too long: trim all three features and close the sequence with [SEP].
        keep = max_seq_len
        ids = ids[:max_seq_len - 1] + [self.tokenizer.sep_token_id]
        type_ids = type_ids[:max_seq_len]
        mask = mask[:max_seq_len]

      input_ids[i, :keep] = torch.tensor(ids, dtype=torch.long)
      token_type_ids[i, :keep] = torch.tensor(type_ids, dtype=torch.long)
      attention_mask[i, :keep] = torch.tensor(mask, dtype=torch.long)

    labels = torch.tensor(labels_list, dtype=torch.long)
    return input_ids, token_type_ids, attention_mask, labels

  def __call__(self, examples):
    """Collate ``examples`` (4-tuples as produced by the dataset) into a dict.

    The batch is padded to its own longest sequence, capped at
    ``self.max_seq_len``.
    """
    input_ids_list, token_type_ids_list, attention_mask_list, labels_list = zip(*examples)
    longest_in_batch = max(map(len, input_ids_list))
    target_len = min(longest_in_batch, self.max_seq_len)

    tensors = self.pad_and_truncate(
        input_ids_list, token_type_ids_list, attention_mask_list,
        labels_list, target_len)

    field_names = ('input_ids', 'token_type_ids', 'attention_mask', 'labels')
    return dict(zip(field_names, tensors))
                        

In [None]:
# Shared collate function: pads each batch, capped at config['max_seq_len'].
collate_fn = Collator(config['max_seq_len'], tokenizer)

In [None]:
from bucket_sampler import BucketBatchSampler
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler

def build_dataloader(config, data, collate_fn):
  """Create the train, dev and test DataLoaders.

  Args:
    config: Mapping providing ``use_bucket``, ``batch_size`` and, when
      bucketing is on, ``bucket_multiplier``.
    data: Mapping with ``'train'``, ``'dev'`` and ``'test'`` feature dicts.
    collate_fn: Callable that turns a list of examples into a batch.

  Returns:
    ``(train_dataloader, dev_dataloader, test_dataloader)``. The training
    loader either batches through ``BucketBatchSampler`` or shuffles; the
    dev and test loaders keep the dataset order.
  """
  train_dataset, dev_dataset, test_dataset = (
      AFQMCDataset(data[split]) for split in ('train', 'dev', 'test'))

  def ordered_loader(dataset):
    # Evaluation-style loader: fixed order, same batch size as training.
    return DataLoader(dataset, batch_size=config['batch_size'],
                      shuffle=False, num_workers=4, collate_fn=collate_fn)

  if not config['use_bucket']:
    train_dataloader = DataLoader(train_dataset, batch_size=config['batch_size'],
                                  shuffle=True, num_workers=4, collate_fn=collate_fn)
  else:
    def token_count(index):
      # Sort key for bucketing: length of the example's input_ids.
      return len(train_dataset[index][0])

    bucket_sampler = BucketBatchSampler(
        RandomSampler(train_dataset),
        batch_size=config['batch_size'],
        drop_last=False,
        sort_key=token_count,
        bucket_size_multiplier=config['bucket_multiplier'])
    train_dataloader = DataLoader(dataset=train_dataset, batch_sampler=bucket_sampler,
                                  num_workers=4, collate_fn=collate_fn)

  dev_dataloader = ordered_loader(dev_dataset)
  test_dataloader = ordered_loader(test_dataset)
  return train_dataloader, dev_dataloader, test_dataloader

In [None]:
# Build the three loaders from the tokenized splits and the shared collator.
train_dataloader, dev_dataloader, test_dataloader = build_dataloader(config, dt, collate_fn)

  cpuset_checked))


In [None]:
# Sanity check: print the first collated training batch, then stop iterating.
for i in train_dataloader:
  print(i)
  break

  cpuset_checked))


{'input_ids': tensor([[ 101, 2376, 2769,  ..., 6820, 7178,  102],
        [ 101, 5709, 1446,  ..., 3309, 1408,  102],
        [ 101, 4385, 1762,  ...,  749, 1435,  102],
        ...,
        [ 101, 6010, 6009,  ..., 4500, 1905,  102],
        [ 101,  955, 1446,  ..., 3621, 1408,  102],
        [ 101,  711,  784,  ...,  115,  115,  102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        1, 0, 1

In [None]:
from sklearn.metrics import f1_score,accuracy_score
def evaluation(model, config, val_dataloader):
  """Run ``model`` over ``val_dataloader`` and score its predictions.

  Args:
    model: Callable model; given a batch as keyword arguments, the first
      two elements of its output are used as ``(loss, logits)``.
    config: Mapping providing ``device`` and ``n_gpus``.
    val_dataloader: Loader yielding dict batches that include ``labels``.

  Returns:
    ``(avg_val_loss, f1, acc)``: the loss averaged over batches, the
    macro-averaged F1 score and the accuracy.
  """
  model.eval()
  batch_preds, batch_labels = [], []
  running_loss = 0.
  progress = tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader))

  with torch.no_grad():
    for batch in progress:
      # Keep the reference labels as yielded by the loader.
      batch_labels.append(batch['labels'])
      on_device = {key: tensor.to(config['device']) for key, tensor in batch.items()}

      outputs = model(**on_device)
      loss, logits = outputs[:2]
      if config['n_gpus'] > 1:
        # With several GPUs the loss is reduced to one scalar first.
        loss = loss.mean()
      running_loss += loss.item()

      batch_preds.append(logits.argmax(dim=-1).detach().cpu())

  avg_val_loss = running_loss / len(val_dataloader)
  y_true = torch.cat(batch_labels, dim=0).numpy()
  y_pred = torch.cat(batch_preds, dim=0).numpy()

  macro_f1 = f1_score(y_true, y_pred, average='macro')
  accuracy = accuracy_score(y_true, y_pred)
  return avg_val_loss, macro_f1, accuracy


In [None]:
from types import new_class
class EMA:
  """Exponential moving average (EMA) of a model's trainable parameters.

  A "shadow" copy of every parameter with ``requires_grad=True`` is kept and
  blended with the live weights on each ``update()``. ``apply_shadow()`` swaps
  the shadow weights into the model (for evaluation or saving) and
  ``restore()`` swaps the original weights back.

  Args:
    model: Module whose ``named_parameters()`` are tracked.
    decay: Weight given to the previous shadow value on each update; the
      live parameter gets ``1 - decay``.
  """

  def __init__(self, model, decay):
    self.model = model
    self.decay = decay
    self.shadow = {}
    self.backup = {}
    self.register()

  def register(self):
    """Initialise the shadow weights as clones of the current parameters."""
    for name, param in self.model.named_parameters():
      if param.requires_grad:
        self.shadow[name] = param.data.clone()

  def update(self):
    """Blend the current parameters into the shadow weights."""
    for name, param in self.model.named_parameters():
      if param.requires_grad:
        assert name in self.shadow
        new_average = (1.0 - self.decay) * param.data + self.decay * self.shadow[name]
        self.shadow[name] = new_average.clone()

  def apply_shadow(self):
    """Back up the live parameters and replace them with the shadow weights."""
    for name, param in self.model.named_parameters():
      if param.requires_grad:
        assert name in self.shadow
        self.backup[name] = param.data
        param.data = self.shadow[name]

  def restore(self):
    """Put back the parameters saved by ``apply_shadow`` and clear the backup."""
    for name, param in self.model.named_parameters():
      if param.requires_grad:
        assert name in self.backup
        param.data = self.backup[name]
    self.backup = {}

  # The method used to be published under this misspelled name; keep it as an
  # alias so existing callers continue to work.
  resrore = restore
                                            

In [None]:
from extra_loss import *
from extra_optim import *
from extra_fgm import *
from extra_pgd import *
from transformers import AdamW, BertForSequenceClassification
from torch.cuda import amp
from tqdm import trange
def train(config, train_dataloader, dev_dataloader):
  """Fine-tune the BERT classifier with adversarial training, AMP and EMA.

  Each step runs a clean forward/backward pass followed by an adversarial
  pass (FGM when ``config['adv'] == 'fgm'``, otherwise PGD with ``K = 3``
  inner steps) before the optimizer update. Every ``config['logging_step']``
  steps the model is evaluated on ``dev_dataloader``; a checkpoint is saved
  under ``config['output_path']`` whenever the dev accuracy improves. Once
  EMA is active, evaluation and checkpointing use the EMA weights and the
  raw training weights are swapped back in afterwards.

  Args:
    config: Experiment configuration mapping. ``config['ema_start']`` is set
      to ``True`` as a side effect once EMA has been started.
    train_dataloader: Loader yielding dict batches of training tensors.
    dev_dataloader: Loader handed to ``evaluation``.

  Returns:
    ``(model, best_model_path)``: the model in its final training state and
    the path of the best checkpoint (``''`` if none was saved).
  """

  model = BertForSequenceClassification.from_pretrained(config['model_path'])

  param_optimizer = list(model.named_parameters())

  # Instantiate the GradScaler object
  scaler = amp.GradScaler(enabled=config['use_amp'])

  # Parameters whose name contains one of these substrings get no weight decay.
  no_decay = ['bias', 'LayerNorm.weight']

  optimizer_grouped_parameters = [
    {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     "weight_decay": config['weight_decay']},
    {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0}
  ]

  optimizer = AdamW(optimizer_grouped_parameters, lr=config['learning_rate'], eps=1e-8)
  optimizer = Lookahead(optimizer, 5, 1)
  total_steps = config['num_epoches'] * len(train_dataloader)

  lr_scheduler = WarmupLinearSchedule(optimizer,
                                      warmup_steps=int(config['warmup_ratio']*total_steps),
                                      t_total=total_steps)

  model.to(config['device'])

  if config['adv'] == 'fgm':
    fgm = FGM(model)
  else:
    pgd = PGD(model)
    K = 3

  epoches_iterator = trange(config['num_epoches'])

  global_steps = 0
  train_loss = 0.
  logging_loss = 0.
  best_acc = 0.
  best_model_path = ''

  # EMA state lives in this local variable rather than only in the shared
  # config flag. A config left with ema_start=True by an earlier call would
  # otherwise make the loop use an EMA object that does not exist yet.
  ema = None

  if config['n_gpus'] > 1:
    # Reference DataParallel through the explicitly imported torch package.
    model = torch.nn.DataParallel(model)

  for epoch in epoches_iterator:
    train_iterator = tqdm(train_dataloader, desc='Training', total=len(train_dataloader))
    model.train()

    for batch in train_iterator:
      batch_cuda = {item: value.to(config['device']) for item, value in batch.items()}

      # Clean pass.
      with amp.autocast(enabled=config['use_amp']):
        loss = model(**batch_cuda)[0]
        if config['n_gpus'] > 1:
          loss = loss.mean()

      scaler.scale(loss).backward()

      # Adversarial pass; its gradients are accumulated on top of the clean ones.
      if config['adv'] == 'fgm':
        fgm.attack(epsilon=config['eps'])

        with amp.autocast(enabled=config['use_amp']):
          loss_adv = model(**batch_cuda)[0]
          if config['n_gpus'] > 1:
            loss_adv = loss_adv.mean()

        scaler.scale(loss_adv).backward()
        fgm.restore()
      else:
        pgd.backup_grad()
        for t in range(K):
          pgd.attack(epsilon=config['eps'], alpha=config['alpha'], is_first_attack=(t==0))
          if t != K-1:
            model.zero_grad()
          else:
            # Last inner step: bring the clean gradients back before the
            # final adversarial backward pass.
            pgd.restore_grad()

          with amp.autocast(enabled=config['use_amp']):
            loss_adv = model(**batch_cuda)[0]
            if config['n_gpus'] > 1:
              loss_adv = loss_adv.mean()

          scaler.scale(loss_adv).backward()
        pgd.restore()

      scaler.step(optimizer)
      scaler.update()

      lr_scheduler.step()
      optimizer.zero_grad()

      if ema is not None:
        ema.update()

      train_loss += loss.item()
      global_steps +=1

      train_iterator.set_postfix_str(f'running training loss: {loss.item():.4f}')


      if global_steps % config['logging_step'] == 0:
        if global_steps >= config['ema_start_step'] and ema is None:
          print('\n>>>EMA starting...')
          config['ema_start'] = True
          ema = EMA(model.module if hasattr(model, 'module') else model, decay=0.999)

        print_train_loss = (train_loss - logging_loss)/ config['logging_step']
        logging_loss = train_loss

        # Evaluate (and possibly checkpoint) with the EMA weights once active.
        if ema is not None:
          ema.apply_shadow()

        val_loss, f1, acc = evaluation(model, config, dev_dataloader)
        print_log = f'\n>>> training loss: {print_train_loss:.6f}, valid loss: {val_loss:.6f}, '

        if acc > best_acc:
          model_save_path = os.path.join(config['output_path'],
                                         f'checkpoint-{global_steps}-{acc:.6f}')
          model_to_save = model.module if hasattr(model, 'module') else model
          model_to_save.save_pretrained(model_save_path)
          best_acc = acc
          best_model_path = model_save_path

        print_log += f'valid f1: {f1:.6f}, valid acc: {acc:.6f}'
        print(print_log)
        model.train()

        if ema is not None:
          # Swap the raw training weights back in. The EMA helper has exposed
          # this method under the misspelled name `resrore`, so accept either
          # spelling instead of failing with AttributeError.
          restore_weights = getattr(ema, 'restore', None) or ema.resrore
          restore_weights()

  return model, best_model_path

In [None]:
# Run training; keeps the final model and the path of the best checkpoint.
best_model, best_model_path = train(config, train_dataloader, dev_dataloader)

In [None]:
def predict(config, id2label, model, test_dataloader):
  """Predict a label for every test example and attach it to the test table.

  Args:
    config: Mapping providing ``device`` and ``test_file_path``.
    id2label: Lookup from predicted class index to output label.
    model: Callable model; given a batch (including its ``labels`` entry) as
      keyword arguments, element ``1`` of its output is used as the logits.
    test_dataloader: Loader yielding dict batches in test-file order.

  Returns:
    The test file loaded as a DataFrame with an added ``preds`` column.
  """
  model.eval()
  test_iterator = tqdm(test_dataloader, desc='Predicting', total=len(test_dataloader))
  test_preds = []

  with torch.no_grad():
    for batch in test_iterator:
      # `items` has to be called; iterating the bound method raises TypeError.
      batch = {k: v.to(config['device']) for k, v in batch.items()}
      logits = model(**batch)[1]
      test_preds.append(logits.argmax(dim=-1).detach().cpu())
  test_preds = torch.cat(test_preds, dim=0).numpy()
  test_preds = [id2label[idx] for idx in test_preds]

  # The splits in this notebook are JSON-lines files (one object per line),
  # which a CSV parser cannot read. Other extensions keep the CSV behaviour.
  test_path = config['test_file_path']
  if str(test_path).lower().endswith(('.json', '.jsonl')):
    test_df = pd.read_json(test_path, lines=True)
  else:
    test_df = pd.read_csv(test_path, sep=',')
  test_df['preds'] = test_preds

  return test_df