In [1]:
import os
path = "/content/drive/My Drive/NLP/nlp_AFQMC"
os.chdir(path)

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 2.7MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 6.8MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 12.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |██████████

In [3]:
import time
import json
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import transformers
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizer,BertConfig
from transformers import BertModel,BertForSequenceClassification
from torch.optim import lr_scheduler
from utils import get_time_diff, print_ans, EarlyStopping
from torch.utils.data import DataLoader, Dataset
from config import Config


In [4]:
%reload_ext autoreload
%autoreload 2

In [5]:
def seed_everything(seed):
    # 设置随机种子
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(42)

In [6]:
class SentenceDataset(Dataset):
    def __init__(self, df, config):
        self.df = df
        self.max_len = config.max_len
        self.labeled = 'label' in df
        self.tokenizer = BertTokenizer.from_pretrained(config.bert_vocab_path)
        #self.tokenizer = BertTokenizer.from_pretrained("bert-base-chinese") 

    def __getitem__(self, index):
        data = {}
        row = self.df.iloc[index]
        input_features = self.tokenizer.encode_plus(str(row.sentence1), str(row.sentence2), 
                            max_length=self.max_len * 2, pad_to_max_length=True, add_special_tokens=True,
                            return_token_type_ids=True, return_attention_mask=True)
        data['ids'] = torch.tensor(input_features['input_ids']) # 获取token id
        data['masks'] = torch.tensor(input_features['attention_mask']) # 获取掩码
        data['token_type_ids'] = torch.tensor(input_features['token_type_ids']) # 获取句子归属
        
        if self.labeled:
            data['label'] = torch.tensor(row.label)
        return data
      
    def __len__(self):
        return len(self.df)

In [7]:
def get_dataLoader(sentence_dataset, batch_size, shuffle=True): 
    # 获取 data loader
    loader = DataLoader(
        sentence_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=False
    )
    return loader


In [8]:
class FGM():
    # 对抗训练
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=1., emb_name='word_embeddings'):
        # 对词嵌入进行扰动
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
         # 恢复
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

In [9]:
class BertModelBase(nn.Module):
    # bert 模型
    def __init__(self, config, mulit_dropout=False):
        super(BertModelBase, self).__init__()
        self.bert_config = BertConfig.from_pretrained(
            config.bert_config_path, output_hidden_states=True)      
        self.bert = BertModel.from_pretrained(
            config.bert_model_path, config=self.bert_config)
        self.fc = nn.Linear(self.bert_config.hidden_size, 2)
        self.mulit_dropout = mulit_dropout
        self.dropout = nn.Dropout(0.2)
        self.dropouts = nn.ModuleList([nn.Dropout(0.2) for _ in range(6)])
        nn.init.normal_(self.fc.weight, mean=0.0, std=0.02)
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, token_type_ids, attention_mask):
        last_hidden_state, pooler_output, hidden_states = self.bert(input_ids=input_ids, 
                                        attention_mask=attention_mask,
                                        token_type_ids=token_type_ids)
        if self.mulit_dropout: # multi sample dropout
            predict = torch.mean(
                torch.stack(
                    [self.fc(dropout(pooler_output)) for dropout in self.dropouts],
                    dim=0
                ),
                dim=0
            )
        else:
            predict = self.fc(self.dropout(pooler_output))
        return predict

In [10]:
def get_loss(pred, label, smoothing=True, eps=0.1):
  # 标签平滑，未加入进行实验
  if smoothing:
    log_pred = F.log_softmax(pred, dim=1)
    one_hot = torch.zeros_like(pred).scatter(1, label.view(-1,1), 1)
    smooth_label = one_hot * (1.0 - eps) + (1.0 - one_hot) * eps
    loss = -(log_pred * smooth_label)
    loss = loss.sum(dim=-1).mean()
    return loss
  else:
    return F.cross_entropy(pred, label)

In [11]:
def train(data_loader, model, device, optimizer, use_fgm=True):
    # 训练模型
    optimizer.zero_grad()
    model.train()
    start_time = time.time()
    criterion = nn.CrossEntropyLoss()
    if use_fgm: # 开启对抗训练
        fgm = FGM(model)
    for i, batch in enumerate(data_loader):
        ids = batch['ids']
        masks = batch['masks']
        token_type_ids = batch['token_type_ids']
        labels = batch['label']
        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        masks = masks.to(device, dtype=torch.long)
        labels = labels.to(device, dtype=torch.long)
        optimizer.zero_grad()
        outputs = model(ids, token_type_ids, masks)
        loss = criterion(outputs.view(-1, 2), labels.view(-1))
        loss.backward()
        # trick 对抗训练
        if use_fgm:
            fgm.attack()  ##对抗训练
            adv_outputs = model(ids, token_type_ids, masks)
            loss_adv = criterion(adv_outputs.view(-1, 2), labels.view(-1))
            loss_adv.backward()
            fgm.restore()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # 梯度截断
        optimizer.step()
        if (i + 1) % 200 == 0:
            true_label = labels.data.cpu().numpy()
            predict = torch.max(outputs, dim=1)[1].cpu().numpy()
            train_acc = metrics.accuracy_score(true_label, predict)
            time_diff = get_time_diff(start_time)
            msg = 'Iter:{0:>6} Train loss: {1:>5.3} Train acc:{2:>6.2%} Time:{3}'
            print(msg.format(i + 1, loss.item(), train_acc, time_diff))

def dev(data_loader, model, device):
    # 验证集验证
    model.eval()
    total_loss = 0
    predicts_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    criterion = nn.CrossEntropyLoss()
    start_time = time.time()
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['ids']
            masks = batch['masks']
            token_type_ids = batch['token_type_ids']
            labels = batch['label']
            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            masks = masks.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.long)
            outputs = model(ids, token_type_ids, masks)
            loss = criterion(outputs, labels.view(-1))
            total_loss += loss.item()
            predict = torch.max(outputs, dim=1)[1].cpu().numpy()
            labels = labels.data.cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predicts_all = np.append(predicts_all, predict)
    acc = metrics.accuracy_score(labels_all, predicts_all)
    report = metrics.classification_report(labels_all, predicts_all, digits=4)
    confusion = metrics.confusion_matrix(labels_all, predicts_all)
    f1_score = metrics.f1_score(labels_all, predicts_all, average='macro')
    return acc, total_loss / len(data_loader), report, confusion, f1_score

def inference(config):
    # 预测
    predicts_all = np.array([], dtype=int)
    test_df = pd.read_csv(config.test_data_path)
    test_dataset = SentenceDataset(test_df, config)
    test_loader = get_dataLoader(test_dataset, config.batch_size, shuffle=False)
    device = config.device
    model = BertModelBase(config)
    model.to(device)
    model.eval()
    with torch.no_grad():
      for batch in test_loader:
        ids = batch['ids']
        masks = batch['masks']
        token_type_ids = batch['token_type_ids']
        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        masks = masks.to(device, dtype=torch.long)
        outputs = model(ids, token_type_ids, masks)
        predict = torch.max(outputs, dim=1)[1].cpu().numpy()
        predicts_all = np.append(predicts_all, predict)

    output_submit_file = os.path.join(config.predict_output_path)
    # 保存标签结果
    with open(output_submit_file, "w") as writer:
        for i, pred in enumerate(predicts_all):
            json_d = {}
            json_d['id'] = i
            json_d['label'] = str(pred)
            writer.write(json.dumps(json_d) + '\n')

In [12]:
def run(config):
    # 加载数据
    train_df = pd.read_csv(config.train_data_path)
    dev_df = pd.read_csv(config.dev_data_path)
    train_dataset = SentenceDataset(train_df, config)
    dev_dataset = SentenceDataset(dev_df, config)
    train_loader = get_dataLoader(train_dataset, config.batch_size, shuffle=False)
    dev_loader = get_dataLoader(dev_dataset, config.batch_size, shuffle=False)
    # 加载模型
    model = BertModelBase(config, True)
    model.to(config.device)
    no_decay = ["bias", "LayerNorm.weight"]
    param_optimizer = list(model.named_parameters())
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=config.learn_rate, betas=(0.9, 0.999))
    es = EarlyStopping(patience=2)
    dev_best_loss = float('inf')
    # 开始训练
    for i in range(config.epoch_num):
        train(train_loader, model, config.device, optimizer)
        dev_acc, dev_loss, report, confusion, f1_score = dev(dev_loader, model, config.device)
        print_ans(dev_acc, dev_loss, report, confusion)
        es(f1_score, model, config.model_save_path)
        if es.early_stop:
          print("Early stopping")
          break
    # 载入最优模型
    model.load_state_dict(torch.load(config.model_save_path))
    dev_acc, dev_loss, report, confusion, f1_score = dev(dev_loader, model, config.device)
    print_ans(dev_acc, dev_loss, report, confusion)

In [13]:
config = Config()
config.model_save_path = 'saveModel/bert_base.pt'
config.learn_rate = 2e-5 # 调低学习率，不然效果特别差

In [14]:
run(config)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated
Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


Iter:   200 Train loss: 0.613 Train acc:67.19% Time:0:02:47
Iter:   400 Train loss: 0.492 Train acc:75.00% Time:0:05:33
Dev Loss: 0.51, Dev Acc:71.99%
Precision, Recall and F1-Score...
              precision    recall  f1-score   support

           0     0.7445    0.9043    0.8167      2978
           1     0.5923    0.3094    0.4065      1338

    accuracy                         0.7199      4316
   macro avg     0.6684    0.6069    0.6116      4316
weighted avg     0.6973    0.7199    0.6895      4316

Confusion Matrix...
[[2693  285]
 [ 924  414]]
Validation score improved (-inf --> 0.6115797101611097). Saving model!
Iter:   200 Train loss: 0.523 Train acc:71.88% Time:0:02:46
Iter:   400 Train loss: 0.421 Train acc:84.38% Time:0:05:32
Dev Loss:  0.5, Dev Acc:73.84%
Precision, Recall and F1-Score...
              precision    recall  f1-score   support

           0     0.7951    0.8365    0.8153      2978
           1     0.5883    0.5202    0.5522      1338

    accuracy         

In [None]:
inference(config)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [None]:
import gc
gc.collect() 

14899

In [None]:
torch.cuda.empty_cache()