In [1]:
!nvidia-smi

Thu Jul  8 07:31:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 6.6MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 29.0MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |█████

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertConfig, BertTokenizer
import numpy as np
from scipy.stats import pearsonr, spearmanr
from collections import deque
from tqdm import tqdm
import time
import json

BATCH_SIZE = 64
LEARNING_RATE = 1e-5
EPOCHS = 100
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# train_file_path = "./LCQMC.3way.json"
# test_file_path = "../resource/senteval_cn/STS-B/STS-B.test.data"

train_file_path = "/content/drive/MyDrive/dataset/semantic_compare/snli/snli.3way.json"
train_s2_file_path = "/content/drive/MyDrive/dataset/semantic_compare/senteval_cn/LCQMC/LCQMC.train.data"
test_file_path = "/content/drive/MyDrive/dataset/semantic_compare/senteval_cn/STS-B/STS-B.test.data"

pretrained_model_path = "hfl/chinese-roberta-wwm-ext"
# pretrained_model_path = "hfl/chinese-bert-wwm-ext"

bert_config = BertConfig.from_pretrained(pretrained_model_path)
bert_config.hidden_dropout_prob = 0.3
bert_config.attention_probs_dropout_prob = 0.3
tokenizer = BertTokenizer.from_pretrained(pretrained_model_path)


class TrainDataset(Dataset):
    def __init__(self):
        with open(train_file_path, "r", encoding="utf-8") as f:
            self.dataset = json.load(f)
        self.dataset_length = len(self.dataset)
        print("load TrainDataset size:", self.dataset_length)

    def text_to_ids(self, text_anchor: str, text_positive: str, text_negative: str):
        return tokenizer([text_anchor, text_positive, text_negative], padding="max_length", max_length=64, truncation=True,
                         return_tensors="pt")

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        text_anchor = sample["text_anchor"]
        text_positive = sample["text_positive"]
        text_negative = sample["text_negative"]
        return self.text_to_ids(text_anchor, text_positive, text_negative)

    def __len__(self):
        return self.dataset_length


class TrainS2Dataset(Dataset):
    def __init__(self):
        self.dataset = []
        with open(train_s2_file_path, "r", encoding="utf-8") as f:
            for line in f:
                text_a, text_b, score = line.strip().split("\t")
                self.dataset.append(text_a)
                # self.dataset.append(text_b)
        self.dataset_length = len(self.dataset)
        print("load TrainS2Dataset size:", self.dataset_length)

    def text_to_ids(self, text: str):
        return tokenizer([text, text], padding="max_length", max_length=64, truncation=True, return_tensors="pt")

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        return self.text_to_ids(sample)

    def __len__(self):
        return 50000


class TestDataset(Dataset):
    def __init__(self):
        self.dataset = []
        with open(test_file_path, "r", encoding="utf-8") as f:
            for line in f:
                text_a, text_b, score = line.strip().split("\t")
                self.dataset.append({
                    "text_a": text_a,
                    "text_b": text_b,
                    "score": int(score)
                })
        self.dataset_length = len(self.dataset)
        print("load TestDataset size:", self.dataset_length)

    def text_to_ids(self, text: str):
        return tokenizer(text, padding="max_length", max_length=64, truncation=True, return_tensors="pt")

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        return self.text_to_ids(sample["text_a"]), self.text_to_ids(sample["text_b"]), sample["score"]

    def __len__(self):
        return self.dataset_length


def calc_loss(embedding, lamda=0.05):
    batch_size = embedding.shape[0]
    sim_score = F.cosine_similarity(embedding.unsqueeze(dim=1), embedding.unsqueeze(dim=0), dim=-1)

    sim_score = torch.index_select(sim_score, dim=0, index=torch.arange(0, batch_size, step=3, device=device))

    col_select_idx = torch.stack([torch.arange(1, batch_size, step=3), torch.arange(1, batch_size, step=3)], dim=1).reshape(-1).to(
        device)
    sim_score = torch.index_select(sim_score, dim=1, index=col_select_idx)

    sim_score = sim_score / lamda

    y_true = torch.arange(0, batch_size * 2 // 3, step=2, device=device)
    loss = F.cross_entropy(sim_score, y_true)
    return loss


def calc_loss_s2(embedding, lamda=0.05):
    batch_size = embedding.shape[0]
    embedding_a = torch.index_select(embedding, dim=0, index=torch.arange(0, batch_size, step=2, device=device))
    embedding_b = torch.index_select(embedding, dim=0, index=torch.arange(1, batch_size, step=2, device=device))

    sim_score = F.cosine_similarity(embedding_a.unsqueeze(dim=0), embedding_b.unsqueeze(dim=1), dim=-1)
    sim_score = sim_score / lamda

    y_true = torch.arange(0, batch_size // 2, step=1, device=device)
    loss = F.cross_entropy(sim_score, y_true)
    return loss


def test():
    model.eval()
    pbar = tqdm(dataloader_test, position=0, leave=True)
    pbar.set_description("test epoch {}".format(epoch))
    similarity_label = []
    similarity_predict = []
    for input_encoding_a, input_encoding_b, y_target in pbar:
        input_encoding_a = {k: v.reshape(v.shape[0] * v.shape[1], *v.shape[2:]).to(device) for k, v in input_encoding_a.items()}
        input_encoding_b = {k: v.reshape(v.shape[0] * v.shape[1], *v.shape[2:]).to(device) for k, v in input_encoding_b.items()}
        y_target = y_target.to(device)
        embedding_a = model(**input_encoding_a)[1]
        embedding_b = model(**input_encoding_b)[1]

        similarity = torch.cosine_similarity(embedding_a, embedding_b).detach().cpu().numpy()
        y_target = y_target.detach().cpu().numpy()

        similarity_predict.append(similarity)
        similarity_label.append(y_target)

    similarity_predict = np.concatenate(similarity_predict, axis=0)
    similarity_label = np.concatenate(similarity_label, axis=0)
    print("spearmanr:", spearmanr(similarity_predict, similarity_label))
    time.sleep(0.5)


def train():
    model.train()
    train_loss = deque([], maxlen=100)
    pbar = tqdm(dataloader_train, position=0, leave=True)
    pbar.set_description("train epoch {}".format(epoch))
    for input_encoding in pbar:
        input_encoding = {k: v.reshape(v.shape[0] * v.shape[1], *v.shape[2:]).to(device) for k, v in input_encoding.items()}
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            embedding = model(**input_encoding)[1]
            loss = calc_loss(embedding)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        train_loss.append(loss.item())
        log_str = "loss={}".format(np.mean(train_loss))
        pbar.set_postfix_str(log_str)


def train_s2():
    model.train()
    train_loss = deque([], maxlen=100)
    pbar = tqdm(dataloader_train_s2, position=0, leave=True)
    pbar.set_description("train_s2 epoch {}".format(epoch))
    for input_encoding in pbar:
        input_encoding = {k: v.reshape(v.shape[0] * v.shape[1], *v.shape[2:]).to(device) for k, v in input_encoding.items()}
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            embedding = model(**input_encoding)[1]
            loss = calc_loss_s2(embedding)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        train_loss.append(loss.item())
        log_str = "loss={}".format(np.mean(train_loss))
        pbar.set_postfix_str(log_str)


if __name__ == '__main__':
    dataset_train = TrainDataset()
    dataloader_train = DataLoader(dataset=dataset_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    dataset_train_s2 = TrainS2Dataset()
    dataloader_train_s2 = DataLoader(dataset=dataset_train_s2, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    dataset_test = TestDataset()
    dataloader_test = DataLoader(dataset=dataset_test, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

    model = BertModel.from_pretrained(pretrained_model_path)
    model.to(device)

    scaler = torch.cuda.amp.GradScaler()

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(100):
        # test
        test()

        # train()

        train()

        # train
        # if epoch % 2 == 1:
        #     train()
        # else:
        #     train_s2()

        torch.save(model.state_dict(), f"./model_{epoch}.pth")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=689.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=19.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=268961.0, style=ProgressStyle(descripti…


load TrainDataset size: 153402
load TrainS2Dataset size: 238766
load TestDataset size: 1361


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411578458.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
test epoch 0: 100%|██████████| 22/22 [00:05<00:00,  4.00it/s]


spearmanr: SpearmanrResult(correlation=0.6187023976848138, pvalue=1.3841318483174585e-144)


train epoch 0:   4%|▎         | 88/2397 [01:34<41:17,  1.07s/it, loss=2.804548255421899]

KeyboardInterrupt: ignored

In [None]:
!cp model_9.pth /content/drive/MyDrive/dataset