In [45]:
from abc import ABC

import os
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader

In [46]:
# CSV files read by the notebook: the training file supplies the 'text' and
# 'BIO_anno' columns, the test file supplies only 'text'.
train_fullname = './data/train_data_public.csv'
test_fullname = './data/test_public.csv'
# Device string that the model and every batch tensor are moved to.
device='cpu'

In [47]:
class AnnoDataSet(Dataset, ABC):
    """Map-style dataset pairing raw texts with optional BIO annotation strings.

    Args:
        texts: One raw text per sample.
        labels: One annotation string per sample, or ``None`` for an
            unlabelled split (e.g. the test set).

    Raises:
        ValueError: If ``labels`` is given but its length differs from ``texts``.
    """

    def __init__(self, texts: list[str], labels: list[str] = None):
        # Zero-argument super() walks the full MRO; super(Dataset, self)
        # started the lookup *after* Dataset and therefore skipped it.
        super().__init__()
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f'texts and labels differ in length: {len(texts)} != {len(labels)}'
            )
        self.texts = texts
        self.labels = labels

    def __getitem__(self, item):
        """Return ``(text, label)``; ``label`` is ``None`` for an unlabelled dataset."""
        text = self.texts[item]
        label = self.labels[item] if self.labels is not None else None
        return text, label

    def __len__(self):
        """Return the number of samples.

        The size is taken from ``texts`` because ``labels`` is ``None`` for an
        unlabelled split, where ``len(self.labels)`` would raise ``TypeError``.
        """
        return len(self.texts)

In [48]:
def get_dataset() -> (AnnoDataSet, AnnoDataSet, AnnoDataSet):
    """Build the train / validation / test datasets from the configured CSV files.

    The training split is read from ``train_fullname`` (columns ``'text'`` and
    ``'BIO_anno'``); the test split is read from ``test_fullname`` (column
    ``'text'`` only) and carries no labels. No validation split is created.

    Returns:
        A ``(train_set, valid_set, test_set)`` triple in which ``valid_set`` is
        always ``None``.
    """
    def _stripped(frame, column):
        # Column values as a list with surrounding whitespace removed.
        return [value.strip() for value in frame[column].to_list()]

    train_frame = pd.read_csv(train_fullname)
    train_part = AnnoDataSet(
        _stripped(train_frame, 'text'),
        _stripped(train_frame, 'BIO_anno'),
    )

    test_frame = pd.read_csv(test_fullname)
    test_part = AnnoDataSet(_stripped(test_frame, 'text'), None)

    return train_part, None, test_part

In [49]:
# Encode labels: vocabulary that maps annotation tag strings to integer ids and back
class SimpleVocab:
    """Bidirectional mapping between annotation tags and integer ids.

    The vocabulary is built from the ``'BIO_anno'`` column of the training CSV.
    The tag ``'O'`` (meaning "no annotation") is always registered first and
    therefore has id 0; every other tag receives the next free id in order of
    first appearance.
    """

    def __init__(self):
        anno_lines = pd.read_csv(train_fullname)['BIO_anno'].to_list()

        # 'O' means none and is always id 0.
        self.token_array = ['O']
        self.token_dict = {'O': 0}

        for anno_line in anno_lines:
            for tag in anno_line.split():
                if tag not in self.token_dict:
                    self.token_dict[tag] = len(self.token_array)
                    self.token_array.append(tag)

    def __call__(self, tokens):
        """Convert a tag, or an arbitrarily nested list/tuple of tags, to ids."""
        if isinstance(tokens, (list, tuple)):
            return [self(inner) for inner in tokens]
        assert isinstance(tokens, str)
        return self.token_dict[tokens]

    def __len__(self):
        """Number of distinct tags, including 'O'."""
        return len(self.token_array)

    def to_tokens(self, ids):
        """Convert an id, or an arbitrarily nested list/tuple of ids, back to tags."""
        if isinstance(ids, (list, tuple)):
            return [self.to_tokens(inner) for inner in ids]
        assert isinstance(ids, int)
        return self.token_array[ids]

    def get_none_token(self):
        """Return the tag used for "no annotation"."""
        return 'O'

    def get_none_id(self):
        """Return the id of the "no annotation" tag."""
        return self('O')

In [50]:
def collect(sample:list[tuple[str, str]], tokenizer, label_vocab) -> dict[str, torch.Tensor]:
    """Collate ``(text, label)`` samples into one padded, tokenized batch.

    Args:
        sample: Batch of ``(text, label_line)`` pairs. ``label_line`` is a
            whitespace-separated tag string, or ``None`` for unlabelled data
            (only the first sample is inspected to decide this).
        tokenizer: Callable invoked as
            ``tokenizer(tokens, padding=True, is_split_into_words=True,
            return_tensors='pt')``; its result must support item access and
            assignment and expose a 2-D ``'input_ids'`` tensor.
        label_vocab: Vocabulary providing ``get_none_token()`` and a
            ``__call__`` that maps nested lists of tags to nested lists of ids.

    Returns:
        The tokenizer output. For labelled batches it additionally holds a
        ``'labels'`` tensor of dtype ``torch.long`` with the same
        ``(batch, max_len)`` shape as ``'input_ids'``.
    """
    texts, labels = zip(*sample)
    # Every character of a text is handed to the tokenizer as its own "word".
    tokens = [list(text) for text in texts]
    tokenized = tokenizer(tokens, padding=True, is_split_into_words=True, return_tensors='pt')

    if labels[0] is None:
        return tokenized

    # Align the tag sequences with the padded token sequences.
    none_anno = label_vocab.get_none_token()
    max_len = tokenized['input_ids'].shape[-1]
    aligned = []
    for label_line in labels:
        # One leading and one trailing "none" tag for the two extra positions
        # the tokenizer adds around each sequence.
        tags = [none_anno] + label_line.split() + [none_anno]
        # Tokens can be fewer than tags when the tokenizer drops characters
        # such as spaces; the overflow is cut off on the assumption that it
        # only holds "none" tags.
        # NOTE(review): if a dropped character sits in the middle of a text,
        # the tags after it are shifted by one — confirm against the tokenizer.
        tags = tags[:max_len]
        tags += [none_anno] * (max_len - len(tags))
        aligned.append(tags)

    # Class-index targets for nn.CrossEntropyLoss must be int64 (torch.long);
    # int32 targets are rejected by the loss.
    tokenized['labels'] = torch.tensor(label_vocab(aligned), dtype=torch.long)
    return tokenized


In [51]:
# Name of the pretrained checkpoint; used for both the tokenizer here and the
# model created later in the notebook.
model_name = 'bert-base-chinese'
# Load the matching tokenizer through torch.hub (the captured output shows the
# hub repository being served from a local cache).
bert_tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', model_name)

# get_dataset() returns a (train, valid, test) triple; its middle element is
# None, so valid_dataset holds no data.
train_dataset, valid_dataset, test_dataset = get_dataset()

Using cache found in C:\Users\Justi/.cache\torch\hub\huggingface_pytorch-transformers_main


In [52]:
# Number of samples per training batch.
batch_size = 128
# Tag <-> id vocabulary, built from the training CSV.
label_vocab = SimpleVocab()
# Training loader: each batch of (text, label) pairs is tokenized and
# label-aligned by collect(); num_workers=0 loads data in the main process.
train_iter = DataLoader(
    train_dataset,
    batch_size,
    collate_fn=lambda x: collect(x, bert_tokenizer, label_vocab),
    num_workers=0
)

# Start training!

In [53]:
import numpy as np

def compute_acc(eval_pred):
    """Compute the fraction of predictions that match the reference labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is array-like with
            the class scores on the last axis; ``labels`` is array-like with
            the shape of ``logits`` minus that last axis. Both sequence-level
            (1-D) and token-level (N-D) labels are supported, in which case
            every element counts as one prediction.

    Returns:
        Accuracy as a Python ``float`` in ``[0, 1]``; ``0.0`` when there are
        no labels.

    Raises:
        ValueError: If the predicted class ids and ``labels`` differ in shape.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)
    if predictions.shape != labels.shape:
        raise ValueError(
            f'predictions {predictions.shape} and labels {labels.shape} differ in shape'
        )
    # An empty evaluation set would otherwise divide by zero.
    if labels.size == 0:
        return 0.0
    # Element-wise comparison also works for multi-dimensional labels, where
    # `if predictions[i] == labels[i]` raises "truth value is ambiguous".
    return float(np.mean(predictions == labels))

In [60]:
from torch import nn
class BertClassification(nn.Module):
    """Pretrained BERT encoder followed by a per-token linear classification head.

    Args:
        num_classes: Number of output classes per token.
        dropout: Dropout probability applied to the encoder output before the
            linear layer.
    """

    def __init__(self, num_classes, dropout=0):
        super().__init__()
        self.bert = torch.hub.load('huggingface/pytorch-transformers', 'model', model_name)
        self.dropout = nn.Dropout(dropout)
        # Read the encoder width from the loaded model's config instead of
        # hard-coding 768, so other checkpoints work without edits.
        hidden_size = self.bert.config.hidden_size
        self.Linear = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Return per-token class logits.

        Args:
            input_ids: Token id tensor passed to the encoder.
            token_type_ids: Optional segment id tensor passed to the encoder.
            attention_mask: Optional padding mask passed to the encoder.

        Returns:
            The linear head applied to the encoder's ``'last_hidden_state'``,
            i.e. one ``num_classes``-sized logit vector per token position.
        """
        out = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )['last_hidden_state']
        return self.Linear(self.dropout(out))


In [62]:
from transformers import AdamW, get_scheduler

# Build the token classifier (one output class per vocabulary tag) and move it
# to the configured device.
base_net = BertClassification(len(label_vocab))
base_net.to(device)
num_epochs = 10
lr = 1e-6
num_training_steps = num_epochs * len(train_iter)

# Pass the configured learning rate explicitly; it was previously defined but
# never handed to the optimizer, which then silently used its default rate.
# NOTE(review): `transformers.AdamW` is reported as deprecated in favour of
# `torch.optim.AdamW` in recent transformers releases — confirm against the
# installed version.
optimizer = AdamW(base_net.parameters(), lr=lr)
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,  # the scheduler drives the learning rate of this optimizer
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

loss = nn.CrossEntropyLoss()

base_net.train()
for epoch in range(num_epochs):
    for batch in train_iter:
        # Every tensor of the batch must live on the same device as the model.
        batch = {k:v.to(device) for k,v in batch.items()}
        output = base_net(
            input_ids=batch['input_ids'],
            token_type_ids=batch['token_type_ids'],
            attention_mask=batch['attention_mask']
        )
        # CrossEntropyLoss needs int64 class indices. Padding positions
        # (attention_mask == 0) are set to ignore_index so they do not
        # contribute to the loss.
        targets = batch['labels'].long().masked_fill(
            batch['attention_mask'] == 0, loss.ignore_index
        )
        # Logits are (batch, seq, classes); the loss expects the class axis second.
        l = loss(output.permute(0,2,1), targets)
        l.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        print(f'loss:{l}')

Using cache found in C:\Users\Justi/.cache\torch\hub\huggingface_pytorch-transformers_main
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: [enforce fail at C:\cb\pytorch_1000000000000\work\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 48666624 bytes.

In [41]:
# Sanity check: fetch the first collated training batch and print the shape of
# every tensor it contains.
for k, v in next(iter(train_iter)).items():
    print(f'{k}:{v.shape}')

input_ids:torch.Size([128, 89])
token_type_ids:torch.Size([128, 89])
attention_mask:torch.Size([128, 89])
labels:torch.Size([128, 89])


In [39]:
# Print the torch.hub help text for the 'modelForSequenceClassification' entry point.
print(torch.hub.help('huggingface/pytorch-transformers', 'modelForSequenceClassification'))


    This is a generic model class that will be instantiated as one of the model classes of the library (with a sequence classification head) when created
    with the [`~AutoModelForSequenceClassification.from_pretrained`] class method or the [`~AutoModelForSequenceClassification.from_config`] class
    method.

    This class cannot be instantiated directly using `__init__()` (throws an error).

            # Using torch.hub !
            import torch

            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from huggingface.co and cache.
            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attentions=True)  # Update configuratio

Using cache found in C:\Users\Justi/.cache\torch\hub\huggingface_pytorch-transformers_main
