In [1]:
import polars as pl
import pandas as pd

In [2]:
# Load the training CSV, discard rows with missing values, and display the frame.
data_train = pl.read_csv('data/train.csv').drop_nulls()
data_train

sentence,label
str,str
"""刚开始是牙炎痛后面又口腔溃疡""","""牙齿炎症治疗"""
"""我想问一下整个过程大概要多久""","""无主题"""
"""就是我想质询智齿""","""智齿"""
"""请问免疫抑制剂对口腔溃疡恢复有什么作用呢？""","""口腔溃疡"""
"""补牙是要么收费的""","""补牙"""
…,…
"""有点儿敏感""","""牙齿脱敏"""
"""没有呢？今天才痛的""","""牙齿疼痛"""
"""以前最早是补了? 后来医生建议拔掉? 因为最后一颗牙 没什么…","""拔牙"""
"""下牙，从里面数出来第2颗，右边""","""无主题"""


# Data processing

In [3]:
def data_process(filepath, item2id=None):
    """Load a labelled CSV and encode its ``label`` column as integer ids.

    Rows containing nulls are dropped. When ``item2id`` is omitted, ids are
    assigned by descending label frequency in this file (0 = most frequent);
    ties are broken by label text so the mapping is the same on every run.
    Pass an existing mapping to encode another split with identical ids.

    Args:
        filepath: Path of a CSV file that has a ``label`` column.
        item2id: Optional ``{label: id}`` mapping to reuse instead of
            deriving one from this file.

    Returns:
        Tuple ``(data, item2id, id2item)``: the frame with ``label`` cast to
        Int64 ids, the label -> id mapping, and its inverse.

    Raises:
        ValueError: If ``item2id`` is given and the file contains a label
            that the mapping does not cover.
    """
    data = pl.read_csv(filepath).drop_nulls()

    if item2id is None:
        # Sorting on the count alone leaves equally frequent labels in an
        # unspecified order; the secondary key makes the ids reproducible.
        ranked = data['label'].value_counts().sort(
            ['count', 'label'], descending=[True, False]
        )['label']
        item2id = {label: i for i, label in enumerate(ranked)}
    else:
        unknown = set(data['label'].unique().to_list()) - set(item2id)
        if unknown:
            raise ValueError(f'labels missing from item2id: {sorted(unknown)}')
        # Copy so the caller's mapping is never shared with the returned one.
        item2id = dict(item2id)

    id2item = {i: label for label, i in item2id.items()}
    data = data.with_columns(data["label"].replace(item2id).cast(pl.Int64).alias("label"))
    return data, item2id, id2item

In [4]:
# Exploratory view of the label distribution and the frequency-ranked id mapping.
data = pl.read_csv('data/train.csv').drop_nulls()

# Rank once and reuse: the table printed below is then exactly the ranking the
# mapping was built from. The secondary sort key keeps equally frequent labels
# in a fixed order (sorting on the count alone leaves ties unspecified).
label_counts = data['label'].value_counts().sort(['count', 'label'], descending=[True, False])

item2id = {label: i for i, label in enumerate(label_counts['label'])}
id2item = {i: label for label, i in item2id.items()}
print(label_counts)

shape: (24, 2)
┌──────────┬───────┐
│ label    ┆ count │
│ ---      ┆ ---   │
│ str      ┆ u32   │
╞══════════╪═══════╡
│ 无主题   ┆ 15257 │
│ 拔牙     ┆ 3806  │
│ 补牙     ┆ 3644  │
│ 牙齿矫正 ┆ 3380  │
│ 牙齿疼痛 ┆ 3304  │
│ …        ┆ …     │
│ 牙齿脱敏 ┆ 624   │
│ 牙齿美白 ┆ 575   │
│ 窝沟封闭 ┆ 111   │
│ 美牙冠   ┆ 90    │
│ 涂氟防龋 ┆ 63    │
└──────────┴───────┘


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification


class MyDataset(Dataset):
    """Map-style dataset of ``(sentence, label_id)`` pairs read from a CSV file.

    By default the label -> id mapping is derived from the file itself by
    ``data_process``. Pass ``item2id`` to encode the file with an existing
    mapping instead, so that several splits share the same label ids.
    """

    def __init__(self, filepath, item2id=None):
        """Load ``filepath`` and optionally re-encode its labels with ``item2id``.

        Raises:
            ValueError: If ``item2id`` is given and the file contains a label
                that the mapping does not cover.
        """
        super().__init__()
        self.data, self.item2id, self.id2item = data_process(filepath)
        if item2id is not None:
            self._reencode(item2id)

    def _reencode(self, item2id):
        """Translate the file-specific label ids into the ids of ``item2id``."""
        missing = [label for label in self.item2id if label not in item2id]
        if missing:
            raise ValueError(f'labels missing from item2id: {missing}')
        # file-specific id -> label -> shared id
        remap = {own_id: item2id[label] for own_id, label in self.id2item.items()}
        self.data = self.data.with_columns(
            self.data["label"].replace(remap).cast(pl.Int64).alias("label")
        )
        self.item2id = dict(item2id)
        self.id2item = {i: label for label, i in self.item2id.items()}

    def __getitem__(self, index):
        """Return the first two column values of row ``index`` as a tuple."""
        row = self.data.row(index)
        return row[0], row[1]

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.data)


train_data = MyDataset('data/train.csv')
# Encode the test split with the training label ids; a mapping derived from
# test.csv alone would rank labels by test frequencies and give different ids.
test_data = MyDataset('data/test.csv', item2id=train_data.item2id)

tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')


def collate_fn(batch, max_length=512):
    """Tokenize a list of ``(text, label)`` samples into one padded batch.

    Args:
        batch: Sequence of samples; element 0 of each is the text and
            element 1 is the integer label.
        max_length: Longest tokenized sequence to keep. Longer inputs are
            truncated; the default matches the 512 position embeddings of
            the BERT checkpoint used in this notebook.

    Returns:
        The tokenizer output (PyTorch tensors, padded to the longest
        sequence in the batch) with an extra ``labels`` tensor.
    """
    texts = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    # Without truncation a single over-long sentence would exceed the
    # model's position embeddings and fail in the forward pass.
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    inputs['labels'] = torch.tensor(labels)
    return inputs


trainloader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collate_fn)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Inspect the first sample: a (sentence, integer label id) pair.
train_data[0]

('刚开始是牙炎痛后面又口腔溃疡', 12)

In [7]:
# Pull the first (step index, collated batch) pair to inspect what collate_fn builds.
next(enumerate(trainloader))

(0,
 {'input_ids': tensor([[ 101, 2769, 2682,  ...,    0,    0,    0],
         [ 101, 1486, 6418,  ...,    0,    0,    0],
         [ 101,  872,  812,  ...,    0,    0,    0],
         ...,
         [ 101, 4385, 1762,  ...,    0,    0,    0],
         [ 101, 3175,  912,  ...,    0,    0,    0],
         [ 101,  671, 5663,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([ 3,  8,  7, 16, 13,  0, 11,  0,  0,  6,  3,  3,  0,  0,  8, 18,  5, 11,
         12,  7, 11,  7, 16,  6,  1,  0,  0,  9, 14, 11,  0,  3, 16,  6,

In [8]:
from torch.optim import Adam

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Pretrained BERT encoder with a classification head that has one output per training label.
model = AutoModelForSequenceClassification.from_pretrained(
    'google-bert/bert-base-chinese',
    num_labels=len(train_data.item2id),
)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [9]:
# Adam over all model parameters with a learning rate of 1e-5.
optimizer = Adam(model.parameters(), lr = 1e-5)

In [10]:
# Smoke test: push one collated batch through the model on `device`.
# The batch carries 'labels', so the output includes a loss as well as logits.
model(**next(enumerate(trainloader))[1].to(device))

SequenceClassifierOutput(loss=tensor(3.6318, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.6133, -0.1696, -0.5780,  ...,  0.4765,  0.4000,  0.3436],
        [-0.6161, -0.4710, -0.9286,  ...,  0.4833,  0.1897,  0.1197],
        [-0.8817, -0.3529, -0.5881,  ...,  0.5664, -0.1477,  0.0103],
        ...,
        [-0.4782, -0.1858, -0.6568,  ...,  0.5218,  0.5450,  0.2358],
        [-1.2511, -0.6045, -1.1756,  ...,  0.6348, -0.0875, -0.2760],
        [-0.3957, -0.0640, -0.6537,  ...,  0.4307,  0.4840,  0.5615]],
       device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [12]:
def train(epoch=1, log_step=100):
    """Fine-tune the notebook-level ``model`` on ``trainloader``.

    Args:
        epoch: Number of full passes over ``trainloader``.
        log_step: Print the current loss whenever the global step counter
            is a multiple of this value (step 0 included).
    """

    def run_batch(batch):
        # One optimisation step; the model output is returned so the caller can log its loss.
        batch.to(device)
        optimizer.zero_grad()
        output = model(**batch)
        output.loss.backward()
        optimizer.step()
        return output

    global_step = 0
    for ep in range(epoch):
        model.train()
        for batch in trainloader:
            output = run_batch(batch)
            if not global_step % log_step:
                print(f'epoch: {ep}, global_step: {global_step}, loss: {output.loss.item()}')
            global_step += 1


train()

epoch: 0, global_step: 0, loss: 3.720447063446045
epoch: 0, global_step: 100, loss: 1.6409595012664795


KeyboardInterrupt: 