In [1]:
from transformers import BertTokenizerFast
import numpy as np
import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer files are read from a local checkpoint directory next to the project.
tokenizer = BertTokenizerFast.from_pretrained('../bert-base-chinese')

# Training hyperparameters used by the cells below.
learning_rate = 5e-5
batch_size = 80
n_epochs = 10


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from collections import namedtuple
def text2token(txt):
  """Tokenize ``txt`` with the module-level ``tokenizer`` into fixed-length tensors.

  The text is padded/truncated to ``max_length=64`` and returned as PyTorch
  tensors (``return_tensors='pt'``). Every field of the tokenizer output is
  then replaced by its element at index 0.

  Returns:
    The tokenizer's output object, with each field reduced to ``field[0]``.
  """
  encoded = tokenizer(
    txt,
    padding="max_length",
    truncation=True,
    max_length=64,
    return_tensors='pt',
  )
  # Presumably index 0 strips the leading batch axis added by return_tensors='pt'
  # for a single input string — TODO confirm for list inputs.
  # Snapshot the keys first so the mapping is not iterated while being written.
  for field in list(encoded.keys()):
    encoded[field] = encoded[field][0]
  return encoded
# One training sample: ``token`` holds the tokenizer output, ``label`` the class index.
LabeledData = namedtuple("LabeledData", "token label")

In [3]:
import label_loader

def load_raw_to_data(path : str,label_path:str = 'labels.tsv',sep='\t'):
  """Read a delimited UTF-8 text file into a list of ``LabeledData`` samples.

  Every non-blank line is stripped and split on ``sep``. Field 0 is looked up
  in the index returned by ``label_loader.load_label_index(label_path)`` and
  field 1 is tokenized with ``text2token``. Fields after the second are ignored.

  Args:
    path: File to read, one sample per line.
    label_path: File handed to ``label_loader.load_label_index``.
    sep: Field separator used to split each line.

  Returns:
    A list of ``LabeledData(token, label)``. ``label`` is ``-1`` when field 0
    is not present in the label index.

  Raises:
    IndexError: If a non-blank line has fewer than two fields.
  """
  label_idx = label_loader.load_label_index(label_path)
  ret = []
  with open(path,encoding='utf-8') as f:
    for line_no, raw_line in enumerate(f, start=1):
      line = raw_line.strip()
      if not line:
        # Blank lines (e.g. a trailing newline at end of file) carry no sample.
        continue
      grp = line.split(sep)
      if len(grp) < 2:
        # Same exception type as the bare grp[1] lookup, but with a location.
        raise IndexError(
          f"{path}:{line_no}: expected at least 2 fields separated by {sep!r}, got {len(grp)}"
        )
      # NOTE(review): unknown labels become -1 and are passed on unchanged;
      # confirm downstream consumers filter them or that they cannot occur.
      ret.append(LabeledData(text2token(grp[1]),label_idx.get(grp[0],-1)))
  return ret
# Tokenized, labelled samples; labels are resolved via load_raw_to_data's default label file.
t_lihkg = load_raw_to_data(path='out_with_labels.tsv')
# The label index is also kept at module level for later cells (e.g. to count the classes).
label_idx = label_loader.load_label_index('labels.tsv')

In [4]:
from torch.utils.data import DataLoader
import torch
# Copy the samples into a fresh list and serve them in shuffled mini-batches.
train_dataloader = DataLoader(
    list(t_lihkg),
    batch_size=batch_size,
    shuffle=True,
)
#eval_dataloader = DataLoader([*t_lihkg[:1000]], batch_size=batch_size)

In [5]:
from transformers import AutoModelForSequenceClassification

# Load the local checkpoint with a classification head that has one output per
# entry in the label index, and place it on the selected device. Ending the cell
# with an assignment keeps the notebook from echoing the model repr.
model = AutoModelForSequenceClassification.from_pretrained(
    "../bert-base-chinese",
    num_labels=len(label_idx),
).to(device)

Some weights of the model checkpoint at ../bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../bert-base-chinese and

In [6]:
from torch import nn
from torch.optim import AdamW
from transformers import get_scheduler
import torch

# Total number of scheduler steps: one per batch, for every epoch.
num_training_steps = n_epochs * len(train_dataloader)

optimizer = AdamW(model.parameters(), lr=learning_rate)

# Linear schedule with no warm-up phase, spanning the whole training run.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [7]:
from tqdm import tqdm

# Switch the model to training mode before optimising.
model.train()
for epoch in range(n_epochs):
    loop = tqdm(train_dataloader, leave=True)
    for batch in loop:
        # Move this batch's inputs and labels onto the training device.
        token = batch.token
        input_ids = token['input_ids'].to(device)
        attention_mask = token['attention_mask'].to(device)
        label = batch.label.to(device)

        # Passing labels makes the model return the loss alongside its outputs.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=label,
        )
        loss = outputs.loss

        # Backpropagate, apply the update, advance the LR schedule, then clear
        # the gradients so the next batch starts from zero.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Progress bar: current epoch and the loss of the batch just processed.
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        torch.cuda.empty_cache()


Epoch 0: 100%|██████████| 2334/2334 [49:25<00:00,  1.27s/it, loss=1.2]  
Epoch 1: 100%|██████████| 2334/2334 [49:22<00:00,  1.27s/it, loss=1.09] 
Epoch 2: 100%|██████████| 2334/2334 [49:05<00:00,  1.26s/it, loss=1.13] 
Epoch 3: 100%|██████████| 2334/2334 [49:19<00:00,  1.27s/it, loss=0.78] 
Epoch 4: 100%|██████████| 2334/2334 [50:15<00:00,  1.29s/it, loss=0.649]
Epoch 5: 100%|██████████| 2334/2334 [49:04<00:00,  1.26s/it, loss=0.216]
Epoch 6: 100%|██████████| 2334/2334 [49:11<00:00,  1.26s/it, loss=0.279] 
Epoch 7: 100%|██████████| 2334/2334 [49:09<00:00,  1.26s/it, loss=0.323] 
Epoch 8: 100%|██████████| 2334/2334 [49:04<00:00,  1.26s/it, loss=0.0686]
Epoch 9: 100%|██████████| 2334/2334 [49:10<00:00,  1.26s/it, loss=0.0188]


In [8]:
# Persist the fine-tuned model into a directory relative to the working directory.
model.save_pretrained(save_directory="channel-classifier-man")

In [None]:
from tqdm import tqdm
# Switch the model to evaluation mode before measuring the loss.
model.eval()
torch.cuda.empty_cache()
# NOTE(review): this iterates train_dataloader, so the values collected below are
# training-set losses, not a held-out evaluation.
eval_loop = tqdm(train_dataloader, leave=True)
loss_list = []
# No backward pass happens here, so disable gradient tracking: it avoids building
# and holding an autograd graph for every batch. With nothing retained between
# batches, the per-batch torch.cuda.empty_cache() call is no longer needed.
with torch.no_grad():
    for batch in eval_loop:
        input_ids = batch.token['input_ids'].to(device)
        attention_mask = batch.token['attention_mask'].to(device)
        label = batch.label.to(device)
        outputs = model(labels=label,input_ids=input_ids,attention_mask=attention_mask)

        loss = outputs.loss
        # Read the scalar once and reuse it for both the record and the progress bar.
        loss_value = loss.item()
        loss_list.append(loss_value)

        eval_loop.set_postfix(loss=loss_value)
loss_list