<a href="https://colab.research.google.com/github/JieShenAI/torch/blob/main/huggingface/example/bbc_news_classification/bbc_news_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [None]:
# Mount Google Drive (Colab only). The dataset and checkpoint paths used by
# this notebook are relative "drive/..." paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory holding the BBC News CSV files (train / test / sample solution).
data_path = "drive/MyDrive/github/torch/example/data/"
# Directory where the fine-tuned checkpoint is written.
model_path = "drive/MyDrive/model/"

## 导包

In [None]:
# BBC news article classification
# Dataset source: https://www.kaggle.com/competitions/learn-ai-bbc/
# Code reference: https://mp.weixin.qq.com/s/00on_zUFjAmnoSb_8j0QMw

import torch
from torch import nn
from transformers import BertModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
import pandas as pd
from torch.optim import Adam
from tqdm import tqdm
import time

In [None]:
# Category name -> class index; a name's position in the tuple is its index.
labels = {
    name: index
    for index, name in enumerate(
        ('business', 'entertainment', 'sport', 'tech', 'politics')
    )
}

# Class index -> category name (the inverse of `labels`).
label_name = dict(zip(labels.values(), labels.keys()))

In [None]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Central run configuration read by the dataset, training and prediction code.
config = dict(
    batch=2,                                        # mini-batch size
    epochs=4,                                       # default epoch count for train()
    device=device,
    bert_path='bert-base-cased',                    # pretrained checkpoint name
    train_path=data_path + 'BBC News Train.csv',
    test_path=data_path + 'BBC News Test.csv',
    model_path=model_path + 'bert_bbc_class.pth',   # where the fine-tuned weights are saved
)
device = config['device']

## 数据集

In [None]:
class SampleDataset(Dataset):
    """Pre-tokenized articles, optionally paired with integer class labels.

    Every text in ``df['Text']`` is tokenized once, in ``__init__``, and the
    results are kept in memory. With ``is_train=True`` the ``Category`` column
    is also mapped to class indices through the module-level ``labels`` dict.
    """

    # Cache for the lazily loaded default tokenizer, shared by all instances.
    _default_tokenizer = None

    def __init__(self, df,
                 is_train=True,
                 tokenizer=None,
                 max_length=512):
        """Tokenize ``df['Text']`` and, when training, encode ``df['Category']``.

        Args:
            df: Table with a ``Text`` column and, when ``is_train`` is true,
                a ``Category`` column whose values are keys of ``labels``.
            is_train: When false, no labels are read and items are returned
                without a label.
            tokenizer: Callable invoked as ``tokenizer(text, padding=...,
                max_length=..., truncation=..., return_tensors=...)``. When
                None, the ``'bert-base-cased'`` tokenizer is loaded on first
                use and reused afterwards.
            max_length: Length every text is padded/truncated to.
        """
        if tokenizer is None:
            # Loaded lazily: a tokenizer built in the default-argument
            # position would be created (and downloaded) as soon as the class
            # is defined, even if every caller supplies their own.
            if SampleDataset._default_tokenizer is None:
                SampleDataset._default_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
            tokenizer = SampleDataset._default_tokenizer

        self.is_train = is_train
        if is_train:
            self.labels = [labels[label] for label in df['Category']]
        self.texts = [tokenizer(text,
                                padding='max_length',
                                max_length=max_length,
                                truncation=True,
                                return_tensors="pt")
                      for text in df['Text']]

    def classes(self):
        """Return the list of class indices (labelled datasets only)."""
        assert self.is_train
        return self.labels

    def __len__(self):
        """Return the number of tokenized texts."""
        return len(self.texts)

    def get_batch_labels(self, idx):
        """Return the class index stored at position ``idx``."""
        assert self.is_train
        # Returned as a plain int rather than a tensor; the labels still
        # arrive as a tensor from the DataLoader, which batches them.
        return self.labels[idx]

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at position ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` when labelled, otherwise just ``tokens``."""
        if not self.is_train:
            return self.get_batch_texts(idx)
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y



def get_dataset(seed=42):
    """Load the training CSV and split it 80/10/10 into train/valid/test.

    Args:
        seed: Seed for the generator that drives the random split, so the
            three subsets (and any metric computed on them) are the same on
            every run.

    Returns:
        ``(trainset, validset, testset)``: three subsets of one
        ``SampleDataset`` built from ``config['train_path']``.
    """
    df = pd.read_csv(config['train_path'])
    dataset = SampleDataset(df)
    # A dedicated generator keeps the split reproducible without touching
    # torch's global RNG state. The fractional lengths need a torch release
    # whose random_split accepts fractions.
    generator = torch.Generator().manual_seed(seed)
    trainset, validset, testset = random_split(
        dataset, [0.8, 0.1, 0.1], generator=generator
    )
    return trainset, validset, testset

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

## 模型

In [None]:
# class BertClassifier(nn.Module):
#     def __init__(self, dropout=0.5):
#         super().__init__()
#         self.bert = BertModel.from_pretrained('bert-base-cased')
#         self.dropout = nn.Dropout(dropout)
#         self.linear1 = nn.Linear(768, 256)
#         self.linear2 = nn.Linear(256, 5)
#         self.relu = nn.ReLU()

#     # def forward(self, input_id, mask):
#     def forward(self, input):
#         output = self.bert(**input)
#         pooler_output = output.pooler_output
#         dropout_output = self.dropout(pooler_output)
#         linear_output = self.linear1(dropout_output)
#         layer = self.relu(linear_output)
#         return self.linear2(layer)


## 训练

In [None]:
# Tokenize the training CSV once and split it into train/valid/test subsets.
trainset, validset, testset = get_dataset()

In [None]:
# Only the training loader shuffles; the validation loader keeps a fixed order.
train_dataloader = DataLoader(trainset, batch_size=config['batch'], shuffle=True, pin_memory=True)
val_dataloader = DataLoader(validset, batch_size=config['batch'], shuffle=False, pin_memory=True)

In [None]:
# for feature, label in train_dataloader:
#   label = label.to(device)
#   # for k,v in feature.items():
#   #   print(k,v.shape)
#   input = {k: v.squeeze(1).to(device) for k, v in feature.items()}
#   v = model(**input,labels=label)
#   loss = v['loss']
#   logits = v['logits']
#   print(loss)
#   print(logits.shape)
#   break

In [None]:
# Number of batches per epoch in each loader.
len(train_dataloader), len(val_dataloader)

(596, 75)

In [None]:
def run_one_epoch(model, dataloader, is_train, optimizer=None, scheduler=None, batch=config['batch']):
    """Run one pass over ``dataloader`` and return ``(mean_loss, accuracy)``.

    Args:
        model: Called as ``model(**inputs, labels=label)``; the result must
            expose ``'loss'`` and ``'logits'``.
        dataloader: Yields ``(feature, label)`` pairs, where ``feature`` is a
            dict of tensors and ``label`` a tensor of class indices.
        is_train: True to update the weights; False to evaluate under
            ``torch.no_grad()`` with the model in eval mode.
        optimizer: Required when ``is_train`` is true.
        scheduler: Optional; when given it is stepped once per batch, right
            after the optimizer.
        batch: Unused. Kept so existing calls keep working; metrics are
            normalized by the number of samples actually seen instead of
            ``len(dataloader) * batch``.

    Returns:
        Per-sample mean loss and accuracy over the epoch. Both are NaN when
        the dataloader yields nothing.

    Raises:
        ValueError: If ``is_train`` is true and ``optimizer`` is None.
    """
    if is_train and optimizer is None:
        raise ValueError("optimizer is required when is_train=True")

    def run():
        start = time.time()
        total_loss = 0.0
        total_correct = 0
        total_samples = 0
        for feature, label in tqdm(dataloader):
            label = label.to(device)
            if is_train:
                optimizer.zero_grad()
            # SampleDataset tokenizes one text at a time with
            # return_tensors="pt", so batched tensors presumably arrive as
            # (batch, 1, seq); squeeze(1) drops that middle axis.
            inputs = {k: v.squeeze(1).to(device) for k, v in feature.items()}
            output = model(**inputs, labels=label)
            loss = output['loss']
            logits = output['logits']
            if is_train:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
            batch_size = label.numel()
            # Assumes the model reports a batch-mean loss: weight it by the
            # real batch size so a short final batch is averaged correctly.
            total_loss += loss.item() * batch_size
            total_correct += (logits.argmax(dim=-1) == label.view(-1)).sum().item()
            total_samples += batch_size
        end = time.time()
        print(end - start)
        if total_samples == 0:
            return float('nan'), float('nan')
        return total_loss / total_samples, total_correct / total_samples

    if is_train:
        model.train()
        return run()

    model.eval()
    with torch.no_grad():
        return run()


def train(model, train_dataset,
          val_dataset,
          optimizer=None,
          scheduler=None,
          batch=config['batch'],
          epochs=config['epochs'],
          device=device):
    """Fine-tune ``model`` and keep the checkpoint with the lowest validation loss.

    Args:
        model: Model passed through to ``run_one_epoch``.
        train_dataset: Dataset used for the weight updates (shuffled).
        val_dataset: Dataset evaluated after every epoch (not shuffled).
        optimizer: Optimizer stepped once per training batch.
        scheduler: Optional learning-rate scheduler stepped once per batch.
        batch: Batch size for both loaders.
        epochs: Number of train + validate rounds.
        device: Unused; ``run_one_epoch`` moves batches to the module-level
            ``device``. Kept for signature compatibility.

    Side effects:
        Prints per-epoch metrics and writes ``model.state_dict()`` to
        ``config['model_path']`` whenever the validation loss improves.
    """
    # Build the loaders from the datasets that were passed in, rather than
    # reading loaders from global state.
    train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch, shuffle=False, pin_memory=True)

    min_loss = float('inf')
    for epoch_num in range(epochs):
        train_loss, train_acc = run_one_epoch(model, train_loader, True, optimizer, scheduler, batch)
        val_loss, val_acc = run_one_epoch(model, val_loader, False, batch=batch)
        print(f'Epoch: {epoch_num + 1}')
        print(f'Train loss: {train_loss:.4f}, Train acc: {train_acc:.4f}')
        print(f'Val loss: {val_loss:.4f}, Val acc: {val_acc:.4f}')

        # Select on validation loss only: mixing in the training loss lets an
        # epoch that merely fits the training data better overwrite a
        # checkpoint that validates better.
        if val_loss < min_loss:
            min_loss = val_loss
            print(epoch_num, "save model")
            torch.save(model.state_dict(), config['model_path'])


def predict(model, device=device, batch=config['batch']):
    """Predict a category name for every article in the test CSV.

    Loads the weights saved at ``config['model_path']`` into ``model``, runs
    it over ``config['test_path']`` in file order and maps each arg-max class
    index back to its name through ``label_name``.

    Args:
        model: Model whose output exposes ``'logits'``; its weights are
            overwritten by the saved checkpoint.
        device: Device the input tensors are moved to and the checkpoint is
            mapped onto.
        batch: Batch size for the test loader.

    Returns:
        List of category names, one per row of the test CSV.
    """
    # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
    model.load_state_dict(torch.load(config['model_path'], map_location=device))
    model.eval()
    test_dataset = SampleDataset(pd.read_csv(config['test_path']), is_train=False)
    test_dataloader = DataLoader(test_dataset, batch_size=batch, shuffle=False)

    predictions = []
    with torch.no_grad():
        for features in tqdm(test_dataloader):
            # Squeeze every tokenizer tensor the same way, so input_ids and
            # attention_mask keep matching shapes.
            inputs = {k: v.squeeze(1).to(device) for k, v in features.items()}
            # The class scores live under 'logits'; the model output object
            # itself has no argmax.
            logits = model(**inputs)['logits']
            predictions.extend(logits.argmax(dim=-1).tolist())
    return [label_name[key] for key in predictions]

In [None]:
# model = BertClassifier().to(device)

In [None]:
from transformers import BertForSequenceClassification

# Pretrained cased BERT with a classification head sized to the label set.
model = BertForSequenceClassification.from_pretrained(
    config['bert_path'],
    num_labels=len(labels),        # one output per category in `labels`
    output_attentions=False,       # do not return attention weights
    output_hidden_states=False,    # do not return all hidden states
)
# Follow the configured device instead of assuming CUDA, so the cell also runs
# on a CPU-only runtime. The trailing semicolon hides the long module repr.
model.to(device);

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# torch's AdamW replaces the deprecated transformers.AdamW. weight_decay=0.0
# is passed explicitly because torch's default is non-zero, whereas the
# transformers optimizer used here before defaulted to no weight decay.
optimizer = AdamW(model.parameters(),
                  lr=2e-5,
                  eps=1e-8,
                  weight_decay=0.0)

# The scheduler is stepped once per batch, so the schedule must span every
# batch of every configured epoch.
total_steps = config['epochs'] * len(train_dataloader)

# Linear decay of the learning rate to zero over total_steps, with no warmup.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)



In [None]:
# Fine-tune on the training split; train() writes a checkpoint to
# config['model_path'] whenever its selection score improves.
train(model, trainset, validset, optimizer, scheduler)

100%|██████████| 596/596 [02:15<00:00,  4.39it/s]


135.64366722106934


100%|██████████| 75/75 [00:05<00:00, 14.61it/s]


5.142120122909546
Epoch: 1
Train loss: 0.1932, Train acc: 0.8826
Val loss: 0.0747, Val acc: 0.9600
0 save model


100%|██████████| 596/596 [02:15<00:00,  4.40it/s]


135.4735209941864


100%|██████████| 75/75 [00:05<00:00, 14.55it/s]


5.161846160888672
Epoch: 2
Train loss: 0.0580, Train acc: 0.9815
Val loss: 0.1411, Val acc: 0.9400
1 save model


100%|██████████| 596/596 [02:14<00:00,  4.42it/s]


134.96852040290833


100%|██████████| 75/75 [00:05<00:00, 14.54it/s]


5.164912462234497
Epoch: 3
Train loss: 0.0269, Train acc: 0.9908
Val loss: 0.0005, Val acc: 0.9933
2 save model


100%|██████████| 596/596 [02:16<00:00,  4.38it/s]


136.0315146446228


100%|██████████| 75/75 [00:05<00:00, 14.58it/s]


5.156157493591309
Epoch: 4
Train loss: 0.0090, Train acc: 0.9958
Val loss: 0.0066, Val acc: 0.9867
3 save model


100%|██████████| 596/596 [02:17<00:00,  4.32it/s]


137.87937903404236


100%|██████████| 75/75 [00:05<00:00, 14.62it/s]


5.136518239974976
Epoch: 5
Train loss: 0.0033, Train acc: 0.9983
Val loss: 0.0066, Val acc: 0.9867
4 save model


  9%|▉         | 56/596 [00:12<02:03,  4.36it/s]


KeyboardInterrupt: ignored

In [None]:
# Show which device was selected.
device

device(type='cuda')

## test

In [None]:
# Check how the trained model performs on the held-out test split.
def predict_demo(model, dataset):
    """Return the fraction of samples in ``dataset`` that ``model`` labels correctly."""
    model.eval()
    loader = DataLoader(dataset, batch_size=2, shuffle=False, pin_memory=True)
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for feature, label in loader:
            label = label.to(device)
            inputs = {name: tensor.squeeze(1).to(device) for name, tensor in feature.items()}
            logits = model(**inputs)['logits']
            n_seen += label.size(0)
            # Predicted class is the highest-scoring logit of each row.
            predicted = logits.argmax(dim=-1)
            n_correct += (predicted == label).sum().item()
    return n_correct / n_seen

predict_demo(model,testset)

0.9865771812080537

从上面的验证结果看，第 3 个 epoch 的效果最好（Val loss 0.0005，Val acc 0.9933），因此最后在整个训练集上重新训练 3 个 epoch。

注意：在最后一次训练之前，必须重新初始化 model、optimizer 和 scheduler，否则会在已经训练过的权重和已经衰减的学习率上继续训练。

In [None]:
# Number of epochs for the final fit on the full training CSV.
best_epochs = 3

In [None]:
# Rebuild the dataset from the whole training CSV (no held-out split) and
# wrap it in a shuffling loader for the final fit.
df = pd.read_csv(config['train_path'])
all_dataset = SampleDataset(df)
finnal_dataloader = DataLoader(all_dataset, batch_size=config['batch'], shuffle=True, pin_memory=True)

In [None]:
def finnal_train(model, best_epochs, dataloader, optimizer, scheduler, batch=config['batch']):
    """Train for ``best_epochs`` epochs without validation, then save the weights.

    Each epoch is delegated to ``run_one_epoch`` in training mode, so the
    per-batch update (zero grad, backward, gradient clipping, optimizer and
    scheduler step) and the reported metrics are the same as in ``train``
    instead of living in a second copy of that loop. ``run_one_epoch`` also
    prints the elapsed seconds for each epoch.

    Args:
        model: Model to train.
        best_epochs: Number of passes over ``dataloader``.
        dataloader: Yields ``(feature, label)`` batches.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        batch: Forwarded to ``run_one_epoch``.

    Side effects:
        Prints per-epoch loss/accuracy and writes ``model.state_dict()`` to
        ``config['model_path']`` after the last epoch.
    """
    for epoch_num in range(best_epochs):
        train_loss, train_acc = run_one_epoch(model, dataloader, True, optimizer, scheduler, batch)

        print(f'Epoch: {epoch_num + 1}')
        print(f'Train loss: {train_loss:.4f}, Train acc: {train_acc:.4f}')

    torch.save(model.state_dict(), config['model_path'])

In [None]:
# The final fit must start from fresh pretrained weights with a fresh
# optimizer and a schedule sized for this run. Reusing the objects from the
# earlier train() call would continue from already fine-tuned weights, with a
# learning rate that the linear schedule has already decayed.
model = BertForSequenceClassification.from_pretrained(
    config['bert_path'],
    num_labels=len(labels),
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    # One scheduler step per batch over the whole final run.
    num_training_steps=best_epochs * len(finnal_dataloader),
)

finnal_train(model, best_epochs, finnal_dataloader, optimizer, scheduler)

100%|██████████| 745/745 [02:45<00:00,  4.49it/s]


165.90052604675293
Epoch: 1
Train loss: 0.1392, Train acc: 0.9262


100%|██████████| 745/745 [02:44<00:00,  4.52it/s]


164.73130750656128
Epoch: 2
Train loss: 0.0400, Train acc: 0.9866


100%|██████████| 745/745 [02:46<00:00,  4.48it/s]


166.40321612358093
Epoch: 3
Train loss: 0.0142, Train acc: 0.9953


In [None]:
def predict(model, device=device, batch=config['batch']):
    """Predict a category name for every article in the test CSV.

    Loads the weights saved at ``config['model_path']`` into ``model``, runs
    it over ``config['test_path']`` in file order and maps each arg-max class
    index back to its name through ``label_name``.

    Args:
        model: Model whose output exposes ``'logits'``; its weights are
            overwritten by the saved checkpoint.
        device: Device the input tensors are moved to and the checkpoint is
            mapped onto.
        batch: Batch size for the test loader.

    Returns:
        List of category names, one per row of the test CSV.
    """
    # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
    model.load_state_dict(torch.load(config['model_path'], map_location=device))
    model.eval()
    test_dataset = SampleDataset(pd.read_csv(config['test_path']), is_train=False)
    # shuffle=False keeps predictions aligned with the rows of the test CSV.
    test_dataloader = DataLoader(test_dataset, batch_size=batch, shuffle=False)
    ans = []
    with torch.no_grad():
        for feature in test_dataloader:
            inputs = {k: v.squeeze(1).to(device) for k, v in feature.items()}
            output = model(**inputs)
            logits = output['logits']
            ans += logits.argmax(dim=-1).tolist()
    ans = [label_name[key] for key in ans]
    return ans

In [None]:
# Predict a category name for every article in the test CSV.
ans = predict(model)

In [None]:
# Preview the first few predictions instead of dumping the whole list.
ans[:10]

In [None]:
# Load the competition's sample solution to use as the submission template.
sample_csv = data_path + 'BBC News Sample Solution.csv'
sample_df = pd.read_csv(sample_csv)
sample_df.head()

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,business
3,459,entertainment
4,1020,politics


In [None]:
# Number of predictions; compare with the row count of the template below.
len(ans)

735

In [None]:
# (rows, columns) of the submission template.
sample_df.shape

(735, 4)

In [None]:
# Write the predictions into the Category column by name, so the assignment
# cannot land on the wrong column if the frame's column order differs.
# Assumes the template lists articles in the same order as the test CSV that
# produced `ans` -- TODO confirm.
assert len(ans) == len(sample_df), "prediction count must match the submission template"
sample_df['Category'] = ans
sample_df.head()

Unnamed: 0,ArticleId,Category,-1,1
0,1018,sport,sport,sport
1,1319,tech,tech,tech
2,1138,sport,sport,sport
3,459,business,business,business
4,1020,sport,sport,sport


In [None]:
# Sanity check: every Category value equals the corresponding prediction.
all(sample_df['Category'] == ans)

True

In [None]:
# Keep only the two submission columns, selected by name so that any extra
# column left in the frame cannot shift the selection.
sample_df = sample_df[['ArticleId', 'Category']]
sample_df.head()

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,sport
3,459,business
4,1020,sport


In [None]:
# Re-check the predictions after trimming the columns.
all(sample_df['Category'] == ans)

True

In [None]:
# Write the submission file without the index column.
sample_df.to_csv("submission.csv",index=False)