<a href="https://colab.research.google.com/github/GuanRuLai/Python-Deep-Learning/blob/main/BERT(Multi_Label_Classification).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data preprocessing

## Load data

In [27]:
academy_titles = []
job_titles = []

with open("academy_titles.txt", "r", encoding= "utf-8") as f:
    for l in f:
      academy_titles.append(l.strip()) # remove spaces of head and tail

with open("job_titles.txt", "r", encoding= "utf-8") as f:
    for l in f:
      job_titles.append(l.strip()) # remove spaces of head and tail

print(academy_titles[:5])
print(job_titles[:5])

['北师教育学，你我一起努力，让胜利酣畅淋漓。', '考博英语词汇', '出售人大新闻学院2015年考研权威资料', '【脑科院 郭桃梅课题组】科研助理招聘', '管理学院的同学帮帮忙呐～']
['【字节跳动内推】校招岗位全面开放，帮查进度！', '招聘兼职/ 笔试考务 /200-300 每人', '国企出版社招聘坐班兼职生', '【在线早教】教研实习生招聘', '【兼职】心理学公众号寻兼职写手']


## Build dataset & Split independent variables and dependent variable

In [28]:
all_data = []

for title in academy_titles:
    all_data.append([title, 0])

for l in job_titles:
    all_data.append([title, 1])

print(all_data[:5])

[['北师教育学，你我一起努力，让胜利酣畅淋漓。', 0], ['考博英语词汇', 0], ['出售人大新闻学院2015年考研权威资料', 0], ['【脑科院 郭桃梅课题组】科研助理招聘', 0], ['管理学院的同学帮帮忙呐～', 0]]


## Split training set and testing set(preliminary)

In [29]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42)
print(len(train_data))
print(len(test_data))

5686
1422


# Import necessary library

In [30]:
import os
import time
import random
import torch
import torch.nn.functional as F
from torch import nn
from tqdm import tqdm

from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import BertTokenizer
from transformers import BertForSequenceClassification

# Neural network processing

## Check if there is GPU to use

In [31]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("There are %d GPU(s) available." % torch.cuda.device_count())
else:
    device = torch.device("cpu")
    print("No GPU available, using CPU instead.")

There are 1 GPU(s) available.


## Custom dataset for BERT tokenization

In [32]:
# load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

class MyDataset(torch.utils.data.Dataset):
  def __init__(self, examples):
    self.examples = examples

  def __len__(self):
    return len(self.examples)

  def __getitem__(self, index):
    example = self.examples[index]
    title = example[0]
    label = example[1]

    # convert title into a format suitable for input to a model
    r = tokenizer.encode_plus(
        title,
        max_length=128,
        padding="max_length"
    )

    return title, label, index

## Define collate function to handle variable-length sequences

In [39]:
def collate_fn(batch):
  r = tokenizer([b[0] for b in batch], padding=True) #  ensure that all sequences in the batch are padded to the length of the longest sequence in the batch
  input_ids = torch.LongTensor(r["input_ids"]) # token ids of each title
  attention_mask = torch.LongTensor(r["attention_mask"]) # identify the token is actual data(1) or padded value(0)
  label = torch.LongTensor([b[1] for b in batch])
  indices = [b[2] for b in batch]

  return input_ids, attention_mask, label, indices

## Set hyperparameters

In [40]:
max_train_epochs = 5
warmup_proportion = 0.05 # the proportion of learning rate growing from slow to fast
gradient_accumulation_steps = 4 # number of gradient steps need to accumulate before updating weights
batch_size = 8
data_workers = 2 # number of subprocesses to use for loading data

learning_rate = 2e-5
weight_decay = 0.01
max_grad_norm = 1
cur_time = time.strftime("%Y-%m-%d_%H-%M-%S")

## Split training set and testing set(after tokenization)

In [41]:
train_data = MyDataset(train_data)
test_data = MyDataset(test_data)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=data_workers, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=data_workers, collate_fn=collate_fn)

## Define model

In [42]:
model = BertForSequenceClassification.from_pretrained("bert-base-chinese")
model.to(device)

t_total = len(train_loader) // gradient_accumulation_steps * max_train_epochs + 1 # total number of training steps(+1 to ensure the last step not being ignored)
num_warmup_steps = int(t_total * warmup_proportion) # number of warmup steps
print("warmup steps : %d" % num_warmup_steps)

no_decay = ["bias", "LayerNorm.weight"] # specify parameters which aren't fit for weight decaying
param_optimizer = list(model.named_parameters()) # get all parameters
optimizer_grouped_parameters = [
    {"params":[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": weight_decay},
    {"params":[p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
]

# define optimizer
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, correct_bias=False)

# define learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


warmup steps : 44


## Model training & evaluation

In [45]:
for epoch in range(max_train_epochs):
  b_time = time.time() # start time
  model.train() # weights can be modified

  correct = 0
  total = 0

  for step, batch in enumerate(tqdm(train_loader)):
    input_ids, attention_mask, label = (b.to(device) for b in batch[:-1]) # ignore the last element "indices"
    optimizer.zero_grad() # return zero of every previous batch

    loss = model(input_ids, attention_mask=attention_mask, labels=label)[0]
    logits = model(input_ids, attention_mask=attention_mask, labels=label)[1]
    loss.backward() # calculate gradient(min loss weights)

    if (step + 1) % gradient_accumulation_steps == 0: # check if we achieve the accumulative steps
      optimizer.step() # update weights
      scheduler.step() # update learning rate

    _, predicted = torch.max(logits.data, 1) # get the index of max value in each row of axis 1
    total += label.size(0) # get the number of samples
    correct += (predicted == label.squeeze()).sum().item()


  accuracy = correct / total
  print(f"Epoch: {epoch} | Loss: {loss:.4f} | Accuracy: {accuracy:.4f} | Time: {(time.time() - b_time) / 60:.2f} min")

100%|██████████| 711/711 [00:43<00:00, 16.47it/s]


Epoch: 0 | Loss: 0.0088 | Accuracy: 0.9889 | Time: 0.72 min


100%|██████████| 711/711 [00:53<00:00, 13.38it/s]


Epoch: 1 | Loss: 0.0240 | Accuracy: 0.9889 | Time: 0.89 min


100%|██████████| 711/711 [00:54<00:00, 12.99it/s]


Epoch: 2 | Loss: 0.5767 | Accuracy: 0.9889 | Time: 0.91 min


100%|██████████| 711/711 [00:43<00:00, 16.44it/s]


Epoch: 3 | Loss: 0.0216 | Accuracy: 0.9889 | Time: 0.72 min


100%|██████████| 711/711 [00:43<00:00, 16.40it/s]

Epoch: 4 | Loss: 0.0141 | Accuracy: 0.9889 | Time: 0.72 min





## Test evaluation

In [49]:
model.eval() # weights cannot be modified(frozen)

y_true = []
y_pred = []

with torch.no_grad(): # close the gradient calculation mechanism

  correct = 0
  total = 0

  for step, batch in enumerate(tqdm(test_loader)):
    input_ids, attention_mask, label = (b.to(device) for b in batch[:-1])

    logits = model(input_ids, attention_mask=attention_mask, labels=label)[1]

    _, predicted = torch.max(logits.data, 1)
    total += label.size(0)
    correct += (predicted == label.squeeze()).sum().item()

    y_true.extend(label.squeeze().cpu().numpy())
    y_pred.extend(predicted.cpu().numpy())

  accuracy = correct / total
  print(f"Accuracy: {accuracy:.4f}")

100%|██████████| 178/178 [00:03<00:00, 45.09it/s]

Accuracy: 0.9887





## Answer prediction

In [50]:
import pandas as pd

results_df = pd.DataFrame({
    "Y_true": y_true,
    "Y_pred": y_pred
})
print(results_df)

      Y_true  Y_pred
0          1       1
1          1       1
2          1       1
3          1       1
4          1       1
...      ...     ...
1417       1       1
1418       0       0
1419       1       1
1420       0       0
1421       0       0

[1422 rows x 2 columns]
