# Fine-Tune BERT for Multiclass Text Classification
Author: Nelson LIN (nelsonlin0321@outlook.com)

In [307]:
# import libraries
import torch
from torch import cuda
from torch.utils.data import Dataset,DataLoader

In [308]:
import os
import json
import random
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn import metrics
from collections import Counter

In [309]:
# !pip install transformers

In [310]:
from transformers import AutoTokenizer,DistilBertModel,default_data_collator

In [311]:
device = "cuda" if cuda.is_available() else "cpu"

In [312]:
device

'cpu'

In [313]:
model_name =  "distilbert-base-uncased"

## 1) Import Data



In [314]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [315]:
# download from https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
data_path = "/content/drive/My Drive/Colab Notebooks/Data/NewsAggregatorDataset/newsCorpora.csv"

In [316]:
sample_size = 1000

In [317]:
# Raw column headers for the headerless TSV file. Each name still carries its
# surrounding padding here; the padding is stripped in the following cell.
columns = [
    " ID ",
    " TITLE ",
    " URL ",
    " PUBLISHER ",
    " CATEGORY ",
    " STORY ",
    " HOSTNAME ",
    " TIMESTAMP",
]

In [318]:
columns = [col.strip() for col in columns]

In [319]:
df = pd.read_csv(data_path,sep = "\t",names =columns)

In [320]:
# Shuffle with a fixed seed so the sampled subset (and every result derived
# from it) is the same on each Restart & Run All, then keep the first
# `sample_size` rows.
df = shuffle(df,random_state = 12)
df = df.head(sample_size)

In [321]:
# Map each single-letter CATEGORY code to an integer class id.
# NOTE(review): the codes presumably stand for e=entertainment, b=business,
# t=science/technology, m=health per the dataset description — confirm.
category_dict = dict(e=0, b=1, t=2, m=3)

In [322]:
df['label'] = df['CATEGORY'].map(category_dict)

In [323]:
df.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP,label
70258,70334,Michael Lewis: Wall Street Is Cheating The Sys...,http://www.inquisitr.com/1194169/michael-lewis...,The Inquisitr,b,dI0sNpL3ozAVfgMyCWX_cuVlOgAJM,www.inquisitr.com,1396283154708,1
10451,10452,Download VLC Media Player Free for Windows 8,http://societyandreligion.com/download-vlc-med...,Society and Religion,t,d40S-xOIhKkqVaMhQMBy9BXYz83IM,societyandreligion.com,1394716110597,2
147450,147786,Danny Boyle to direct Steve Jobs movie?,http://www.digitalspy.com/movies/news/a565816/...,Digital Spy,e,drNw5N1I4wpYFxM084jf-nbrZq0cM,www.digitalspy.com,1398167148959,0
320031,320491,Extra Scoop: Rob Kardashian Has Reportedly Not...,http://www.extratv.com/2014/06/25/extra-scoop-...,Extra,e,d1JyGQyGuuZXKZM9TVaiAuCteYRWM,www.extratv.com,1403798566434,0
361842,362302,Analysis: SeaWorld's mammals live longer,http://tbo.com/ap/analysis-seaworlds-mammals-l...,Tbo.com,t,dXJaauGEc1KdzFMPGiRIGTrQ9OK8M,tbo.com,1404646274525,2


## 2) Tokenization and Feature Engineering

In [324]:
max_length = 128

In [325]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [326]:
title = df['TITLE'].iloc[5]

In [327]:
inputs = tokenizer(
    text = title,
    text_pair = None,
    add_special_tokens = True,
    max_length = max_length,
    padding = "max_length",
    return_token_type_ids = False,
    truncation = True,
    )

In [328]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [329]:
class CustomizedDataset(Dataset):
    """Map-style dataset that tokenizes news titles on access.

    Wraps a DataFrame that has a ``TITLE`` text column and a ``label`` column.
    Each lookup returns the tokenizer output and the label as tensors, both
    already moved to ``device``.
    """

    def __init__(self, dataframe, tokenizer, max_length, device):
        """Keep references to the frame, tokenizer, padded length and device."""
        self.df = dataframe
        self.len = len(dataframe)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.device = device

    def __len__(self):
        """Return the row count captured when the dataset was built."""
        return self.len

    def __getitem__(self, index):
        """Tokenize the title(s) at positional ``index``.

        Returns:
            tuple: ``(inputs, label)`` where ``inputs`` is a dict of tensors
            produced by the tokenizer and ``label`` is a tensor built from the
            ``label`` column; everything is on ``self.device``.
        """
        titles = self.df['TITLE'].iloc[index]
        targets = self.df['label'].iloc[index]

        # Anything that is not a single string (e.g. a slice selecting several
        # rows) is turned into plain lists before tokenizing.
        if not isinstance(titles, str):
            titles, targets = titles.to_list(), targets.to_list()

        tokenizer_kwargs = dict(
            text=titles,
            text_pair=None,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            return_token_type_ids=False,
            truncation=True,
            # The tokenizer output is requested as PyTorch tensors; the batch
            # shape printed later in the notebook ([6, 1, 128]) shows each item
            # keeps a leading axis of size 1.
            return_tensors="pt",
        )
        encoded = self.tokenizer(**tokenizer_kwargs)

        on_device = {}
        for key, tensor in encoded.items():
            on_device[key] = tensor.to(self.device)

        target_tensor = torch.tensor(targets).to(self.device)
        return on_device, target_tensor

In [330]:
train_size = 0.7
train_df = df.sample(frac= train_size,random_state = 12)
test_df = df.drop(train_df.index)

In [331]:
len(train_df),len(test_df)

(700, 300)

In [332]:
batch_size = 6

In [333]:
train_dataset = CustomizedDataset(train_df,tokenizer,max_length,device)
train_loader = DataLoader(train_dataset,batch_size)

In [334]:
test_dataset = CustomizedDataset(test_df,tokenizer,max_length,device)
test_loader = DataLoader(test_dataset,batch_size,drop_last = True)

## 3) Define Model

In [335]:
class DistilBERTClass(torch.nn.Module):
    """DistilBERT encoder with a two-layer classification head.

    The head reads the hidden state of the first token, passes it through a
    square linear layer and dropout, and projects it to ``config.num_class``
    logits.

    Args:
        config: object exposing ``model_name`` (checkpoint to load) and
            ``num_class`` (number of output classes).
    """

    def __init__(self,config):
        super().__init__()
        self.bert_layer = DistilBertModel.from_pretrained(config.model_name)
        # Size the head from the loaded encoder instead of hard-coding 768, so
        # checkpoints with a different hidden width also work.
        hidden_size = self.bert_layer.config.dim
        self.pre_classifier = torch.nn.Linear(hidden_size,hidden_size)
        self.dropout = torch.nn.Dropout(0.3)
        # Attribute name `classifer` (sic) is kept so previously saved models
        # and state dicts still load.
        self.classifer = torch.nn.Linear(hidden_size,config.num_class)

    def forward(self,inputs):
        """Return class logits for a batch.

        Args:
            inputs: dict of keyword arguments forwarded to the encoder (the
                notebook passes ``input_ids`` and ``attention_mask``).

        Returns:
            Tensor of logits with one column per class.
        """
        outputs = self.bert_layer(**inputs)
        last_hidden_state = outputs.last_hidden_state
        # Use the hidden state at sequence position 0 as the pooled
        # representation of the whole input.
        pooler = last_hidden_state[:,0]
        # NOTE(review): there is no non-linearity between the two linear
        # layers; consider adding one. Left unchanged to preserve outputs.
        pooler = self.pre_classifier(pooler)
        pooler = self.dropout(pooler)
        outputs = self.classifer(pooler)

        return outputs


In [336]:
class config:
    """Plain namespace with the settings the model constructor reads."""
    # Name of the pretrained checkpoint to load.
    model_name = model_name
    # One output class per entry in the category mapping.
    num_class = len(category_dict)

In [337]:
model = DistilBERTClass(config)
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [338]:
# model(sample_inputs)

In [339]:
learning_rate = 1e-05

In [340]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params = model.parameters(),lr = learning_rate )

## 4) Fine Tune Model

In [341]:
"""test"""
for data,labels in test_loader:
    break

In [342]:
data['attention_mask'].shape

torch.Size([6, 1, 128])

In [343]:
"""test"""
total_loss = 0
for data,labels in test_loader:
    input_ids = torch.squeeze(data['input_ids'])
    attention_mask = torch.squeeze(data['attention_mask'])
    inputs = {'input_ids':input_ids,'attention_mask':attention_mask}
    outputs = model(inputs)
    loss = loss_function(outputs,labels)
    total_loss += loss.item()
    break

In [344]:
outputs

tensor([[ 0.1641, -0.2112, -0.0841,  0.3998],
        [ 0.2728, -0.1407, -0.1243,  0.1284],
        [ 0.0379, -0.2294, -0.0113,  0.3652],
        [ 0.0944, -0.2889, -0.0310,  0.2839],
        [ 0.1485, -0.1570, -0.2184,  0.4245],
        [ 0.0174, -0.2914, -0.1358,  0.3840]], grad_fn=<AddmmBackward0>)

In [345]:
def evaluate():
    """Score the global ``model`` on every batch of ``test_loader``.

    Switches the model to eval mode and leaves it there; callers that keep
    training afterwards must call ``model.train()`` again.

    Returns:
        tuple: ``(mean_accuracy, mean_loss)``, each the unweighted mean of the
        per-batch values.
    """
    model.eval()
    batch_accuracies = []
    batch_losses = []
    for data,labels in test_loader:
        # Remove only the size-1 axis at dim 1. A bare squeeze() would also
        # drop the batch axis whenever a batch holds a single sample.
        input_ids = torch.squeeze(data['input_ids'],1)
        attention_mask = torch.squeeze(data['attention_mask'],1)
        inputs = {'input_ids':input_ids,'attention_mask':attention_mask}
        with torch.no_grad():
            outputs = model(inputs)
            loss = loss_function(outputs,labels)
        # Store a Python float rather than the tensor: np.mean cannot convert
        # tensors that live on a CUDA device.
        batch_losses.append(loss.item())

        true = labels.data.cpu()
        pred = torch.max(outputs,1)[1].cpu()

        batch_accuracies.append(metrics.accuracy_score(true,pred))

    return np.mean(batch_accuracies),np.mean(batch_losses)

In [346]:
epoches = 5

In [350]:
len(train_loader)

117

In [351]:
def train():
    """Fine-tune the global ``model`` for ``epoches`` passes over ``train_loader``.

    Uses the global ``optimizer`` and ``loss_function``. After every 100th
    batch it prints the current batch's training loss and accuracy together
    with the test loss and accuracy returned by ``evaluate()``.
    """
    total_batch = 0
    model.train()
    for epoch in range(epoches):
        print("Epoc {} / {}".format(epoch+1,epoches))

        for data,labels in train_loader:
            # Remove only the size-1 axis at dim 1. A bare squeeze() would
            # also drop the batch axis whenever a batch holds a single sample.
            input_ids = torch.squeeze(data['input_ids'],1)
            attention_mask = torch.squeeze(data['attention_mask'],1)
            inputs = {'input_ids':input_ids,'attention_mask':attention_mask}
            outputs = model(inputs)

            optimizer.zero_grad()
            loss = loss_function(outputs,labels)
            loss.backward()
            optimizer.step()

            if (total_batch+1) % 100 ==0:
                true = labels.data.cpu()
                pred = torch.max(outputs,1)[1].cpu()
                train_acc = metrics.accuracy_score(true,pred)
                test_acc,test_loss = evaluate()
                # Field indices follow the argument order passed to format():
                # 0=batch, 1=train loss, 2=train acc, 3=test acc, 4=test loss.
                log = "Total Batch: {0} ,Train Loss {1} ,Train Accuracy {2} ,Test Loss {4} ,Test Accuracy {3}"
                print(log.format(total_batch,loss.item(),train_acc,test_acc,test_loss))
                # evaluate() leaves the model in eval mode; switch back.
                model.train()
            total_batch +=1

In [None]:
train()

Epoc 1 / 5


## 5) Save Model

In [None]:
# Destination file on the mounted Drive; its parent folder must already exist.
model_save_path = "/content/drive/My Drive/Colab Notebooks/Models/newsCorpora/newsCorpora.bin"
# Serializes the whole module object (not just its state dict), so loading it
# later requires the DistilBERTClass definition to be importable.
torch.save(model,model_save_path)