# Multi-label text classification with HuggingFace Transformers

This notebook demonstrates how to use the HuggingFace
`transformers` library to perform multi-label text
classification.

## The toxicity dataset

The dataset we'll use is one that Kaggle featured for a
[Toxic Comment Classification Challenge](https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/overview). The data are comments from Wikipedia's talk page
edits, where each comment is labeled for different types of
toxicity, including:

* threats
* obscenity
* insults
* identity-based hate

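This dataset is a *multi-label* dataset: each comment carries six
independent binary labels (`toxic`, `severe_toxic`, `obscene`, `threat`,
`insult`, `identity_hate`), so a single comment can exhibit several
types of toxicity at once, or none at all.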

## Libraries used

We'll train our multi-label classification model using HuggingFace
transformers with PyTorch as our deep learning framework.

For preprocessing data we'll use Pandas.

In [1]:
import numpy as np
import pandas as pd
import torch

from torch.utils.data import DataLoader, Dataset

from transformers import (
    AutoModel,
    AutoTokenizer,
)

## Preprocessing the data

In [2]:
# Read data/train.csv into a DataFrame; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv('data/train.csv')
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# Index the frame by comment id so that only the text and the label flags
# remain as regular columns.
# The guard keeps this cell safe to re-run: once 'id' has become the index it
# is no longer a column, and calling set_index('id') again raises a KeyError.
if df.index.name != 'id':
    df = df.set_index('id')
df.head()

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Every column after `comment_text` is a 0/1 label flag. Collapse those flags
# into one Python list per comment so each row carries its full target vector.
# Converting the whole block at once avoids calling a Python lambda per row,
# which is slow on a frame of this size; the result is the same Series of
# lists, aligned on the same index.
label_columns = df.columns[1:]
label = pd.Series(df[label_columns].to_numpy().tolist(), index=df.index)
datadf = pd.DataFrame(data={
    'comment_text': df.comment_text,
    'label': label,
})
datadf.head()

Unnamed: 0_level_0,comment_text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


In [5]:
# Draw 80% of the comments for training. A fixed random_state makes the split
# identical on every run, so results can be reproduced after a kernel restart.
train_dataset = datadf.sample(frac=0.8, random_state=42)

# The rows that were not sampled form the held-out set. Both frames get a
# fresh 0..n-1 index because the dataset class looks rows up by position.
test_dataset = datadf.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

In [6]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one comment per lookup.

    Args:
        dataframe: Frame with a ``comment_text`` column of strings and a
            ``label`` column holding one list of numeric flags per row.
            Rows are addressed by position, not by index label.
        tokenizer: Callable that takes a string plus the keyword arguments
            used in ``__getitem__`` and returns a mapping containing
            ``input_ids`` and ``attention_mask``.
        max_length: Length every tokenized comment is padded or truncated
            to. Defaults to 512, the value that was previously hard-coded.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.dataframe = dataframe
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Tokenize the comment at position ``index``.

        Returns:
            dict with ``ids`` and ``attention_mask`` as long tensors and
            ``targets`` as a float tensor built from the row's label list.
        """
        comment_text = self.dataframe.comment_text.iloc[index]
        # Collapse newlines, tabs and repeated spaces into single spaces.
        comment_text = ' '.join(comment_text.split())

        targets = self.dataframe.label.iloc[index]

        # Pad/truncate to a fixed length so the default collate function can
        # stack the samples of a batch into one tensor.
        inputs = self.tokenizer(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(
                inputs['attention_mask'], dtype=torch.long
            ),
            # Float targets, as required by BCE-style losses.
            'targets': torch.tensor(targets, dtype=torch.float),
        }

In [7]:
# Load the tokenizer that belongs to the 'distilbert-base-uncased' checkpoint,
# the same checkpoint name the model class below loads its encoder from.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
    use_fast=True,
)

In [8]:
# Wrap both splits in the dataset class; they share the same tokenizer, so
# training and held-out comments are encoded identically.
train_set = CustomDataset(train_dataset, tokenizer)
test_set = CustomDataset(test_dataset, tokenizer)

In [9]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_set,
    batch_size=8,
    shuffle=True,
    num_workers=0,
)
# The held-out set is read in a fixed order: shuffling gives no benefit for
# evaluation, and a stable order keeps predictions aligned with the rows of
# `test_dataset`.
test_loader = DataLoader(
    test_set,
    batch_size=8,
    shuffle=False,
    num_workers=0,
)

## Creating the classification model

In [10]:
class DistilBERTClass(torch.nn.Module):
    """Pretrained transformer encoder followed by a linear multi-label head.

    Args:
        model_name: Checkpoint name or path handed to
            ``AutoModel.from_pretrained``.
        num_labels: Number of logits the head produces (one per label).
        dropout: Dropout probability applied before the linear head.

    The defaults reproduce the previously hard-coded configuration
    (``distilbert-base-uncased``, 6 labels, dropout 0.3).
    """

    def __init__(self, model_name='distilbert-base-uncased', num_labels=6,
                 dropout=0.3):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(model_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Size the head from the encoder's own config instead of assuming
        # 768, so a different checkpoint width does not break the model.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, attention_mask):
        """Return one raw logit per label for each sequence in the batch.

        No sigmoid is applied here; the logits are meant for a loss that
        works on logits directly.
        """
        # Element 0 of the encoder output is indexed as (batch, position, ...).
        hidden_states = self.l1(ids, attention_mask=attention_mask)[0]
        # Keep only the first position of every sequence as its summary.
        first_token_state = hidden_states[:, 0]
        return self.l3(self.l2(first_token_state))


# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Module.to() moves the parameters and returns the module itself, so the
# instance can be created and placed on the device in one statement.
model = DistilBERTClass().to(device)
model

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featu

## Training the model

In [11]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy computed directly on raw logits.

    Args:
        outputs: Tensor of logits (no sigmoid applied).
        targets: Float tensor of the same shape holding the target values.

    Returns:
        Scalar tensor: the loss averaged over every element.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(
        outputs, targets
    )

In [12]:
# Adam over every parameter of `model`, so the pretrained encoder is
# fine-tuned together with the new linear head, using one learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [13]:
def train(epoch, log_every=5000):
    """Run one pass over ``train_loader``, updating ``model`` in place.

    Relies on the notebook-level ``model``, ``train_loader``, ``device``,
    ``optimizer`` and ``loss_fn``.

    Args:
        epoch: Epoch number; only used to label the progress output.
        log_every: Print the current batch loss every this many batches,
            starting with batch 0.
    """
    model.train()
    for idx, data in enumerate(train_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)
        if idx % log_every == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear the gradients of the previous step once, right before the
        # backward pass; a second call per step would be redundant.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [14]:
# Fine-tune for five passes over the training data; train() prints the batch
# loss periodically, labelled with the epoch number passed in here.
for epoch in range(5):
    train(epoch)

Epoch: 0, Loss:  0.6866742968559265
