<a href="https://colab.research.google.com/github/Himagination/NLP_Transformers/blob/main/BERT_Fine_Tuning_Toxic_Comment_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pytorch-lightning --quiet
!pip install transformers --quiet
!pip install tf-estimator-nightly==2.8.0.dev2021122109
!pip install folium==0.2.1

Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[?25l[K     |▊                               | 10 kB 16.6 MB/s eta 0:00:01[K     |█▍                              | 20 kB 11.2 MB/s eta 0:00:01[K     |██▏                             | 30 kB 9.4 MB/s eta 0:00:01[K     |██▉                             | 40 kB 8.5 MB/s eta 0:00:01[K     |███▌                            | 51 kB 4.3 MB/s eta 0:00:01[K     |████▎                           | 61 kB 5.1 MB/s eta 0:00:01[K     |█████                           | 71 kB 5.5 MB/s eta 0:00:01[K     |█████▊                          | 81 kB 5.7 MB/s eta 0:00:01[K     |██████▍                         | 92 kB 6.3 MB/s eta 0:00:01[K     |███████                         | 102 kB 5.1 MB/s eta 0:00:01[K     |███████▉                        | 112 kB 5.1 MB/s eta 0:00:01[K     |████████▌                       | 122 kB 5.1 MB/s eta 0:00:01[K     |█

In [4]:
# Imports
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import transformers

import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

In [5]:
%matplotlib inline
RANDOM_SEED = 42
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
pl.seed_everything(RANDOM_SEED)

Global seed set to 42


42

## Data

In [6]:
# Mount Google Drive at /content/drive so the CSV read later in the notebook
# can be found. This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Read the training CSV from the mounted Drive and preview the first rows.
# The path is absolute and assumes the Drive mount at /content/drive.
df = pd.read_csv("/content/drive/MyDrive/train.csv")
df.head(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [9]:
# Hold out 5% of the rows for validation. Passing random_state explicitly makes
# the split reproducible regardless of how much of the global NumPy RNG state
# was consumed before this cell ran (e.g. after re-running cells out of order).
train_df, val_df = train_test_split(df, test_size=0.05, random_state=RANDOM_SEED)
train_df.shape, val_df.shape

((151592, 8), (7979, 8))

In [10]:
# Names of the DataFrame columns used as the classification targets; every
# label lookup in this notebook goes through this list.
LABEL_COLUMNS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [11]:
# Column-wise sum of the label columns in the training split.
train_df[LABEL_COLUMNS].sum()

toxic            14546
severe_toxic      1515
obscene           8028
threat             465
insult            7467
identity_hate     1334
dtype: int64

In [12]:
# Sampling to use less data
train_df = train_df.sample(100_000)
train_df[LABEL_COLUMNS].sum()

toxic            9625
severe_toxic      983
obscene          5258
threat            302
insult           4947
identity_hate     903
dtype: int64

In [13]:
# Pick one row by position from the full frame to use as a worked example.
sample_row = df.iloc[16]
sample_comment = sample_row["comment_text"]
sample_label = sample_row[LABEL_COLUMNS]

# Show the raw text followed by its label values as a plain dict.
print(f"Sample comment: {sample_comment}\n")
print(f"Sample label: {sample_label.to_dict()}")

Sample comment: Bye! 

Don't look, come or think of comming back! Tosser.

Sample label: {'toxic': 1, 'severe_toxic': 0, 'obscene': 0, 'threat': 0, 'insult': 0, 'identity_hate': 0}


In [14]:
# Checkpoint name used for the tokenizer here and for the BertModel loaded
# later in the notebook.
BERT_MODEL_NAME = "bert-base-cased"
# Loads the matching fast tokenizer; the captured output below shows files
# being downloaded when this cell ran.
tokenizer = transformers.BertTokenizerFast.from_pretrained(BERT_MODEL_NAME)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
# Encode the sample comment into fixed-length tensors.
# truncation=True is required alongside max_length: without it a comment
# longer than 512 tokens is NOT cut down (only a warning is emitted), and the
# result no longer has the fixed [1, 512] shape. This also matches the
# arguments ToxicCommentsDataset passes to the tokenizer.
encoding = tokenizer.encode_plus(
    sample_comment,
    add_special_tokens=True,
    max_length=512,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt"
)

In [17]:
# List the fields the tokenizer returned for the sample comment.
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [18]:
# Shapes of the two returned tensors for the sample comment.
encoding["input_ids"].shape, encoding["attention_mask"].shape

(torch.Size([1, 512]), torch.Size([1, 512]))

In [21]:
class ToxicCommentsDataset(Dataset):
  """Map-style dataset that tokenizes one comment per item.

  Each item is a dict with:
    comment_text:   the raw text read from the row's ``comment_text`` column
    input_ids:      1-D tensor of length ``max_token_len``
    attention_mask: 1-D tensor of length ``max_token_len``
    labels:         1-D float32 tensor with one entry per name in LABEL_COLUMNS

  Args:
    data: frame with a ``comment_text`` column and every column named in
      LABEL_COLUMNS.
    tokenizer: tokenizer exposing ``encode_plus``.
    max_token_len: length every encoded comment is padded/truncated to.
  """

  def __init__(self, data: pd.DataFrame, tokenizer: transformers.BertTokenizerFast,
               max_token_len: int = 128):
    self.data = data
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return len(self.data)

  def __getitem__(self, index: int):
    """Tokenize the row at positional ``index`` and return it with its labels."""
    data_row = self.data.iloc[index]
    comment_text = data_row.comment_text

    # A row taken from a mixed-type frame is an object-dtype Series indexed by
    # column name. Handing that Series straight to torch.FloatTensor relies on
    # positional Series[0] lookups on a string index, which pandas has
    # deprecated. Converting to a float32 ndarray first avoids that path and
    # yields the same float32 values.
    labels = data_row[LABEL_COLUMNS].to_numpy(dtype="float32")

    encoding = self.tokenizer.encode_plus(
        comment_text,
        add_special_tokens=True,
        max_length=self.max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )

    # return_tensors="pt" yields a leading batch axis of size 1; flatten it so
    # the DataLoader can stack items into (batch, max_token_len).
    return dict(
        comment_text=comment_text,
        input_ids=encoding["input_ids"].flatten(),
        attention_mask=encoding["attention_mask"].flatten(),
        labels=torch.tensor(labels)
    )

In [22]:
# Wrap the training frame with the default max_token_len and fetch the first
# item as a quick check of the dataset's output.
train_dataset = ToxicCommentsDataset(train_df, tokenizer)
sample_item = train_dataset[0]

In [23]:
# Fields present in a single dataset item.
sample_item.keys()

dict_keys(['comment_text', 'input_ids', 'attention_mask', 'labels'])

In [24]:
# Raw comment text carried alongside the encoded tensors.
sample_item["comment_text"]

'India related links \n\n • Talk • [ Reply]'

In [25]:
# Label tensor for the item, one value per entry in LABEL_COLUMNS.
sample_item["labels"]

tensor([0., 0., 0., 0., 0., 0.])

In [26]:
# Length of the encoded sequence; the dataset pads/truncates to max_token_len.
sample_item["input_ids"].shape

torch.Size([128])

In [29]:
# Load the pretrained BERT encoder for the same checkpoint as the tokenizer.
# return_dict=True asks the model to return its outputs as a structured object.
bert_model = transformers.BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
# Add a leading axis of size 1 so the single item looks like a batch.
sample_item["input_ids"].unsqueeze(dim=0).shape

torch.Size([1, 128])

In [31]:
# Forward pass on the single sample item; each tensor gets a leading batch
# axis of size 1 before being handed to the model.
prediction = bert_model(
    input_ids=sample_item["input_ids"].unsqueeze(dim=0),
    attention_mask=sample_item["attention_mask"].unsqueeze(dim=0),
)

In [32]:
class ToxicCommentDataModule(pl.LightningDataModule):
  """LightningDataModule wrapping the train and held-out comment frames.

  Args:
    train_df: frame used to build the training dataset.
    test_df: frame used to build the dataset served by both the validation
      and the test dataloaders.
    tokenizer: tokenizer forwarded to ToxicCommentsDataset.
    batch_size: batch size of the training dataloader.
    max_token_len: sequence length forwarded to ToxicCommentsDataset.
  """

  def __init__(self, train_df, test_df, tokenizer, batch_size=8, max_token_len=128):
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.max_token_len = max_token_len

  def setup(self, stage=None):
    """Build the datasets.

    Lightning invokes this hook with a ``stage`` argument, so the parameter
    must be accepted even though it is unused here; the ``None`` default keeps
    a manual ``data_module.setup()`` call working.
    """
    self.train_dataset = ToxicCommentsDataset(
        self.train_df,
        self.tokenizer,
        self.max_token_len
    )

    self.test_dataset = ToxicCommentsDataset(
        self.test_df,
        self.tokenizer,
        self.max_token_len
    )

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=4
    )

  def _eval_dataloader(self):
    """Unshuffled, one-item-per-batch loader over the held-out dataset."""
    return DataLoader(
        self.test_dataset,
        batch_size=1,
        num_workers=4
    )

  def val_dataloader(self):
    """Loader used for validation; serves the held-out dataset."""
    return self._eval_dataloader()

  def test_dataloader(self):
    """Loader used for testing; serves the same held-out dataset."""
    return self._eval_dataloader()

In [33]:
# Build the data module with default batch_size and max_token_len; val_df
# backs both the validation and the test dataloaders.
data_module = ToxicCommentDataModule(train_df, val_df, tokenizer)