In [1]:
# install the package for transformers 

!pip install transformers 

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 7.0MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 54.4MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 51.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=05677

In [2]:
# create a folder in which we save our trained model
!mkdir trained

In [3]:
# setting the model name and parameters

MODEL_PARAMS = {'MODEL_NAME': 'distilbert-base-uncased', 
                'N_EPOCHS': 3, 
                'BATCH_SIZE': 32, 
                'MAX_LENGTH': 128, 
                'RDN_NUM' : 123, 
                'LEARNING_RATE' : 2e-5, 
                'NUM_LABELS' : 6}

In [4]:
# building tokenizer and model
from transformers import AdamW, BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained(MODEL_PARAMS['MODEL_NAME'], do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(MODEL_PARAMS['MODEL_NAME'], 
                                                      num_labels=MODEL_PARAMS['NUM_LABELS'])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertForSequenceClassification: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.

In [5]:
# send our model to the GPU device
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [6]:
# computing max sequence length to have an overview of the dataset when tokenized by our models

import pandas

df = pandas.read_csv("train_small.csv", encoding="ISO-8859-1")
df["len_Tokenized"] = df["comment_text"].apply(tokenizer.tokenize).apply(len)
computed_max_sequence_length = df["len_Tokenized"].max()

print("The computed max sequence length is: ", computed_max_sequence_length)

# remove added column
df.drop('len_Tokenized',
        axis='columns', 
        inplace=True)

The computed max sequence length is:  2500


In [7]:
# preparing data for the training stage

from sklearn.model_selection import train_test_split
 

label_names = list(df.columns[2:])
labels = df[label_names].values
comments = df.comment_text.values

# tokenization plus padding
encodings = tokenizer.batch_encode_plus(comments, 
                                        max_length=MODEL_PARAMS['MAX_LENGTH'], 
                                        truncation=True, 
                                        padding=True)

train_inputs, val_inputs, \
train_labels, val_labels, \
train_masks, val_masks = train_test_split(encodings['input_ids'], 
                                          labels, 
                                          encodings['attention_mask'],
                                          random_state=MODEL_PARAMS['RDN_NUM'], 
                                          test_size=0.2)

In [8]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# preparing train data
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
train_data = TensorDataset(train_inputs, train_masks, train_labels)  # Creates a TensorDataset from a vector of tensors.
train_sampler = RandomSampler(train_data)  # A Sampler that returns random indices.

# preparing validation data
val_inputs = torch.tensor(val_inputs)
val_labels = torch.tensor(val_labels)
val_masks = torch.tensor(val_masks)
val_data = TensorDataset(val_inputs, val_masks, val_labels)  # Creates a TensorDataset from a vector of tensors.
val_sampler = SequentialSampler(val_data)  # A Sampler that returns indices sequentially.

# a Python iterable over a dataset
train_dataloader = DataLoader(train_data, 
                              sampler=train_sampler, 
                              batch_size=MODEL_PARAMS['BATCH_SIZE'])
val_dataloader = DataLoader(val_data, 
                            sampler=val_sampler, 
                            batch_size=MODEL_PARAMS['BATCH_SIZE'])

In [9]:
# setting the optimization parameters

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']

optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
      'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, 
                  lr=MODEL_PARAMS['LEARNING_RATE'], 
                  correct_bias=True)


In [10]:
"""
https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html

This loss combines a Sigmoid layer and the BCELoss (Binary Cross Entropy Loss) in one single class. 
"""

from torch.nn import BCEWithLogitsLoss
loss_func = BCEWithLogitsLoss()

In [13]:
from sklearn.metrics import f1_score

def evaluate(val_dataloader):
  
  model.eval()  #  turn off some specific parts of the model (belong to training and inference phases)

  true_labels, pred_labels = list(), list()
  for i, batch in enumerate(val_dataloader):
      batch = tuple(t.to(DEVICE) for t in batch)
      input_ids, input_mask, labels = batch

      with torch.no_grad():  # Disabling gradient calculation.
          outs = model(input_ids, token_type_ids=None, attention_mask=input_mask)
          pred_label = torch.round(torch.sigmoid(outs[0]))

      true_labels.append(labels.to('cpu').numpy())
      pred_labels.append(pred_label.to('cpu').numpy())

  pred_labels = [pl for pt in pred_labels for pl in pt]  # flatten prediction tensors
  true_labels = [tl for tt in true_labels for tl in tt]  # flatten true tensors
  
  print('Macro F1-score: ', f1_score(true_labels, pred_labels, average='macro'))

  return true_labels, pred_labels

In [14]:
from tqdm import trange
from torch import cuda

DEVICE = 'cuda' if cuda.is_available() else 'cpu'

train_loss_set = []
for _ in trange(MODEL_PARAMS['N_EPOCHS'], desc="Epoch"):
  model.train()

  # Train step
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  for step, batch in enumerate(train_dataloader):
    batch = tuple(t.to(DEVICE) for t in batch)
    input_ids, input_mask, labels = batch

    optimizer.zero_grad()

    # Forward pass for multilabel classification
    outputs = model(input_ids, 
                    token_type_ids=None, 
                    attention_mask=input_mask)
    logits = outputs[0]
    loss = loss_func(
                     # a manual rescaling weight given to the loss of each batch element (a Tensor of size nbatch)
                     logits.view(-1, MODEL_PARAMS['NUM_LABELS']), 
                     # a weight of positive examples (a vector with length equal to the number of classes)
                     labels.type_as(logits).view(-1, MODEL_PARAMS['NUM_LABELS']))
    train_loss_set.append(loss.item())

    loss.backward() # computes dloss/dx for every x which has requires_grad=True.
    optimizer.step() # updating parameters while using GPU

    tr_loss += loss.item()
    nb_tr_examples += input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  # Evaluation step
  evaluate(val_dataloader)

print("Here you go! Your model is trained."
torch.save(model.state_dict(), 'trained/model')


Epoch:   0%|          | 0/3 [00:00<?, ?it/s][A

Train loss: 0.14399939549380336



Epoch:  33%|███▎      | 1/3 [03:31<07:03, 211.87s/it][A

Macro F1-score:  0.09665391654410775
Train loss: 0.11130191380286525



Epoch:  67%|██████▋   | 2/3 [07:04<03:32, 212.15s/it][A

Macro F1-score:  0.3148833791742907
Train loss: 0.07060434877679779



Epoch: 100%|██████████| 3/3 [10:37<00:00, 212.45s/it]

Macro F1-score:  0.32486132995909534





In [15]:
# load the model, input comment and obtain the classes
# This cell is stand-alone

import torch
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

characteristics = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

MODEL_PARAMS = {'MODEL_NAME': 'distilbert-base-uncased', 
                'MAX_LENGTH': 128, 
                'NUM_LABELS' : 6}

new_comment = input("Please make a comment on your life!")  # prompt

with torch.no_grad():  # Disabling gradient calculation.
  load_model = torch.load('trained/model')
  trained_tokenizer = BertTokenizer.from_pretrained(MODEL_PARAMS['MODEL_NAME'], 
                                                    state_dict = load_model,
                                                    do_lower_case=True)
  trained_model = BertForSequenceClassification.from_pretrained(MODEL_PARAMS['MODEL_NAME'], 
                                                state_dict = load_model,
                                                num_labels=MODEL_PARAMS['NUM_LABELS'])
  comment_tokenized = trained_tokenizer.encode_plus(new_comment, 
                                                    max_length=MODEL_PARAMS['MAX_LENGTH'],
                                                    truncation=True,
                                                    return_tensors='pt')
  output = trained_model(comment_tokenized['input_ids'], comment_tokenized['attention_mask'])
  pred_label = torch.sigmoid(output[0])  # passing logits to the sigmoid function
  round_values = [round(i) for i in pred_label.tolist()[0]]
  print("Your comment is classified as %s" % list(zip(characteristics, round_values)))

Please make a comment on your life!My language is full of slangs and fucks
Your comment is classified as [('toxic', 1), ('severe_toxic', 0), ('obscene', 1), ('threat', 0), ('insult', 1), ('identity_hate', 0)]
