# 🤗 distilBERT fine tuning for text classification

Fine tuning huggingface distilBERT for the disaster tweet classification kaggle competition. This notebook belows heavily borrows code from Chris McCormick and Nick Ryan: https://mccormickml.com/2019/07/22/BERT-fine-tuning/#41-bertforsequenceclassification

In [3]:
import pandas as pd
import numpy as np

from pathlib import Path

import re
import string

import random

import time
import datetime

In [4]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    AdamW,
    get_linear_schedule_with_warmup
)

In [5]:
if torch.cuda.is_available():        
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [6]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [11]:
from nltk.tokenize.casual import remove_handles

In [12]:
def tweet_preprocessor(text):
    """
    Applies NLTK tweet tokenizer and removes:
        - numbers
        - extra whitespaces
        
        - periods
        - semicolons
        - non-ascii characters
        - urls
        - strip hashtag
    

        Args:
            text (string): Raw text

        Returns:
            string: Filtered raw text
        """
    text = remove_handles(text)
    
    text = re.sub(r'\x89Û', '', text)
    text = re.sub(r'\x89ÛÒ', '', text)
    text = re.sub(r'\x89ÛÒ', '', text)
    text = re.sub(r'\x89ÛÏ', '', text)
    text = re.sub(r'\x89Ûª', '', text)
    text = re.sub(r'\x89Û_', '', text)
    text = re.sub(r'\x89ã¢', '', text)
    
    text = re.sub(r'\x9d', '', text)
    text = re.sub(r'\&amp;', 'and', text)
    text = re.sub(r'https?:\/\/t.co\/[A-Za-z0-9]+', '', text)
    text = re.sub(r'#(?=\w+)', '', text)
    text = re.sub(r'\?+', '?', text)
    text = re.sub(r'\s+', ' ', text)

    return text

## Import data
Assume that all data is contained within the working directory. Data consists of `train.csv` which includes labels and `test.csv` for submission.

In [13]:
df_train = pd.read_csv('train.csv', index_col='id')
df_test = pd.read_csv('test.csv', index_col='id')

In [14]:
text = df_train['text'].to_list()
labels = df_train['target'].values

In [15]:
text = [tweet_preprocessor(sent) for sent in text]

In [16]:
text, labels = shuffle(text, labels, random_state=42)

## Load BERT model from transformers and apply tokenization

In [17]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
    output_attentions = False,
    output_hidden_states = False
)
model.cuda()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (dropout): Dropout(p=0.1)
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p

In [18]:
text_tokenized = np.array(
    [tokenizer.encode(sent, add_special_tokens=True) for sent in text])

### Generate input tensors for training and validation

Pad vectors

In [19]:
text_tokenized.shape

(7613,)

In [20]:
labels.shape

(7613,)

In [21]:
max_len = 0
for sent in text_tokenized:
    if len(sent) > max_len:
        max_len = len(sent)
input_ids = np.array(
    [sent + [0]*(max_len-len(sent)) for sent in text_tokenized])
attention_masks = np.where(input_ids != 0, 1, 0)

In [22]:
input_ids.shape

(7613, 67)

In [23]:
labels.shape

(7613,)

In [24]:
attention_masks.shape

(7613, 67)

Generate train/test splits

In [25]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, labels, random_state=42, test_size=0.1)

Convert to torch tensors

In [26]:
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

Create `DataLoader` instances fro validation and train sets

In [27]:
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(
    validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data, sampler=validation_sampler, batch_size=batch_size)

## Train model

Use AdamW optimizer

In [28]:
optimizer = AdamW(
    model.parameters(),
    lr = 2e-5, 
    eps = 1e-8
)

In [29]:
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps)

In [30]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [32]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

loss_values = []

for epoch_i in range(0, epochs):
    
    print(f'======== Epoch {epoch_i + 1} / {epochs} ========')
    print('Training...')

    t0 = time.time()
    total_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            
            print(
                f'  Batch {step:>5,}  of  {len(train_dataloader):>5,}.    Elapsed: {elapsed}.')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)            
    
    loss_values.append(avg_train_loss)

    print(f'  Average training loss: {avg_train_loss:.2f}')
    print(f'  Training epoch took: {format_time(time.time() - t0)}')
        
    print('')
    print('Running Validation...')

    t0 = time.time()

    model.eval()

    eval_loss, eval_score = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in validation_dataloader:
        
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():        
            outputs = model(b_input_ids, attention_mask=b_input_mask)
        
        logits = outputs[0]

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        preds = np.argmax(logits, axis=1)
        score = f1_score(preds, label_ids)
        
        eval_score += score

        nb_eval_steps += 1

    print(f'  Average F1 score: {eval_score/nb_eval_steps:.2f}')
    print(f'  Validation took: {format_time(time.time() - t0)}')

Training...
  Batch    40  of    215.    Elapsed: 0:00:21.
  Batch    80  of    215.    Elapsed: 0:00:42.
  Batch   120  of    215.    Elapsed: 0:01:03.
  Batch   160  of    215.    Elapsed: 0:01:23.
  Batch   200  of    215.    Elapsed: 0:01:42.
  Average training loss: 0.44
  Training epcoh took: 0:01:49

Running Validation...
  Average F1 score: 0.80
  Validation took: 0:00:04
Training...
  Batch    40  of    215.    Elapsed: 0:00:20.
  Batch    80  of    215.    Elapsed: 0:00:40.
  Batch   120  of    215.    Elapsed: 0:01:01.
  Batch   160  of    215.    Elapsed: 0:01:23.
  Batch   200  of    215.    Elapsed: 0:01:42.
  Average training loss: 0.33
  Training epcoh took: 0:01:49

Running Validation...
  Average F1 score: 0.79
  Validation took: 0:00:04
Training...
  Batch    40  of    215.    Elapsed: 0:00:19.
  Batch    80  of    215.    Elapsed: 0:00:39.
  Batch   120  of    215.    Elapsed: 0:00:58.
  Batch   160  of    215.    Elapsed: 0:01:18.
  Batch   200  of    215.    Elaps

## Output results

In [33]:
text_test = df_test['text'].to_list()

In [34]:
text_test = [tweet_preprocessor(sent) for sent in text_test]

In [35]:
text_test_tokenized = np.array(
    [tokenizer.encode(sent, add_special_tokens=True) for sent in text_test])

In [36]:
max_len = 0
for sent in text_test_tokenized:
    if len(sent) > max_len:
        max_len = len(sent)
input_ids_test = np.array(
    [sent + [0]*(max_len-len(sent)) for sent in text_test_tokenized])
attention_masks_test = np.where(input_ids_test != 0, 1, 0)

In [37]:
test_inputs = torch.tensor(input_ids_test)

test_masks = torch.tensor(attention_masks_test)

In [38]:
test_inputs.shape

torch.Size([3263, 52])

In [39]:
test_masks.shape

torch.Size([3263, 52])

In [40]:
batch_size = 32

test_data = TensorDataset(test_inputs, test_masks)
test_dataloader = DataLoader(
    test_data, batch_size=batch_size)

Run forward pass

In [41]:
predictions = []
for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
        
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask = batch
    with torch.no_grad():        
        outputs = model(b_input_ids, attention_mask=b_input_mask)

    logits = outputs[0]
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    pred_flat = np.argmax(logits, axis=1).flatten()
    predictions.append(pred_flat)

In [42]:
predictions_full = np.hstack([p for p in predictions])

In [43]:
df_test['target'] = predictions_full
df_out = df_test[['target']]
df_out.to_csv('submission_distilbert_v3.csv')