**Kaggle CommonLit Prize Challenge**

### **Install and load libraries**

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 39.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 53.3 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 33.9 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 45.6 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninst

In [None]:
from google.colab import drive

import pandas as pd
import numpy as np
import random
from datetime import datetime

import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import DistilBertForSequenceClassification, AdamW, BertConfig
from transformers import DistilBertTokenizer
from transformers import get_linear_schedule_with_warmup

### **Load training and test data sets**

In [None]:
# Mount Google Drive at /content/drive; the training CSV is read from a path under this mount.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Read the CommonLit training CSV from the mounted Drive into a DataFrame,
# report how many rows it holds, and preview the first few of them.
trng_dataset = pd.read_csv(
    "/content/drive/MyDrive/Data/Kaggle/CommonLit/train.csv",
    sep=',',
)
print('Number of training sentences =', len(trng_dataset))
trng_dataset.head()

Number of training sentences = 2834


Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [None]:
# Sanity check: (rows, columns) of the training frame.
trng_dataset.shape

(2834, 6)

In [None]:
# Pull the excerpt texts and their target values out of the frame as NumPy arrays.
trng_sentences = trng_dataset["excerpt"].values
# Targets are cast to float32 up front (the label tensor built from them later is float).
trng_labels = trng_dataset["target"].values.astype('float32')

In [None]:
# Confirm the labels were cast to float32.
trng_labels.dtype

dtype('float32')

### **Configure DistilBERT model**

In [None]:
# Pick the compute device for fine-tuning and prediction: CUDA when present, CPU otherwise.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

if gpu_available:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
# Load the tokenizer that belongs to the 'distilbert-base-uncased' checkpoint,
# requesting lower-casing of the input text (do_lower_case=True).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




In [None]:
# Measure the longest excerpt in tokens, counting the `[CLS]` and `[SEP]` special tokens,
# to see how much text a given `max_length` would keep.
token_counts = [
    len(tokenizer.encode(excerpt, add_special_tokens=True))
    for excerpt in trng_sentences
]

# default=0 mirrors a running maximum that starts from zero when there are no excerpts.
max_len = max(token_counts, default=0)

print('Max sentence length: ', max_len)

Max sentence length:  314


In [None]:
# Encode every excerpt into fixed-length token IDs plus the matching attention mask.
# For each text, `encode_plus` tokenizes it, adds the `[CLS]` / `[SEP]` special tokens,
# maps tokens to vocabulary IDs, pads or truncates to `max_length`, and builds the
# attention mask that separates real tokens from `[PAD]` positions.
encoded_excerpts = [
    tokenizer.encode_plus(
        excerpt,                        # Text to encode.
        add_special_tokens = True,      # Add '[CLS]' and '[SEP]'.
        max_length = 100,               # Fixed length; longer excerpts are truncated to this.
        truncation = True,
        padding = 'max_length',
        return_attention_mask = True,   # Also build the attention mask.
        return_tensors = 'pt',          # Return PyTorch tensors.
    )
    for excerpt in trng_sentences
]

# Keep the per-excerpt tensors in two parallel lists: token IDs and attention masks.
input_ids = [encoding['input_ids'] for encoding in encoded_excerpts]
attention_masks = [encoding['attention_mask'] for encoding in encoded_excerpts]

In [None]:
# Stack the per-excerpt tensors along the first axis (torch.cat defaults to dim=0)
# and build the float label tensor the regression loss needs.
input_ids = torch.cat(input_ids)
attention_masks = torch.cat(attention_masks)
labels = torch.tensor(trng_labels, dtype=torch.float32)  # torch.float is an alias of float32

# Show the first excerpt next to its encoded token IDs.
first_excerpt, first_token_ids = trng_sentences[0], input_ids[0]
print('Original: ', first_excerpt)
print('Token IDs:', first_token_ids)

Original:  When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape.
The floor was covered with snow-white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. The numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. Also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches.
At each end of the room, on the wall, hung a beautiful bear-skin rug.
These rugs were for prizes, one for the girls and one for the boys. And this was the game.
The girls were gathered at one end of the room and the boys at the other, and one end was called the North Pole, and the other the South Pole. Each player was given a small flag which they were to plant on reaching the Pole.
This would have been an easy matter, but each traveller was obliged to wear snowsho

In [None]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 75:25 train-validation split.
train_size = int(0.75 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
# The split is driven by its own seeded generator: no global seed has been set at this
# point in the notebook, so without it the validation set would change on every run and
# validation scores would not be comparable between runs.
split_generator = torch.Generator().manual_seed(20)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

2,125 training samples
  709 validation samples


In [None]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 32

# Training loader: samples are reshuffled on every pass over the data.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation loader: order does not matter for evaluation, so read sequentially
# (with shuffle=False and no explicit sampler, DataLoader uses a SequentialSampler).
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Load DistilBertForSequenceClassification: the pretrained DistilBERT encoder with a
# sequence-classification head on top (the head weights are newly initialised, so the
# model must be fine-tuned before its predictions mean anything).
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",      # Pretrained DistilBERT checkpoint with an uncased vocab.
    num_labels = 1,                 # Single output => regression task
    output_attentions = False,      # Whether the model returns attentions weights.
    output_hidden_states = False,   # Whether the model returns all hidden-states.
)

# Move the model to the device chosen in the device-selection cell (GPU when available,
# otherwise CPU). `model.cuda()` would raise on a CPU-only runtime even though the
# notebook already provides a CPU fallback. `.to()` returns the model, so the cell
# still displays its architecture.
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [None]:
# Earlier experiment, kept for reference: lr = 2e-5
# optimizer = AdamW(model.parameters(),
#                   lr = 2e-5, # args.learning_rate
#                   eps = 1e-8 # args.adam_epsilon
#                 )

# Current setting: AdamW over all model parameters with lr = 5e-5.
# (This cell used to be labelled "lr = 1e-6", which did not match the value actually passed.)
optimizer = AdamW(model.parameters(),
                  lr = 5e-5, # learning rate currently in use
                  eps = 1e-8 # epsilon passed to AdamW
                )

In [None]:
# Number of training epochs (full passes over the training DataLoader).
epochs = 4
# epochs = 8

# The linear learning-rate schedule below is currently disabled, so the optimizer's
# learning rate stays constant for the whole run (the training loop does not call
# scheduler.step() either). To re-enable it, uncomment both this block and that call.
#
# Total number of training steps is [number of batches] x [number of epochs]. 
# total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# scheduler = get_linear_schedule_with_warmup(optimizer, 
#                                             num_warmup_steps = 0,
#                                             num_training_steps = total_steps)

In [None]:
# Calculate MSE
def get_mse(preds, labels):
    """Return the mean squared error between predictions and targets.

    Both inputs are flattened before comparison, so a ``(batch, 1)`` array of
    model outputs can be compared directly with a ``(batch,)`` array of labels.

    Args:
        preds: Array-like of predicted values (NumPy array, list, ...).
        labels: Array-like of target values holding the same number of elements.

    Returns:
        The mean of the squared element-wise differences (NaN for empty input).

    Raises:
        ValueError: If the two inputs do not hold the same number of elements.
    """
    pred_flat = np.asarray(preds).ravel()
    labels_flat = np.asarray(labels).ravel()

    # Without this check a size mismatch (e.g. one prediction vs. many labels) would be
    # silently broadcast and produce a plausible-looking but wrong score.
    if pred_flat.size != labels_flat.size:
        raise ValueError(
            "preds and labels must have the same number of elements, got {} and {}".format(
                pred_flat.size, labels_flat.size
            )
        )

    return np.mean((pred_flat - labels_flat) ** 2)

In [None]:
# Fine-tune the model for `epochs` epochs, evaluating on the validation set after each one.
#
# Seed every RNG in play to make the downstream results as reproducible as possible:
# `torch.manual_seed` seeds PyTorch's default generator (which the shuffling training
# DataLoader draws from), `torch.cuda.manual_seed_all` seeds the GPU generators.
# Some GPU operations are non-deterministic, so results may still vary slightly between runs.
#
# With num_labels == 1 the model's built-in loss is MSE, so the validation loss (computed by
# the model) and the validation "accuracy" (computed with get_mse) measure the same quantity
# and should agree up to floating-point error.
seed_val = 20

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []
t_start = datetime.now()

# For each epoch...
for epoch_i in range(epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t_epoch_trng = datetime.now()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Training mode: dropout layers are active.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = (datetime.now() - t_epoch_trng).total_seconds()
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch is (input ids, attention mask, labels), in TensorDataset order.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step before the new backward pass.
        model.zero_grad()
        trng_output = model(b_input_ids,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        total_train_loss += trng_output.loss.item()

        # Perform a backward pass to calculate the gradients.
        trng_output.loss.backward()

        # Gradient clipping and the LR scheduler are intentionally left disabled here:
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = datetime.now() - t_epoch_trng

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: ", training_time.total_seconds())

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t_epoch_validation = datetime.now()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Running sums are weighted by batch size so that a short final batch does not
    # count as much as a full one when the per-sample average is taken below.
    total_eval_accuracy = 0.0   # sum over samples of squared error (via get_mse)
    total_eval_loss = 0.0       # sum over samples of the model-reported loss
    n_eval_samples = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Labels are passed so the model also returns its loss; without them the
            # reported validation loss would stay at zero.
            validation_output = model(b_input_ids,
                                      attention_mask=b_input_mask,
                                      labels=b_labels)

        n_batch = b_labels.size(0)
        n_eval_samples += n_batch
        total_eval_loss += validation_output.loss.item() * n_batch

        logits = validation_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += get_mse(logits, label_ids) * n_batch

    # Guard against an empty validation loader so the division below cannot fail.
    eval_denominator = max(n_eval_samples, 1)

    # Report the final "accuracy" for this validation run (this is the validation MSE:
    # lower is better).
    avg_val_accuracy = total_eval_accuracy / eval_denominator
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all validation samples.
    avg_val_loss = total_eval_loss / eval_denominator

    # Measure how long the validation run took.
    validation_time = datetime.now() - t_epoch_validation

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: ", validation_time.total_seconds())

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("Total training took: ", (datetime.now() - t_start).total_seconds())


Training...
  Batch    40  of     67.    Elapsed: 10.48063.

  Average training loss: 0.61
  Training epcoh took:  17.291608

Running Validation...
  Accuracy: 0.38
  Validation Loss: 0.00
  Validation took:  1.998607

Training...
  Batch    40  of     67.    Elapsed: 10.496143.

  Average training loss: 0.26
  Training epcoh took:  17.435099

Running Validation...
  Accuracy: 0.35
  Validation Loss: 0.00
  Validation took:  2.034093

Training...
  Batch    40  of     67.    Elapsed: 10.623338.

  Average training loss: 0.13
  Training epcoh took:  17.620707

Running Validation...
  Accuracy: 0.33
  Validation Loss: 0.00
  Validation took:  2.055256

Training...
  Batch    40  of     67.    Elapsed: 10.800106.

  Average training loss: 0.07
  Training epcoh took:  17.989932

Running Validation...
  Accuracy: 0.41
  Validation Loss: 0.00
  Validation took:  2.1083
Total training took:  78.536898


In [None]:
# training_stats

In [None]:
# Per-epoch validation metric (stored under 'Valid. Accur.'; computed with get_mse), in epoch order.
valid_accuracy = [
    epoch_stats['Valid. Accur.']
    for epoch_stats in training_stats
]
valid_accuracy

[0.3810140174368154,
 0.3479061010091201,
 0.3297830996306046,
 0.4117939455353695]

In [None]:
# RMSE of the best epoch: take the lowest per-epoch validation MSE and square-root it.
best_val_mse = np.min(valid_accuracy)
np.sqrt(best_val_mse)

0.5742674460829245