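# Kaggle Submission

Inference notebook: loads a fine-tuned DistilBERT sequence-classification model, splits each test excerpt into fixed-length token chunks, combines the per-chunk predictions into one weighted-average `target` per excerpt, and writes the result to `submission.csv`.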

## Package Imports

In [1]:
# PyTorch imports:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, TensorDataset, SequentialSampler

# HuggingFace imports:
import transformers
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Data manipulation imports:
import numpy as np
import pandas as pd

## Input Pre-Processing

In [2]:
# Define input parameters:
# MAX_LEN is the length of every token chunk fed to the model, counting the
# start and end tokens that are added to each chunk.
MAX_LEN = 250
# NUM_PRED is the number of values predicted per chunk; it is forwarded to
# BertModel as `num_pred` and sets the last dimension of the reshaped logits.
NUM_PRED = 1

In [3]:
def process_chunks(id_chunk, mask_chunk, max_len, start_id=101, end_id=102):
  """Wrap one chunk of token IDs in start/end tokens and pad it to `max_len`.

  Every value that is added (special tokens, mask ones, padding zeros) is
  created with the dtype and device of the tensor it is joined to, so the
  returned tensors keep the dtype of the inputs whether or not padding was
  needed. Building them with `torch.ones`/`torch.zeros` instead would promote
  padded chunks to float while full-length chunks stayed integer.

  Args:
    id_chunk: 1-D tensor of token IDs without special tokens.
    mask_chunk: 1-D attention-mask tensor aligned with `id_chunk`.
    max_len: target length of the returned tensors, special tokens included.
    start_id: ID prepended to the chunk (default 101 - presumably [CLS] in the
      BERT vocabulary; confirm against the tokenizer in use).
    end_id: ID appended to the chunk (default 102 - presumably [SEP]; confirm
      against the tokenizer in use).

  Returns:
    Tuple `(id_chunk, mask_chunk)`. Each has length `max_len` when the wrapped
    chunk is not longer than `max_len`; a longer chunk is returned unpadded
    and untruncated.
  """
  # Add start and stop IDs (mask is 1 for both, since they must be attended to):
  id_chunk = torch.cat([id_chunk.new_tensor([start_id]),
                        id_chunk,
                        id_chunk.new_tensor([end_id])])
  mask_chunk = torch.cat([mask_chunk.new_ones(1), mask_chunk, mask_chunk.new_ones(1)])
  # Pad chunks so that they're all of the same length (ID 0 / mask 0 as padding):
  pad_len = max_len - len(id_chunk)
  if pad_len > 0:
    id_chunk = torch.cat([id_chunk, id_chunk.new_zeros(pad_len)])
    mask_chunk = torch.cat([mask_chunk, mask_chunk.new_zeros(pad_len)])
  return (id_chunk, mask_chunk)

def tokenise_chunks(features, tokenizer, max_len):
  """Tokenise every excerpt and cut it into chunks of `max_len` tokens.

  Args:
    features: mapping/DataFrame whose "excerpt" entry iterates over the texts.
    tokenizer: object exposing `encode_plus`, returning "input_ids" and
      "attention_mask" tensors.
    max_len: length of each chunk, start and end tokens included.

  Returns:
    Tuple `(X_id, X_mask)` of int64 tensors with dimensions
    (number of texts x number of chunks x max_len). Texts that produce fewer
    chunks than the longest one are filled up with all-zero chunks.
  """
  # Two positions per chunk are reserved for the start and end tokens:
  body_len = max_len - 2
  per_text_ids, per_text_masks = [], []
  for excerpt in features["excerpt"]:
    # Encode without special tokens; they are added per chunk instead:
    encoded = tokenizer.encode_plus(excerpt, add_special_tokens=False, return_tensors='pt')
    id_pieces = encoded["input_ids"][0].split(body_len)
    mask_pieces = encoded["attention_mask"][0].split(body_len)
    # Wrap and pad every (ids, mask) pair of this excerpt:
    wrapped = [process_chunks(id_piece, mask_piece, max_len)
               for id_piece, mask_piece in zip(id_pieces, mask_pieces)]
    per_text_ids.append(torch.stack([pair[0] for pair in wrapped], dim=0))
    per_text_masks.append(torch.stack([pair[1] for pair in wrapped], dim=0))
  # Pad along the chunk dimension so all texts fit into one tensor each:
  padded_ids = pad_sequence(per_text_ids, batch_first=True, padding_value=0)
  padded_masks = pad_sequence(per_text_masks, batch_first=True, padding_value=0)
  return (torch.as_tensor(padded_ids, dtype=torch.int64),
          torch.as_tensor(padded_masks, dtype=torch.int64))

In [4]:
class ReadabilityData(Dataset):
    """Dataset wrapper that returns each item as a dict instead of a tuple.

    The wrapped `data` must support `len()` and integer indexing, and each of
    its items must be indexable at positions 0, 1 and 2.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return item `idx` as {"id": ..., "mask": ..., "wt": ...}."""
        record = self.data[idx]
        # Positions 0/1/2 of the record map onto the three named fields:
        return {key: record[pos] for pos, key in enumerate(("id", "mask", "wt"))}

In [5]:
def create_test_dataset(test_features, tokenizer, max_len):
  """Build the inference dataset from the raw test excerpts.

  Each item of the returned dataset is a dict with:
    - "id":   token IDs, (number of chunks x max_len)
    - "mask": attention mask, same dimensions as "id"
    - "wt":   one float32 weight per chunk, (number of chunks,)

  A chunk's weight is its share of the excerpt's attended tokens: the sum of
  the chunk's mask divided by the sum of the whole excerpt's mask. All-padding
  chunks therefore get weight 0 and the weights of one excerpt sum to 1.

  Args:
    test_features: mapping/DataFrame with an "excerpt" entry.
    tokenizer: tokenizer handed through to `tokenise_chunks`.
    max_len: chunk length, start and end tokens included.

  Returns:
    A `ReadabilityData` wrapping a `TensorDataset` of (ids, masks, weights).
  """
  # Split text into chunks and tokenise those chunks:
  ids, masks = tokenise_chunks(test_features, tokenizer, max_len)
  # Compute weightings for the sentence chunks:
  tokens_per_chunk = torch.sum(masks, axis=2, keepdim=True)
  tokens_per_text = torch.sum(masks, axis=(1,2), keepdim=True)
  weights = (tokens_per_chunk/tokens_per_text).reshape(masks.shape[0:-1])
  weights = torch.as_tensor(weights, dtype=torch.float32)
  # Bundle the tensors and expose them through the dict-returning wrapper:
  return ReadabilityData(TensorDataset(ids, masks, weights))

## Define DistilBERT Model Wrapper

In [6]:
class BertModel():
    """Inference wrapper around a fine-tuned DistilBERT sequence classifier.

    Attributes:
        num_pred: number of values predicted per chunk (last logits dimension).
        model: the model loaded from `load_dir`.
    """

    def __init__(self, load_dir, num_pred):
        """Load the fine-tuned model stored in `load_dir`."""
        self.num_pred = num_pred
        # Load fine-tuned Bert model:
        self.model = DistilBertForSequenceClassification.from_pretrained(load_dir)

    def predict(self, id, att_mask, wt):
        """Predict one weighted-average value (or vector) per excerpt.

        Args:
            id: token IDs, (batch x chunks x tokens).
            att_mask: attention mask with the same dimensions as `id`.
            wt: per-chunk weights, (batch x chunks).

        Returns:
            NumPy array of shape (batch,) when a single value is predicted per
            excerpt, otherwise (batch, num_pred). The batch axis is always
            kept, including for a batch holding a single excerpt, so results
            of successive batches can be concatenated along axis 0.
        """
        with torch.no_grad():
            if self.model.training:
                self.model.eval()
            # Compute input and output shapes: chunks are flattened into the
            # batch dimension for the forward pass and restored afterwards.
            in_shape = (id.shape[0]*id.shape[1], id.shape[2])
            out_shape = tuple(id.shape[0:-1]) + (self.num_pred,)
            id, att_mask = id.reshape(in_shape), att_mask.reshape(in_shape)
            # Make prediction with BERT model:
            logits = self.model(input_ids=id, attention_mask=att_mask)["logits"]
            # Convert output to NumPy array so that we can use nan_to_num:
            logits = logits.reshape(out_shape).numpy()
            # Take weighted-average of BERT predictions for each chunk of text.
            # NaN logits are zeroed first because NaN * 0 is still NaN and
            # would poison the sum (presumably they stem from all-padding
            # chunks, which carry zero weight - TODO confirm).
            pred = np.einsum("ij,ijk->ik", wt, np.nan_to_num(logits))
            # Drop only the trailing prediction axis; a bare squeeze() would
            # also drop the batch axis of a one-excerpt batch and return a
            # 0-d array that cannot be concatenated.
            if pred.shape[-1] == 1:
                pred = pred.squeeze(axis=-1)
        return pred

## Load Pre-Trained Model

In [7]:
# Directory holding the fine-tuned model; BertModel passes it to
# DistilBertForSequenceClassification.from_pretrained:
load_dir = "../input/finetuned-bert-readability/best_model"
# NUM_PRED sets how many values the wrapper returns per excerpt:
bert_model = BertModel(load_dir, NUM_PRED)

## Make Predictions

In [8]:
# Load test data, keeping only the columns needed for inference:
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
test_df = test_df.loc[:,["id", "excerpt"]]

# Tokenize the text in this test dataset:
tokenizer = DistilBertTokenizer.from_pretrained("../input/finetuned-bert-readability/tokenizer")
test_dataset = create_test_dataset(test_df, tokenizer, MAX_LEN)

# Drop text now that it's tokenized (new frame instead of in-place mutation,
# so the cell stays idempotent and avoids chained-assignment warnings):
test_df = test_df.drop(columns="excerpt")

# Place tokenized dataset into Dataloader; shuffle stays off so predictions
# line up with the row order of test_df:
test_dataloader = DataLoader(test_dataset,
                             batch_size=32,
                             shuffle=False)

# Make predictions. Batches are collected in a list and joined once, rather
# than re-allocating a growing array on every iteration. atleast_1d guards
# against a scalar result for a batch that holds a single excerpt.
batch_preds = []
for d in test_dataloader:
    pred = bert_model.predict(d["id"],
                              d["mask"],
                              d["wt"])
    batch_preds.append(np.atleast_1d(pred))
pred_list = np.concatenate(batch_preds, axis=0) if batch_preds else np.array([])

# Add predictions to test dataframe:
test_df["target"] = pred_list
# Save ids and predictions to csv file:
test_df.to_csv("submission.csv", index=False)

In [9]:
# Display the submission table (ids and predicted targets) as a final check:
test_df

Unnamed: 0,id,target
0,c0f722661,-0.152462
1,f0953f0a5,0.08422
2,0df072751,-0.219362
3,04caf4e0c,-2.287382
4,0e63f8bea,-1.792509
5,12537fe78,-0.871888
6,965e592c0,0.273297
