# AI4Code PyTorch DistilBERT Baseline

Much of this code is adapted from Kaggle's starter notebook: https://www.kaggle.com/code/ryanholbrook/getting-started-with-ai4code

The starter's model is replaced with a DistilBERT model.

In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

# Widen pandas' display limits for the DataFrame previews shown below.
pd.options.display.width = 180
pd.options.display.max_colwidth = 120

# Local directory with the pretrained DistilBERT files; it is handed to
# ``from_pretrained`` for both the tokenizer and the model.
BERT_PATH = "../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased"
# NOTE(review): this directory is spelled with underscores, while the feather
# files are read from '../input/train-markdown-ranks' (hyphens), and the name is
# not referenced in the visible code -- confirm whether it is still needed.
data_dir = Path('../input/train_markdown_ranks')


In [2]:
# Feather file holding the training rows. The code below reads its `source`,
# `pct_rank` and `ancestor_id` columns.
data_path = '../input/train-markdown-ranks/train_markdown_ranks.fth'
df = pd.read_feather(data_path)
# Preview the first rows.
df.head()

In [11]:
import pytorch_lightning as pl
from pytorch_lightning import LightningModule, Trainer

from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertModel, DistilBertTokenizer

from sklearn.model_selection import GroupShuffleSplit

# Token budget per cell: the tokenizer truncates and pads every input to this length.
MAX_LEN = 128
# Size of the validation split, passed as ``test_size`` to GroupShuffleSplit.
NVALID = 0.1


class MarkdownDataset(Dataset):
    """Torch dataset that tokenizes one row of ``df`` per item.

    Each item is ``(input_ids, attention_mask, target)``: the first two are
    ``LongTensor``s padded/truncated to ``max_len`` tokens, and ``target`` is
    a one-element ``FloatTensor`` holding the row's ``pct_rank``.

    Args:
        df: Frame providing at least the ``source`` and ``pct_rank`` columns.
        max_len: Token length every encoded input is padded/truncated to.
    """

    def __init__(self, df, max_len=MAX_LEN):
        super().__init__()
        # Keep a re-indexed copy so the stored frame always has a 0..n-1 index.
        self.df = df.reset_index(drop=True)
        self.max_len = max_len
        self.tokenizer = DistilBertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        record = self.df.iloc[index]

        encoded = self.tokenizer.encode_plus(
            record.source,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )

        input_ids = torch.LongTensor(encoded["input_ids"])
        attention_mask = torch.LongTensor(encoded["attention_mask"])
        target = torch.FloatTensor([record.pct_rank])
        return input_ids, attention_mask, target
    

class MarkdownDataModule(pl.LightningDataModule):
    """Lightning data module that serves train/validation ``MarkdownDataset``s.

    Either pass ready-made datasets through ``train_dat`` and ``val_dat``, or
    give ``train_path``: the feather file is then loaded during ``setup`` and
    split by ``ancestor_id`` group into train and validation parts.

    Args:
        train_path: Feather file to load when the datasets are not supplied.
        batch_size: Batch size used by both dataloaders.
        train_dat: Optional pre-built training dataset.
        val_dat: Optional pre-built validation dataset.
    """

    def __init__(
            self, train_path: str = None, batch_size: int = 32,
            train_dat=None, val_dat=None
    ):
        super().__init__()

        self.train_path = train_path
        self.batch_size = batch_size

        self.train_dataset, self.val_dataset = train_dat, val_dat

    def setup(self, stage=None):
        """Build the datasets for the ``fit`` stage (or when no stage is given)."""
        if stage == 'fit' or stage is None:
            self._prepare_dataset()

        if stage == 'test' or stage is None:
            # No test split is managed by this module.
            pass

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size, num_workers=1,
                          pin_memory=True, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader (original order, no shuffling)."""
        return DataLoader(self.val_dataset,
                          batch_size=self.batch_size, num_workers=1,
                          pin_memory=True)

    def _prepare_dataset(self):
        """Load ``train_path`` and create both datasets unless both already exist.

        If either dataset is missing, both are rebuilt from ``train_path``.

        Raises:
            ValueError: If datasets must be built but ``train_path`` is ``None``.
        """
        if (self.train_dataset is not None) and (self.val_dataset is not None):
            return

        if self.train_path is None:
            raise ValueError(
                "train_path is required when train_dat and val_dat are not both provided"
            )

        df = pd.read_feather(self.train_path)

        # Split on ancestor_id so rows sharing a group never straddle train and validation.
        splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)
        train_ind, val_ind = next(splitter.split(df, groups=df["ancestor_id"]))

        # split() yields positional indices, so select rows with iloc; label-based
        # loc is only equivalent when the frame happens to carry a default RangeIndex.
        train_df = df.iloc[train_ind].reset_index(drop=True)
        val_df = df.iloc[val_ind].reset_index(drop=True)

        self.train_dataset = MarkdownDataset(train_df, max_len=MAX_LEN)
        self.val_dataset = MarkdownDataset(val_df, max_len=MAX_LEN)

In [12]:
import sys, os
import torch.nn.functional as F
import torch.nn as nn
import torch

from tqdm import tqdm


def read_data(data):
    """Split a batch into ``(inputs, target)``.

    All elements of ``data`` except the last are returned as a tuple of
    inputs; the last element is returned as the target.
    """
    inputs = tuple(data[:-1])
    return inputs, data[-1]


def validate(model, val_loader):
    """Run ``model`` over ``val_loader`` without gradients and collect the outputs.

    Args:
        model: Module called as ``model(ids, mask)``. It is switched to eval
            mode and left in that mode.
        val_loader: Iterable of batches whose first two elements are the ids
            and mask tensors and whose last element is the target tensor.

    Returns:
        ``(labels, preds)``: two flat NumPy arrays in loader order. Both are
        empty when the loader yields no batches.
    """
    model.eval()

    # Send inputs to wherever the model's weights live instead of assuming CUDA,
    # so a model that sits on the CPU can be evaluated as well.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    tbar = tqdm(val_loader, file=sys.stdout)

    preds = []
    labels = []

    with torch.no_grad():
        for data in tbar:
            inputs, target = read_data(data)

            pred = model(inputs[0].to(device), inputs[1].to(device))

            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

    if not preds:
        # np.concatenate rejects an empty list; report "no data" as empty arrays.
        return np.array([]), np.array([])

    return np.concatenate(labels), np.concatenate(preds)


class MarkdownModelPl(LightningModule):
    """DistilBERT encoder with a linear head that regresses one value per input.

    The head reads the encoder's hidden state at the first token position and
    is trained with mean-squared error against the batch target.
    """

    def __init__(self):
        super().__init__()
        self.distill_bert = DistilBertModel.from_pretrained(BERT_PATH)
        # Size the head from the loaded checkpoint's config instead of hard-coding
        # 768, so the head always matches the encoder's hidden width.
        self.top = nn.Linear(self.distill_bert.config.dim, 1)

        self.criterion = torch.nn.MSELoss()

    def forward(self, ids, mask):
        """Return one prediction per row for token ``ids`` and attention ``mask``."""
        # Element 0 of the encoder output is the per-token hidden-state tensor.
        x = self.distill_bert(ids, mask)[0]
        # Keep only the first token's vector as the sequence summary.
        x = self.top(x[:, 0, :])
        return x

    def training_step(self, batch, batch_idx):
        """Compute the MSE loss for one ``(ids, mask, target)`` batch."""
        inputs, target = self.__read_data(batch)
        pred = self(inputs[0], inputs[1])

        loss = self.criterion(pred, target)
        return loss

    def configure_optimizers(self):
        """Create an Adam optimizer over all trainable parameters."""
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=3e-4, betas=(0.9, 0.999), eps=1e-08)
        return optimizer

    @staticmethod
    def __read_data(data):
        """Split a batch into ``(inputs, target)``; the last element is the target."""
        return tuple(data[:-1]), data[-1]


In [13]:
# Only the path is stored here; the file is loaded and split later, in setup().
MDM = MarkdownDataModule(data_path)
# Loads the pretrained DistilBERT weights from BERT_PATH.
model = MarkdownModelPl()

In [14]:
# Train for a single epoch on one GPU.
# NOTE(review): the `gpus` argument is deprecated in newer pytorch_lightning
# releases in favour of `accelerator`/`devices` -- confirm against the installed version.
trainer = Trainer(gpus=1, max_epochs=1)
trainer.fit(model, MDM)

In [15]:
# Rows to predict on; later cells read its `cell_type`, `id`, `cell_id` and `source` columns.
test_path = '../input/train-markdown-ranks/test_dataset.fth'
test_df = pd.read_feather(test_path)
# Preview the first rows.
test_df.head()

In [16]:
# MarkdownDataset always reads a ``pct_rank`` column, so give the unlabeled
# test rows a placeholder value.
test_df["pct_rank"] = 0

# Only markdown cells are scored by the model.
markdown_rows = test_df[test_df["cell_type"].eq("markdown")].reset_index(drop=True)
test_ds = MarkdownDataset(markdown_rows, max_len=MAX_LEN)

# Keep the original row order (no shuffling, no dropped tail batch) so the
# predictions can be written back to the matching rows.
test_loader = DataLoader(
    test_ds,
    batch_size=32,
    shuffle=False,
    num_workers=1,
    pin_memory=False,
    drop_last=False,
)


In [17]:
# validate() returns (labels, preds); the labels here are only the placeholder
# pct_rank values, so just the predictions are kept.
_, y_test = validate(model, test_loader)


In [18]:
# Attach the predictions to the markdown rows, in the same order they were fed
# to the loader.
# NOTE(review): rows with any other cell_type receive no "pred" value (NaN) and
# are therefore sorted after every predicted row -- confirm this is intended.
test_df.loc[test_df["cell_type"].eq("markdown"), "pred"] = y_test

# Per notebook id, join the cell ids in ascending order of predicted rank.
sub_df = (
    test_df.sort_values("pred")
    .groupby("id")["cell_id"]
    .apply(lambda cells: " ".join(cells))
    .reset_index()
    .rename(columns={"cell_id": "cell_order"})
)
sub_df.head()


In [19]:
# Write the submission file without the DataFrame index column.
sub_df.to_csv("submission.csv", index=False)
