In [None]:
import pandas as pd

import torch
import pytorch_lightning as pl

from transformers import DistilBertModel, DistilBertTokenizer
from pytorch_lightning.loggers import WandbLogger

from datasets import Dataset
from torch.utils.data import DataLoader

from sklearn.model_selection import GroupShuffleSplit

class MarkdownDataModule(pl.LightningDataModule):
    """Lightning data module that tokenizes text cells for score regression.

    The module either receives ready-made train/validation datasets or builds
    them from a feather file: the ``pct_rank`` column is renamed to ``score``,
    rows are split into train/validation so that no ``ancestor_id`` group is
    shared between the two parts, and the ``source`` column is tokenized with
    a DistilBERT tokenizer.

    Args:
        train_path: Path to the feather file with the training data. Only
            needed when ``train_dat``/``val_dat`` are not both supplied.
        batch_size: Batch size used by the dataloaders (and as the chunk size
            of the tokenization ``map``).
        train_dat: Optional pre-built training dataset.
        val_dat: Optional pre-built validation dataset.
        model: Name or path of the pretrained checkpoint for the tokenizer.
        validation_size: Fraction of groups held out for validation.
        max_length: Length every tokenized sequence is padded/truncated to.
        num_workers: Number of worker processes for each dataloader.
    """

    def __init__(self, train_path: str = None, batch_size: int = 32,
                 train_dat=None, val_dat=None, model = "distilbert-base-uncased",
                 validation_size: float = 0.1, max_length: int = 128,
                 num_workers: int = 4):
        super().__init__()

        self.train_path = train_path
        self.batch_size = batch_size
        self.validation_size = validation_size
        # Kept under its historical attribute name; this is the fixed
        # sequence length used for both padding and truncation.
        self.padding = max_length
        self.num_workers = num_workers

        self.tokenizer = DistilBertTokenizer.from_pretrained(model, do_lower_case=True)

        self.train_dataset, self.val_dataset = train_dat, val_dat

    def _preprocess_dataset(self, df):
        """Tokenize ``df['source']`` and return a torch-formatted ``Dataset``.

        The returned dataset exposes only ``input_ids``, ``attention_mask``
        and ``score`` as tensors, so ``df`` must contain ``source`` and
        ``score`` columns.
        """
        dataset = Dataset.from_pandas(df)

        def process_batch(batch):
            # Pad/truncate every text to the same length so samples can be
            # stacked into a batch without a custom collate function.
            return self.tokenizer(
                batch['source'],
                padding='max_length',
                truncation=True,
                max_length=self.padding,
            )

        dataset = dataset.map(process_batch, batched=True, batch_size=self.batch_size)
        dataset.set_format('pt', ['input_ids', 'attention_mask', 'score'])

        return dataset

    def _split_if_ancestors(self, df):
        """Split ``df`` into train/validation frames grouped by ``ancestor_id``.

        Rows sharing an ``ancestor_id`` always land in the same part. Both
        returned frames have a fresh ``RangeIndex``.
        """
        splitter = GroupShuffleSplit(n_splits=1, test_size=self.validation_size, random_state=0)
        train_ind, val_ind = next(splitter.split(df, groups=df["ancestor_id"]))
        # ``split`` yields positional indices, so select with ``iloc``;
        # ``loc`` would pick wrong rows whenever the index is not 0..n-1.
        train_df = df.iloc[train_ind].reset_index(drop=True)
        val_df = df.iloc[val_ind].reset_index(drop=True)

        return train_df, val_df

    def _prepare_dataset(self):
        """Build the train/validation datasets from ``train_path`` if needed.

        Does nothing when both datasets are already available.

        Raises:
            ValueError: If the datasets are missing and no ``train_path`` was given.
        """
        if (self.train_dataset is not None) and (self.val_dataset is not None):
            return

        if self.train_path is None:
            raise ValueError(
                "Either provide both train_dat and val_dat, or a train_path to load data from."
            )

        df = pd.read_feather(self.train_path)
        df = df.rename(columns={'pct_rank': 'score'})
        # TODO: add a data cleaning step.
        train_df, val_df = self._split_if_ancestors(df)
        print('preparing train data')
        self.train_dataset = self._preprocess_dataset(train_df)
        print('preparing validation data')
        self.val_dataset = self._preprocess_dataset(val_df)

    def setup(self, stage=None):
        """Prepare datasets for the stages that consume them.

        Both fitting and stand-alone validation read the datasets built
        here; no test dataset is produced by this module.
        """
        if stage in ('fit', 'validate', None):
            self._prepare_dataset()

    def train_dataloader(self):
        """Return the shuffled training dataloader."""
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size, num_workers=self.num_workers,
                          pin_memory=True, shuffle=True)

    def val_dataloader(self):
        """Return the (unshuffled) validation dataloader."""
        return DataLoader(self.val_dataset,
                          batch_size=self.batch_size, num_workers=self.num_workers,
                          pin_memory=True)


class MarkdownModelPl(pl.LightningModule):
    """DistilBERT encoder with a single-output linear head, trained with MSE.

    Args:
        model: Name or path of the pretrained DistilBERT checkpoint.
    """

    def __init__(self, model = "distilbert-base-uncased"):
        super().__init__()

        self.distill_bert = DistilBertModel.from_pretrained(model, return_dict=True)
        # Size the head from the encoder config instead of hard-coding 768,
        # so checkpoints with a different hidden size also work.
        self.dense = torch.nn.Linear(self.distill_bert.config.dim, 1)
        self.loss = torch.nn.MSELoss()
        self.activation = torch.nn.LeakyReLU()

    def forward(self, input_ids, attention_mask, score=None):
        """Predict one score per sequence.

        Args:
            input_ids: Token ids as produced by the tokenizer.
            attention_mask: Attention mask matching ``input_ids``.
            score: Ignored. Accepted only so a whole batch dict (which also
                carries the target) can be splatted into this method; it is
                optional so the model can be called without labels.

        Returns:
            Tensor with one prediction per input sequence (last dim of size 1).
        """
        hidden = self.distill_bert(
            input_ids=input_ids, attention_mask=attention_mask
        )['last_hidden_state']
        # Use the hidden state at position 0 (the first token of every
        # sequence) as the sequence representation. The activation is
        # element-wise, so applying it after slicing gives the same values
        # while skipping the unused positions.
        # TODO: evaluate mean pooling over tokens as an alternative.
        first_token = self.activation(hidden[:, 0, :])
        preds = self.dense(first_token)

        return preds

    def _compute_loss(self, batch):
        """Run the model on ``batch`` and return the MSE against ``batch['score']``."""
        preds = self(**batch).reshape(-1)
        # Match the target dtype to the predictions so MSELoss never fails
        # on a float/double (or integer) mismatch.
        target = batch['score'].to(preds.dtype)
        return self.loss(preds, target)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss (MSE) and its square root (RMSE)."""
        loss = self._compute_loss(batch)
        self.log('train_batch_loss', loss)
        # Previously a constant 1 was logged here; log the real batch RMSE.
        self.log('train_RMSE', torch.sqrt(loss.detach()))

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss (MSE)."""
        loss = self._compute_loss(batch)

        self.log('val_batch_loss', loss)

        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all trainable parameters."""
        trainable_params = (p for p in self.parameters() if p.requires_grad)
        optimizer = torch.optim.Adam(
            trainable_params,
            lr=3e-4, betas=(0.9, 0.999), eps=1e-08)
        return optimizer

In [None]:
# Single source of truth for the pretrained checkpoint: the tokenizer (data
# module) and the encoder (model) must be built from the same checkpoint.
# The checkpoint name used to be stored in ``model`` and then overwritten by
# the Lightning module without ever being used.
model_name = "distilbert-base-uncased"
data_path = 'data/ranks.fth'

# Seed Python, NumPy and torch so that weight init and batch shuffling are
# reproducible across "Restart & Run All".
pl.seed_everything(0)

MDM = MarkdownDataModule(data_path, model=model_name)
model = MarkdownModelPl(model=model_name)

wandb_logger = WandbLogger(project="JupyterKaggleBaseline")

# NOTE(review): ``gpus`` is deprecated/removed in newer Lightning releases in
# favour of ``accelerator="gpu", devices=1`` — confirm against the installed
# version before upgrading.
trainer = pl.Trainer(
    gpus = 1,
    max_epochs=20,
    logger=wandb_logger,
    enable_progress_bar=False,
    log_every_n_steps=20,
    accumulate_grad_batches=4,
)
trainer.fit(model, MDM)


# Creating submission

In [None]:
# Load the test-set table from a feather file and preview the first rows.
# NOTE(review): this path is Kaggle-style ('../input/...'), while the training
# cell reads from a relative 'data/' directory — confirm both are valid in the
# same environment.
test_path = '../input/train-markdown-ranks/test_dataset.fth'
test_df = pd.read_feather(test_path)
test_df.head()

In [None]:
# Build a dataloader over the markdown rows of the test table.
# A placeholder target column is added first (all zeros).
# NOTE(review): `MarkdownDataset` and `MAX_LEN` are not defined or imported in
# any cell shown above, so this cell raises NameError on a fresh kernel unless
# they come from elsewhere — TODO confirm, or rebuild the test data through
# MarkdownDataModule._preprocess_dataset instead.
test_df["pct_rank"] = 0
test_ds = MarkdownDataset(
    test_df[test_df["cell_type"] == "markdown"].reset_index(drop=True), max_len=MAX_LEN
)
# Order is preserved (shuffle=False, drop_last=False) so predictions can be
# assigned back to the markdown rows positionally.
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, num_workers=1,
                          pin_memory=False, drop_last=False)


In [None]:
# Run the model over the test loader; only the second returned value is kept.
# NOTE(review): `validate` is not defined or imported in any cell shown above —
# TODO confirm where it comes from and that it accepts the Lightning module
# assigned to `model` in the training cell.
_, y_test = validate(model, test_loader)


In [None]:
# Attach the predictions to the markdown rows of the test table.
# NOTE(review): rows of other cell types only get a "pred" value if the column
# already exists in the loaded file; otherwise they are NaN and are sorted
# last — confirm this is the intended ordering.
test_df.loc[test_df["cell_type"] == "markdown", "pred"] = y_test

# For every notebook id, list its cell ids in ascending order of "pred",
# joined by single spaces, in a column named "cell_order".
sub_df = (
    test_df.sort_values("pred")
    .groupby("id")["cell_id"]
    .apply(" ".join)
    .reset_index()
    .rename(columns={"cell_id": "cell_order"})
)
sub_df.head()


In [None]:
# Write the submission table to disk without the DataFrame index column.
sub_df.to_csv("submission.csv", index=False)

