In [19]:
import pandas as pd
from data_managment.samplers import MDSampler
import torch
import pytorch_lightning as pl

from transformers import DistilBertModel, DistilBertTokenizer
from pytorch_lightning.loggers import WandbLogger

from datasets import Dataset
from torch.utils.data import DataLoader

from sklearn.model_selection import GroupShuffleSplit

class MarkdownDataModule(pl.LightningDataModule):
    """Lightning data module that tokenizes the ``source`` text column for score regression.

    Train/validation data is read from ``train_path`` and test data from
    ``test_path`` (both feather files).  Alternatively, ready-to-use datasets can
    be handed in through ``train_dat`` / ``val_dat`` / ``test_dat``; whatever is
    supplied that way is used as is and not rebuilt by ``prepare_data``.

    Args:
        train_path: Feather file with the training frame.  It must contain the
            columns ``source``, ``pct_rank`` and ``ancestor_id``.
        test_path: Feather file with the test frame (``source`` and ``pct_rank``
            columns).  May be ``None``, in which case no test dataset is built.
        batch_size: Batch size used for tokenization and for every dataloader.
        resample: When true, the training frame is passed through
            ``MDSampler.sample_ranks`` before splitting.
        train_dat, val_dat, test_dat: Optional pre-built datasets.
        model: Name of the pretrained DistilBERT checkpoint whose tokenizer is used.
    """

    def __init__(self, train_path: str = None, test_path: str = None, batch_size: int = 32, resample=False,
                 train_dat=None, val_dat=None, test_dat=None, model="distilbert-base-uncased"):
        super().__init__()

        self.test_path = test_path
        self.train_path = train_path
        self.resample = resample

        self.batch_size = batch_size
        self.validation_size = 0.1  # fraction of ancestor groups held out for validation
        self.padding = 128          # every tokenized sequence is padded/truncated to this length
        self.num_workers = 4        # worker processes used by every dataloader

        self.tokenizer = DistilBertTokenizer.from_pretrained(model, do_lower_case=True)

        # All three datasets must live on the instance: prepare_data and the
        # dataloaders read them back through ``self``.
        self.train_dataset, self.val_dataset, self.test_dataset = train_dat, val_dat, test_dat

    def _read_train_dataset(self):
        """Read the training frame and return ``(train_dataset, validation_dataset)``.

        Raises:
            ValueError: if no ``train_path`` was configured.
        """
        if self.train_path is None:
            raise ValueError("train_path is required to build the train/validation datasets")

        df = pd.read_feather(self.train_path)

        if self.resample:
            df = MDSampler(df).sample_ranks(save=False)

        df = df.rename(columns={'pct_rank': 'score'})
        train_df, val_df = self._split_if_ancestors(df)

        return Dataset.from_pandas(train_df), Dataset.from_pandas(val_df)

    def _read_test_dataset(self):
        """Read the test frame from ``test_path`` and return it as a ``Dataset``."""
        df = pd.read_feather(self.test_path)

        # NOTE(review): the test frame is always run through MDSampler, regardless
        # of ``self.resample`` - confirm this asymmetry with the train path is intended.
        df = MDSampler(df).sample_ranks(save=False)
        df = df.rename(columns={'pct_rank': 'score'})
        return Dataset.from_pandas(df)

    def _preprocess_dataset(self, dataset):
        """Tokenize ``dataset['source']`` and expose the model inputs as torch tensors."""

        def tokenize_batch(batch):
            return self.tokenizer(
                batch['source'],
                padding='max_length',
                truncation=True,
                max_length=self.padding,
            )

        dataset = dataset.map(tokenize_batch, batched=True, batch_size=self.batch_size)

        # Only these columns are returned (as torch tensors) when the dataset is indexed.
        dataset.set_format('pt', ['input_ids', 'attention_mask', 'score'])

        return dataset

    def _split_if_ancestors(self, df):
        """Split ``df`` into train/validation frames without sharing an ``ancestor_id``.

        Rows with the same ``ancestor_id`` always end up on the same side of the split.
        """
        splitter = GroupShuffleSplit(n_splits=1, test_size=self.validation_size, random_state=0)
        train_ind, val_ind = next(splitter.split(df, groups=df["ancestor_id"]))

        # The splitter yields positional indices, so select with iloc; loc would
        # pick wrong rows (or raise) whenever the frame's index is not 0..n-1.
        train_df = df.iloc[train_ind].reset_index(drop=True)
        val_df = df.iloc[val_ind].reset_index(drop=True)

        return train_df, val_df

    def prepare_data(self):
        """Build whichever datasets were not supplied to the constructor.

        Train and validation are built together from ``train_path`` when either
        is missing.  The test dataset is built only when it is missing and a
        ``test_path`` was given.

        NOTE(review): Lightning documents ``prepare_data`` as running on a single
        process in distributed training; the state assigned here would then be
        missing on other ranks.  Fine for single-device runs - move to ``setup``
        if multi-device training is needed.
        """
        if self.train_dataset is None or self.val_dataset is None:
            train, val = self._read_train_dataset()
            print('preparing train data')
            self.train_dataset = self._preprocess_dataset(train)
            print('preparing validation data')
            self.val_dataset = self._preprocess_dataset(val)

        if self.test_dataset is None and self.test_path is not None:
            test = self._read_test_dataset()
            print('preparing test data')
            self.test_dataset = self._preprocess_dataset(test)

    def _make_loader(self, dataset, shuffle=False):
        """Create a DataLoader with the settings shared by all splits."""
        return DataLoader(dataset,
                          batch_size=self.batch_size, num_workers=self.num_workers,
                          pin_memory=True, shuffle=shuffle)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Sequential loader over the validation dataset."""
        return self._make_loader(self.val_dataset)

    def test_dataloader(self):
        """Sequential loader over the test dataset.

        Raises:
            ValueError: if no test dataset has been supplied or prepared.
        """
        if self.test_dataset is None:
            raise ValueError("no test dataset available: pass test_path or test_dat")
        return self._make_loader(self.test_dataset)



In [25]:
# Feather files holding the train and test frames, given relative to this notebook.
train = "../../data/train_dataset.fth"
test = "../../data/test_dataset.fth"

In [28]:
# `test` is a file path, so it belongs in `test_path`; `test_dat` is the slot
# for an already-built dataset object.
MDM = MarkdownDataModule(train_path=train, test_path=test, resample=True)

In [None]:
# Read, split and tokenize the datasets up front (outside of a Trainer run).
MDM.prepare_data()

In [None]:
from rank_model import MarkdownModelPl

# NOTE(review): this rebinds MDM to a module built from a different file than the
# cell above, and no test_path is given - confirm the data module is expected to
# run without a test set here.
MDM = MarkdownDataModule('data/ranks.fth')
model = MarkdownModelPl()
wandb_logger = WandbLogger(project="JupyterKaggleBaseline")

trainer = pl.Trainer(
    # `gpus=1` is deprecated (removed in Lightning 2.x); accelerator/devices is
    # the equivalent, forward-compatible spelling for "one GPU".
    accelerator="gpu",
    devices=1,
    max_epochs=20,
    logger=wandb_logger,
    enable_progress_bar=False,
    log_every_n_steps=20,
    # Gradients are accumulated over 4 batches before each optimizer step.
    accumulate_grad_batches=4,
)
trainer.fit(model, MDM)


# Creating submission

In [None]:
# Load the test frame that the submission is built from and preview its first rows.
test_path: str = '../input/train-markdown-ranks/test_dataset.fth'
test_df = pd.read_feather(test_path)
test_df.head()

In [None]:
# Presumably a placeholder target so MarkdownDataset finds a `pct_rank` column - TODO confirm.
test_df["pct_rank"] = 0
# NOTE(review): MarkdownDataset and MAX_LEN are not defined in the cells shown here;
# on a fresh kernel this cell depends on them being defined elsewhere.
# Only the markdown rows are wrapped in the dataset.
test_ds = MarkdownDataset(
    test_df[test_df["cell_type"] == "markdown"].reset_index(drop=True), max_len=MAX_LEN
)
# shuffle=False / drop_last=False keep one output per markdown row, in row order,
# which the later assignment of predictions back onto test_df relies on.
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, num_workers=1,
                          pin_memory=False, drop_last=False)


In [None]:
# Run the model over the test loader; only the second return value is kept.
# NOTE(review): `validate` is not defined in the cells shown here - confirm where it comes from.
_, y_test = validate(model, test_loader)


In [None]:
# Write the model outputs onto the markdown rows; all other rows are left as they were.
# NOTE(review): nothing in the visible cells assigns "pred" for non-markdown rows,
# so they would sort after every markdown cell - confirm that is intended.
test_df.loc[test_df["cell_type"] == "markdown", "pred"] = y_test

# One row per notebook id: its cell ids joined by spaces, in ascending "pred" order.
sub_df = (
    test_df
    .sort_values("pred")
    .groupby("id")["cell_id"]
    .apply(" ".join)
    .reset_index()
    .rename(columns={"cell_id": "cell_order"})
)
sub_df.head()


In [None]:
# Write the submission file into the working directory, without the index column.
sub_df.to_csv("submission.csv", index=False)



In [5]:
# Scratch check of `|` on two booleans; no other visible cell uses this result.
False | False

False