In [None]:
# Install the notebook's third-party dependencies with the interpreter that runs this kernel
# (`sys.executable`), so the packages land in the kernel's own environment.
# NOTE(review): versions are not pinned, so a fresh run may pull different releases.
import sys
!{sys.executable} -m pip install pandas nb_black pytorch-lightning torch tokenizers transformers

In [None]:
import sys
import inspect
import pathlib
from pathlib import Path
from collections import Iterable

import numpy as np
import pandas as pd

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset

import tokenizers
from transformers import AutoModel, AutoTokenizer

import pytorch_lightning as pl

from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit

#%load_ext lab_black

# Widen pandas' display limits so long text cells and all columns are visible when a frame is shown.
pd.options.display.max_colwidth = 250
pd.options.display.max_columns = 250

In [None]:

# Colab-only: mount Google Drive so the dataset CSV stored there can be copied into the runtime.
from google.colab import drive

drive.mount('/content/drive')

# Local working directories for the dataset; created if they do not exist yet.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)
(DATA_DIR / "raw").mkdir(exist_ok=True)

# Copy the raw CSV from the mounted Drive into data/raw.
# Both paths are relative to the current working directory.
!cp drive/MyDrive/comp_data/openhack/kanunum-nlp-doc-analysis-dataset.csv data/raw

In [None]:


from inspect import signature

class KanunumClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one or two text columns of a DataFrame on access.

    Each item is tokenized lazily in ``__getitem__`` with a Hugging Face tokenizer
    loaded via ``AutoTokenizer.from_pretrained``. When ``is_inference`` is False the
    dataset also returns a one-hot label vector built from ``MAPPINGS``.

    Args:
        df: Source frame. It is never modified; the needed columns are copied.
        text_field: Name of the column holding the (first) text to tokenize.
        text_pair: Optional name of a second text column, tokenized as the pair text.
        target_field: Name of the category column (only read when ``is_inference``
            is False).
        pretrained_model_name_or_path: Identifier passed to
            ``AutoTokenizer.from_pretrained``.
        encode_pair: If True and ``text_pair`` is given, both columns are tokenized
            together.
        tokenizer_init_kwargs: Accepted for interface compatibility; currently unused.
        tokenizer_encode_kwargs: Extra keyword arguments for the tokenizer call.
            Keys that ``tokenizer.encode_plus`` does not accept are dropped, and
            ``return_attention_mask`` / ``return_token_type_ids`` are forced to True.
        return_params: Accepted for interface compatibility; currently unused. The
            returned keys are always ``input_ids``, ``attention_mask`` and
            ``token_type_ids``.
        is_inference: If True, no labels are built and items are feature dicts only.

    Raises:
        AssertionError: If a required column is missing or an argument has the
            wrong type.
        ValueError: If ``target_field`` contains a category not present in
            ``MAPPINGS`` (only when ``is_inference`` is False).
    """

    # Normalised (lower-cased, stripped) category name -> class index.
    MAPPINGS = {'cumhurbaşkanlığı kararnamesi': 0,
                'genelge': 1,
                'kanun': 2,
                'kanun hükmünde kararname': 3,
                'komisyon raporu': 4,
                'resmi gazete': 5,
                'tebliğ': 6,
                'tüzük': 7,
                'yönetmelik': 8,
                'özelge': 9}

    def __init__(
        self,
        df,
        text_field="baslik",
        text_pair=None,
        target_field="kategori",
        pretrained_model_name_or_path="dbmdz/convbert-base-turkish-mc4-uncased",
        encode_pair=True,
        tokenizer_init_kwargs=None,
        tokenizer_encode_kwargs=None,
        return_params=None,
        is_inference=True,
    ):
        assert isinstance(text_field, str) and text_field in df.columns

        self.labels = None
        self.num_class = None

        if not is_inference:
            assert target_field in df.columns
            self.num_class = len(self.MAPPINGS)
            self.labels = self._encode_labels(df[target_field])

        if not tokenizer_encode_kwargs:
            tokenizer_encode_kwargs = {}
        assert isinstance(tokenizer_encode_kwargs, dict)

        # Merge into a fresh dict so the caller's mapping (which may be shared
        # between several datasets) is never mutated.
        requested_kwargs = {
            **tokenizer_encode_kwargs,
            "return_attention_mask": True,
            "return_token_type_ids": True,
        }

        self.return_params = ["input_ids", "attention_mask", "token_type_ids"]

        self.is_pair_encoded = bool(encode_pair and text_pair)
        if self.is_pair_encoded:
            assert isinstance(text_pair, str) and text_pair in df.columns

        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)

        # Keep only the keyword arguments the tokenizer's encode_plus accepts.
        accepted = signature(self.tokenizer.encode_plus).parameters.keys()
        self.tokenizer_encode_kwargs = {
            key: val for key, val in requested_kwargs.items() if key in accepted
        }

        text_columns = [text_field]
        if self.is_pair_encoded:
            text_columns.append(text_pair)

        # Positional 0..n-1 index so that ``idx`` in __getitem__ addresses rows directly.
        self.df = df.loc[:, text_columns].reset_index(drop=True)

    @classmethod
    def _encode_labels(cls, categories):
        """Turn a Series of category names into a one-hot DataFrame.

        Names are lower-cased and stripped before lookup in ``MAPPINGS``. Rows are
        addressed by position, so the Series may carry any index. The input is not
        modified.

        Args:
            categories: Series of category names.

        Returns:
            DataFrame of shape ``(len(categories), len(MAPPINGS))`` with a single 1
            per row.

        Raises:
            ValueError: If any value does not map to a known category.
        """
        normalized = categories.str.lower().str.strip()
        class_ids = normalized.map(cls.MAPPINGS)

        unknown = class_ids.isna()
        if unknown.any():
            raise ValueError(
                "Unknown categories in target column: "
                f"{normalized[unknown].unique().tolist()}"
            )

        one_hot = np.zeros(shape=(len(class_ids), len(cls.MAPPINGS)))
        one_hot[np.arange(len(class_ids)), class_ids.to_numpy(dtype=int)] = 1
        return pd.DataFrame(one_hot)

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors.

        Returns:
            A dict mapping each name in ``self.return_params`` to a ``torch.long``
            tensor. When labels were built, a ``(features, label)`` tuple is
            returned instead, where ``label`` is a ``torch.float32`` one-hot vector.
        """
        # The row holds the text (and, if enabled, the pair text) in column order.
        encoded = self.tokenizer(
            *self.df.loc[idx, :].values, **self.tokenizer_encode_kwargs
        )
        features = {
            param: torch.tensor(encoded[param], dtype=torch.long)
            for param in self.return_params
        }

        if self.labels is None:
            return features

        label = torch.tensor(self.labels.loc[idx, :].values, dtype=torch.float32)
        return features, label

In [None]:

class KanunumDataModule(pl.LightningDataModule):
    """Lightning data module that reads the Kanunum CSV files and serves data loaders.

    CSV files matching ``data_prefix`` inside ``data_dir`` are read and concatenated.
    The rows are then divided into train / validation (and optionally test) frames,
    each wrapped in a ``KanunumClassificationDataset``.

    The split is computed once per instance and cached, so the train/val datasets
    and the test dataset always come from the same partition even when the split
    arguments carry no fixed ``random_state``.

    Args:
        data_dir: Directory that is searched for CSV files.
        data_prefix: Glob pattern for the CSV files; must end with ``csv``.
        label: Column used for stratification.
        text_field, text_pair, encode_pair, pretrained_model_name_or_path,
        tokenizer_init_kwargs, tokenizer_encode_kwargs, return_params:
            Forwarded unchanged to ``KanunumClassificationDataset``.
        train_indices: Optional index labels of the training rows.
        test_indices: Optional index labels of the test rows. Mutually exclusive
            with ``test_split_kwargs``.
        split_kwargs: Keyword arguments for the train/validation
            ``train_test_split`` call.
        test_split_kwargs: Keyword arguments for the test ``train_test_split`` call.
        batch_size, num_workers, drop_last: Defaults for every ``DataLoader``.
        data_loader_kwargs: Extra ``DataLoader`` keyword arguments; these override
            the three defaults above.
        errors: ``"raise"`` passes ``split_kwargs`` through as-is; ``"ignore"``
            silently drops keys that ``train_test_split`` does not accept.
    """

    # Columns every CSV must provide (after lower-casing its headers); only these are kept.
    DATA_FIELDS = ['id', 'kategori', 'baslik', 'rega_no', 'mukerrer_no', 'rega_tarihi',
                   'kurum', 'mevzuat_no', 'belge_sayi', 'mevzuat_tarihi', 'donem',
                   'sira_no', 'madde_sayisi', 'data_text', 'url', 'kanunum_url']

    def __init__(self,
                 data_dir: pathlib.Path,
                 data_prefix="kanunum-nlp-doc-analysis*.csv",
                 label="kategori",
                 text_field='baslik',
                 train_indices=None,
                 test_indices=None,
                 split_kwargs=None,
                 test_split_kwargs=None,
                 pretrained_model_name_or_path='dbmdz/convbert-base-turkish-mc4-uncased',
                 text_pair=None,
                 encode_pair=True,
                 tokenizer_init_kwargs=None,
                 tokenizer_encode_kwargs=None,
                 return_params=None,
                 batch_size=256,
                 num_workers=0,
                 drop_last=False,
                 data_loader_kwargs=None,
                 errors="raise"
                 ):

        assert errors in ("raise", "ignore")
        # Explicit test indices and random test-split arguments contradict each other.
        assert not all([test_split_kwargs is not None, test_indices is not None])

        if not split_kwargs:
            split_kwargs = {}

        if not data_loader_kwargs:
            data_loader_kwargs = {}

        assert isinstance(split_kwargs, dict)

        super().__init__()

        self.data_dir = data_dir
        self.data_prefix = data_prefix
        self.dataset_kwargs = {"pretrained_model_name_or_path": pretrained_model_name_or_path,
                               "text_field": text_field,
                               "text_pair": text_pair,
                               "encode_pair": encode_pair,
                               "tokenizer_init_kwargs": tokenizer_init_kwargs,
                               "tokenizer_encode_kwargs": tokenizer_encode_kwargs,
                               "return_params": return_params,
                               }

        # Caller-supplied loader kwargs win over the three convenience defaults.
        self.data_loader_kwargs = {"drop_last": drop_last,
                                   "num_workers": num_workers,
                                   "batch_size": batch_size,
                                   **data_loader_kwargs}
        self.label = label

        self.train_indices = train_indices
        self.test_indices = test_indices

        self.split_kwargs = split_kwargs
        self.test_split_kwargs = test_split_kwargs

        # A test split is produced when any explicit indices or non-empty test-split
        # arguments were supplied; stored as a real bool.
        self.include_test_split = bool(
            any(arg is not None for arg in (train_indices, test_indices))
            or test_split_kwargs
        )
        self.errors = errors

        # Filled lazily by _resolve_splits(): ((train_df, val_df), test_df).
        self._splits = None

    def prepare_data(self):
        """Read and concatenate the CSV files into ``self.df``."""
        self.df = self.__class__._read_and_merge(self.data_dir, self.data_prefix)

    def _resolve_splits(self):
        """Return ``((train, val), test_df)``, computing the partition only once.

        Caching keeps every ``setup`` call on the same partition; recomputing a
        random split per call could place test rows into train/val.
        """
        cached = getattr(self, "_splits", None)
        if cached is None:
            cached = self.__class__._construct_test_indices(
                self.df,
                label=self.label,
                test_split_kwargs=self.test_split_kwargs,
                split_kwargs=self.split_kwargs,
                train_indices=self.train_indices,
                test_indices=self.test_indices,
                include_test_split=self.include_test_split,
                errors=self.errors,
            )
            self._splits = cached
        return cached

    def setup(self, stage=None):
        """Build the datasets required for ``stage``.

        ``"fit"``, ``"validate"`` and ``None`` build ``self.train`` and ``self.val``.
        ``"test"`` and ``"predict"`` build ``self.test`` and require a configured
        test split. ``None`` additionally builds ``self.test`` when a test split is
        configured. The test dataset is created with ``is_inference=True``, i.e.
        without labels.

        Raises:
            AssertionError: If the label column is missing, or a test/predict stage
                is requested without a configured test split.
        """
        if getattr(self, "df", None) is None:
            self.prepare_data()

        wants_fit = stage is None or stage in ("fit", "validate")

        if stage in ("test", "predict"):
            assert self.include_test_split
            wants_test = True
        else:
            wants_test = stage is None and self.include_test_split

        if not (wants_fit or wants_test):
            return

        if wants_fit:
            assert self.label in self.df.columns

        (train, val), test_df = self._resolve_splits()

        if wants_fit:
            fit_kwargs = {**self.dataset_kwargs, "is_inference": False}
            self.train = KanunumClassificationDataset(train, **fit_kwargs)
            self.val = KanunumClassificationDataset(val, **fit_kwargs)

        if wants_test:
            test_kwargs = {**self.dataset_kwargs, "is_inference": True}
            self.test = KanunumClassificationDataset(test_df, **test_kwargs)

    def train_dataloader(self):
        """Return the loader over the training dataset."""
        return DataLoader(self.train, **self.data_loader_kwargs)

    def val_dataloader(self):
        """Return the loader over the validation dataset."""
        return DataLoader(self.val, **self.data_loader_kwargs)

    def test_dataloader(self):
        """Return the loader over the test dataset."""
        return DataLoader(self.test, **self.data_loader_kwargs)

    def predict_dataloader(self):
        """Return the loader used for prediction (the test dataset)."""
        return DataLoader(self.test, **self.data_loader_kwargs)

    @classmethod
    def _construct_fold_indices(cls,
                                df,
                                label,
                                split_signature,
                                split_kwargs,
                                include_test_split=True,
                                errors="raise",
                                test_split_kwargs=None,
                                ):
        """Build a long table of cross-validation fold memberships.

        Args:
            df: Source frame; must contain ``label`` and ``id`` columns.
            label: Column used for stratification.
            split_signature: Splitter class (e.g. ``StratifiedKFold``); it is
                instantiated with ``split_kwargs`` and its ``split`` method is used.
            split_kwargs: Keyword arguments for ``split_signature``.
            include_test_split: If True, a stratified test set is held out first.
            errors: ``"ignore"`` drops ``split_kwargs`` keys that
                ``split_signature`` does not accept.
            test_split_kwargs: Keyword arguments for the hold-out
                ``train_test_split`` call (e.g. ``test_size``, ``random_state``).

        Returns:
            ``(fold_indices, test_df)``. ``fold_indices`` has the columns ``fold``,
            ``indices`` (row position in the frame that remains after the hold-out),
            ``is_train`` and ``id``. ``test_df`` is None when no test set is held out.
        """
        split_kwargs = dict(split_kwargs or {})
        if errors == "ignore":
            accepted = inspect.signature(split_signature).parameters.keys()
            split_kwargs = {key: val for key, val in split_kwargs.items() if key in accepted}

        cv = split_signature(**split_kwargs)

        test_df = None
        if include_test_split:
            _, test_indices = train_test_split(
                df.index, stratify=df[label], **(test_split_kwargs or {})
            )
            test_df = df.loc[test_indices].reset_index(drop=True)
            df = df.loc[~df.index.isin(test_indices)]

        # The splitter yields positions, so give the frame a positional index
        # before mapping those positions to document ids.
        df = df.reset_index(drop=True)
        position_to_id = df["id"].to_dict()

        frames = []
        for fold_idx, (fold_train, fold_val) in enumerate(cv.split(df.index, df[label])):
            frames.append(pd.DataFrame({"fold": fold_idx, "indices": fold_train, "is_train": True}))
            frames.append(pd.DataFrame({"fold": fold_idx, "indices": fold_val, "is_train": False}))

        fold_indices = pd.concat(frames)
        fold_indices["id"] = fold_indices["indices"].map(position_to_id)

        return fold_indices, test_df

    @classmethod
    def _construct_test_indices(cls,
                                df,
                                label,
                                test_split_kwargs=None,
                                split_kwargs=None,
                                train_indices=None,
                                test_indices=None,
                                include_test_split=False,
                                errors="raise",
                                #split_signature=None,
                                ):
        """Partition ``df`` into train, validation and (optionally) test frames.

        ``train_indices`` and ``test_indices`` are index labels of ``df``. All
        label-based selection happens before any index is reset.

        * Test rows: ``test_indices`` if given, otherwise a stratified
          ``train_test_split`` drawn from the rows not claimed by ``train_indices``.
        * Train rows: ``train_indices`` if given, otherwise a stratified
          ``train_test_split`` of the rows left after removing the test rows.
        * Validation rows: whatever is left after removing test and train rows.

        Args:
            df: Source frame containing ``label``.
            label: Column used for stratification.
            test_split_kwargs: Keyword arguments for the test split.
            split_kwargs: Keyword arguments for the train/validation split.
            train_indices: Optional index labels of the training rows.
            test_indices: Optional index labels of the test rows.
            include_test_split: Whether a test frame is produced at all.
            errors: ``"ignore"`` drops ``split_kwargs`` keys that
                ``train_test_split`` does not accept.

        Returns:
            ``((train, val), test_df)``. ``train`` and ``val`` carry a fresh 0..n-1
            index; ``test_df`` keeps the original labels and is None when
            ``include_test_split`` is False.

        Raises:
            ValueError: If ``train_indices`` and ``test_indices`` overlap.
        """
        from sklearn.model_selection import train_test_split

        has_train_idx = train_indices is not None
        has_test_idx = test_indices is not None

        test_split_kwargs = test_split_kwargs or {}
        split_kwargs = split_kwargs or {}

        test_df = None
        remaining = df

        if include_test_split:
            if not has_test_idx:
                # Never sample test rows out of an explicitly requested training set.
                pool = df.loc[df.index.difference(train_indices)] if has_train_idx else df
                _, test_indices = train_test_split(
                    pool.index, stratify=pool[label], **test_split_kwargs
                )
            elif has_train_idx:
                overlap = pd.Index(train_indices).intersection(pd.Index(test_indices))
                if len(overlap) > 0:
                    raise ValueError(
                        f"train_indices and test_indices overlap on {len(overlap)} rows"
                    )

            test_df = df.loc[test_indices]
            remaining = df.loc[~df.index.isin(test_indices)]

        if has_train_idx:
            train_labels = pd.Index(train_indices)
        else:
            if errors == "ignore":
                accepted = inspect.signature(train_test_split).parameters.keys()
                split_kwargs = {key: val for key, val in split_kwargs.items() if key in accepted}

            train_labels, _ = train_test_split(
                remaining.index, stratify=remaining[label], **split_kwargs
            )

        is_train_row = remaining.index.isin(train_labels)
        train = remaining.loc[train_labels].reset_index(drop=True)
        val = remaining.loc[~is_train_row].reset_index(drop=True)

        return (train, val), test_df

    @classmethod
    def _read_and_merge(cls,
                        path: pathlib.Path,
                        glob: str):
        """Read every CSV under ``path`` that matches ``glob`` and stack them.

        Column names are lower-cased. Files that lack any of ``DATA_FIELDS`` are
        skipped. Files are read in sorted path order and the result gets a fresh
        0..n-1 index, so row labels are unique and reproducible.

        Raises:
            AssertionError: If ``glob`` is not a string ending in ``csv``.
            ValueError: If no matching file provides all ``DATA_FIELDS``.
        """
        assert isinstance(glob, str)
        assert glob.endswith("csv")

        required = set(cls.DATA_FIELDS)
        frames = []

        for csv_path in sorted(path.glob(glob)):
            frame = pd.read_csv(csv_path)
            frame.columns = frame.columns.str.lower()
            if not required.issubset(frame.columns):
                continue

            frames.append(frame.loc[:, cls.DATA_FIELDS])

        if not frames:
            raise ValueError(
                f"No CSV matching {glob!r} with all required columns found in {path}"
            )

        return pd.concat(frames, ignore_index=True)