In [None]:
import sys
# Run pip through the interpreter that backs this kernel (sys.executable) so the
# packages are installed for the same Python that executes the cells below.
!{sys.executable} -m pip install pandas nb_black pytorch-lightning torch tokenizers transformers

In [None]:
import sys
import inspect
import pathlib
from pathlib import Path
# The ABC aliases were removed from ``collections`` in Python 3.10;
# ``collections.abc`` is the supported location for ``Iterable``.
from collections.abc import Iterable

import numpy as np
import pandas as pd

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset

import tokenizers
from transformers import AutoModel, AutoTokenizer

import pytorch_lightning as pl

from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit

#%load_ext lab_black

# Raise pandas' display limits to 250 characters per cell and 250 columns per frame.
pd.set_option("display.max_colwidth", 250)
pd.set_option("display.max_columns", 250)

In [None]:
import shutil

from google.colab import drive

# Mount Google Drive at an absolute path and reuse that same path for the copy
# below, so the cell does not depend on the kernel's current working directory.
DRIVE_MOUNT = Path("/content/drive")
drive.mount(str(DRIVE_MOUNT))

DATA_DIR = Path("data")
RAW_DIR = DATA_DIR / "raw"
# parents=True creates DATA_DIR as well; exist_ok=True keeps the cell re-runnable.
RAW_DIR.mkdir(parents=True, exist_ok=True)

# shutil.copy raises if the source file is missing, whereas a failed `!cp`
# would not raise and the cell would appear to succeed.
shutil.copy(
    DRIVE_MOUNT / "MyDrive/comp_data/openhack/kanunum-nlp-doc-analysis-dataset.csv",
    RAW_DIR,
)

In [None]:

from inspect import signature

class KanunumClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text column (or a pair) per row.

    Each item is a dict of ``torch.long`` tensors taken from the tokenizer
    output (the keys listed in ``return_params``). When the dataset is built
    with ``is_inference=False`` an item is a ``(features, label)`` tuple where
    ``label`` is a one-hot ``torch.float32`` vector over ``MAPPINGS``.

    Args:
        df: DataFrame holding the text column(s) and, for training, the target
            column. It is not modified.
        text_field: Name of the main text column.
        text_pair: Optional name of a second text column, passed to the
            tokenizer as the second positional argument.
        target_field: Name of the category column (used when not in inference).
        pretrained_model_name_or_path: Passed to ``AutoTokenizer.from_pretrained``.
        encode_pair: Pair encoding is only enabled when this is truthy *and*
            ``text_pair`` is given.
        tokenizer_init_kwargs: Optional extra keyword arguments forwarded to
            ``AutoTokenizer.from_pretrained``.
        tokenizer_encode_kwargs: Optional keyword arguments for the tokenizer
            call. Keys that are not parameters of ``tokenizer.encode_plus`` are
            dropped; ``return_attention_mask`` and ``return_token_type_ids``
            are always forced to ``True``. The dict is not modified.
        return_params: Tokenizer output keys to return from ``__getitem__``.
            Defaults to ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        is_inference: When ``True`` (default) no labels are built.

    Raises:
        AssertionError: If a column name or ``tokenizer_encode_kwargs`` is invalid.
        ValueError: If the target column holds a value missing from ``MAPPINGS``.
    """

    # Normalised (lower-cased, stripped) category name -> one-hot column index.
    MAPPINGS = {'cumhurbaşkanlığı kararnamesi': 0,
                'genelge': 1,
                'kanun': 2,
                'kanun hükmünde kararname': 3,
                'komisyon raporu': 4,
                'resmi gazete': 5,
                'tebliğ': 6,
                'tüzük': 7,
                'yönetmelik': 8,
                'özelge': 9}

    def __init__(
        self,
        df,
        text_field="baslik",
        text_pair=None,
        target_field="kategori",
        pretrained_model_name_or_path="dbmdz/convbert-base-turkish-mc4-uncased",
        encode_pair=True,
        tokenizer_init_kwargs=None,
        tokenizer_encode_kwargs=None,
        return_params=None,
        is_inference=True,
    ):
        assert isinstance(text_field, str) and text_field in df.columns

        self.labels = None
        self.num_class = None

        if not is_inference:
            assert target_field in df.columns
            self.num_class = len(self.MAPPINGS)
            self.labels = self._one_hot_labels(df[target_field])

        if not tokenizer_encode_kwargs:
            tokenizer_encode_kwargs = {}
        assert isinstance(tokenizer_encode_kwargs, dict)

        # Build a new dict instead of ``.update`` so the caller's kwargs stay
        # untouched; the two flags below always win over user-supplied values.
        encode_kwargs = {
            **tokenizer_encode_kwargs,
            "return_attention_mask": True,
            "return_token_type_ids": True,
        }

        if not return_params:
            self.return_params = ["input_ids", "attention_mask", "token_type_ids"]
        elif isinstance(return_params, str):
            self.return_params = [return_params]
        else:
            self.return_params = list(return_params)

        self.is_pair_encoded = bool(encode_pair and text_pair)
        if self.is_pair_encoded:
            assert isinstance(text_pair, str) and text_pair in df.columns

        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, **(tokenizer_init_kwargs or {})
        )

        # Keep only the kwargs that ``encode_plus`` declares in its signature.
        accepted = signature(self.tokenizer.encode_plus).parameters
        self.tokenizer_encode_kwargs = {
            key: val for key, val in encode_kwargs.items() if key in accepted
        }

        # Positional 0..n-1 index: ``__getitem__`` looks rows up by ``idx``.
        columns = [text_field] + ([text_pair] if self.is_pair_encoded else [])
        self.df = df.loc[:, columns].reset_index(drop=True)

    def _one_hot_labels(self, targets):
        """Return a (rows x classes) one-hot DataFrame for a Series of category names."""
        # Normalise on a derived Series; the caller's column is left as-is.
        normalized = targets.str.lower().str.strip()
        codes = normalized.map(self.MAPPINGS)

        unknown = codes.isna()
        if unknown.any():
            raise ValueError(
                "Target column contains categories missing from MAPPINGS: "
                f"{normalized[unknown].unique().tolist()}"
            )

        # Index rows by position, not by the frame's index labels, so the
        # labels line up with the positionally re-indexed ``self.df``.
        one_hot = np.zeros(shape=(len(codes), len(self.MAPPINGS)))
        one_hot[np.arange(len(codes)), codes.to_numpy(dtype=np.int64)] = 1
        return pd.DataFrame(one_hot)

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx``; return ``features`` or ``(features, label)``."""
        # The row holds one or two strings, unpacked as (text[, text_pair]).
        encoded = self.tokenizer(
            *self.df.loc[idx, :].values, **self.tokenizer_encode_kwargs
        )
        features = {
            param: torch.tensor(encoded[param], dtype=torch.long)
            for param in self.return_params
        }

        if isinstance(self.labels, pd.DataFrame):
            label = torch.tensor(self.labels.loc[idx, :].values, dtype=torch.float32)
            return features, label

        return features