## Import

In [95]:
import wget
import os
import torch
import pandas as pd

from transformers import AutoTokenizer, AutoModelForMaskedLM

print('Import is done.')

Import is done.


## Models & tokenizers creation

In [None]:
# Load the tokenizer and the masked-language-model for the same
# "ai-forever/ruBert-base" checkpoint identifier. The captured cell output
# shows files being downloaded when this cell ran.
bert_tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruBert-base")
bert_model = AutoModelForMaskedLM.from_pretrained("ai-forever/ruBert-base")

Downloading (…)lve/main/config.json: 100%|████████████████████████████████████████████████████| 590/590 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)solve/main/vocab.txt: 100%|█████████████████████████████████████████| 1.78M/1.78M [00:02<00:00, 609kB/s]
Downloading pytorch_model.bin:   4%|██                                              | 31.5M/716M [00:44<15:43, 726kB/s]

## Configuration

In [87]:
class Configurator:
    """Registry of notebook settings addressed by dotted string keys.

    Calling the instance with a key returns the stored value (or ``None``
    when the key is unknown). ``check`` lets a cell fail fast when a setting
    it relies on has not been configured.
    """

    def __init__(self) -> None:
        self._params = {
            'url.dataset.train': 'https://github.com/RussianNLP/RuCoLA/blob/main/data/in_domain_train.csv?raw=true',
            'url.dataset.test': 'https://github.com/RussianNLP/RuCoLA/blob/main/data/in_domain_dev.csv?raw=true',

            'path.dataset.train': './train_dataset.csv',
            'path.dataset.test': './test_dataset.csv',

            'name.train': 'TRAIN',
            'name.test': 'TEST',

            'dataframe.train.names': ['id', 'sentence', 'acceptable', 'error_type', 'detailed_source'],
            'dataframe.train.usecols': ['sentence', 'acceptable'],
            'dataframe.test.names': ['id', 'sentence', 'acceptable', 'error_type', 'detailed_source'],
            'dataframe.test.usecols': ['sentence', 'acceptable'],
        }

    def __call__(self, *args, **kwargs):
        """Return the value stored under ``args[0]``.

        Only the first positional argument is used; any further positional
        or keyword arguments are accepted and ignored.

        Returns:
            The configured value, or ``None`` when no argument was given or
            the key is not configured.
        """
        if not args or args[0] not in self._params:
            return None
        return self._params[args[0]]

    def check(self, *args):
        """Verify that every string key in ``args`` is configured.

        Non-string arguments are skipped.

        Raises:
            AssertionError: if at least one key is missing. The message lists
                the missing keys once each, in the order they were requested.
        """
        # dict.fromkeys de-duplicates while keeping request order, so the
        # error message is deterministic (a set would shuffle it).
        missing = list(dict.fromkeys(
            arg for arg in args
            if isinstance(arg, str) and arg not in self._params
        ))
        if missing:
            # Raised explicitly instead of via an ``assert`` statement so the
            # validation still runs when Python is started with -O.
            raise AssertionError('Absence params: ' + ', '.join(missing))
        
        
# Notebook-wide configuration instance; settings are looked up with `conf('<key>')`.
conf = Configurator()

print('Configuration block is done.')

Configuration block is done.


## Conditions

In [88]:
class Conditions:
    """Tracks which named stages of the notebook have been completed.

    A cell calls ``set`` once it has finished a stage and ``check`` before
    starting work that depends on earlier stages.
    """

    DATASET_DOWNLOADED = 'dataset-downloaded'
    DATASET_LOADED = 'dataset-loaded'
    DATASET_PREPARED = 'dataset-prepared'
    # TODO: build CONDITIONS from the constants above instead of listing them by hand.
    CONDITIONS = [
        DATASET_DOWNLOADED,
        DATASET_LOADED,
        DATASET_PREPARED
    ]

    def __init__(self) -> None:
        # Maps a fulfilled condition name to True; unfulfilled ones are absent.
        self._conditions = {}

    def set(self, *conditions):
        """Mark each recognised condition as fulfilled.

        Values that are not listed in ``CONDITIONS`` are ignored.
        """
        for condition in conditions:
            if condition in self.CONDITIONS:
                self._conditions[condition] = True

    def check(self, *conditions):
        """Verify that every given condition has been fulfilled.

        A value that is not listed in ``CONDITIONS`` can never have been set,
        so it is reported as absent rather than silently accepted; this keeps
        a misspelled condition name from making the guard pass.

        Raises:
            AssertionError: if at least one condition is not fulfilled. The
                message lists each absent condition once, in call order.
        """
        absence = []
        for condition in conditions:
            # Membership in CONDITIONS is tested first so that unhashable
            # values never reach the dict lookup.
            fulfilled = condition in self.CONDITIONS and condition in self._conditions
            if not fulfilled and condition not in absence:
                absence.append(condition)
        if absence:
            # Raised explicitly instead of via an ``assert`` statement so the
            # validation still runs when Python is started with -O.
            raise AssertionError('Absence conditions: ' + ', '.join(map(str, absence)))
    
    
# Notebook-wide stage tracker; starts with no condition fulfilled.
conds = Conditions()

print('Conditions block is done.')

Conditions block is done.


## Downloading datasets on disk

In [89]:
# Fail fast if any setting read by the download step below is not configured.
conf.check(
    'url.dataset.train',
    'path.dataset.train',
    'name.train',
    'url.dataset.test',
    'path.dataset.test',
    'name.test'
);

def load_dataset(url: str, path: str, name: str):
    """Download a dataset file unless it is already present on disk.

    Args:
        url: Address handed to ``wget.download`` when the file is missing.
        path: Local path that is checked for existence and used as the
            download target.
        name: Label inserted into the printed status message.
    """
    if not os.path.exists(path):
        wget.download(url, path)
        # The message starts with a space, presumably to separate it from
        # the progress output wget leaves on the same line.
        downloaded_message = ' Dataset "' + name + '" is downloaded.'
        print(downloaded_message)
        return

    cached_message = 'Dataset "' + name + '" is already downloaded.'
    print(cached_message)
        
# Fetch both splits (files already on disk are left untouched), then record
# that the download stage is complete so later cells can check for it.
load_dataset(conf('url.dataset.train'), conf('path.dataset.train'), conf('name.train'))
load_dataset(conf('url.dataset.test'), conf('path.dataset.test'), conf('name.test'))
conds.set(Conditions.DATASET_DOWNLOADED)

Dataset "TRAIN" is already downloaded.
Dataset "TEST" is already downloaded.


## Loading datasets from disk

In [90]:
# Fail fast if a required setting is missing or the files were not downloaded yet.
conf.check(
    'path.dataset.train',
    'path.dataset.test',
    'dataframe.train.names',
    'dataframe.train.usecols',
    'dataframe.test.names',
    'dataframe.test.usecols'
)
conds.check(Conditions.DATASET_DOWNLOADED)


def _read_dataset_csv(path, names, usecols):
    """Read a dataset CSV into a DataFrame limited to ``usecols``.

    When ``names`` is passed without ``header``, pandas behaves as if
    ``header=None`` and would load a header line in the file as the first
    data record. To avoid that, the first record is inspected: if it equals
    ``names`` it is treated as a header and replaced (``header=0``);
    otherwise the file is read as header-less, exactly as before.

    Args:
        path: Location of the CSV file.
        names: Column names to assign to the file's columns.
        usecols: Subset of ``names`` to keep in the result.

    Returns:
        pandas.DataFrame with the ``usecols`` columns.
    """
    # Read just one record, as strings, so it can be compared with the names.
    first_row = pd.read_csv(path, header=None, nrows=1, dtype=str)
    has_header = (not first_row.empty) and first_row.iloc[0].tolist() == list(names)
    return pd.read_csv(
        path,
        names=names,
        usecols=usecols,
        header=0 if has_header else None
    )


train_dataframe = _read_dataset_csv(
    conf('path.dataset.train'),
    conf('dataframe.train.names'),
    conf('dataframe.train.usecols')
)

test_dataframe = _read_dataset_csv(
    conf('path.dataset.test'),
    conf('dataframe.test.names'),
    conf('dataframe.test.usecols')
)
conds.set(Conditions.DATASET_LOADED)

print('Datasets are loaded from disk.')

Datasets are loaded from disk.


## Datasets preparation

In [91]:
conds.check(Conditions.DATASET_LOADED)

# !!! conds.check(Conditions.DATASET_PREPARED)