## Import

In [44]:
import wget
import os
import torch
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForMaskedLM

print('Import is done.')

Import is done.


## Configuration

In [35]:
class Configurator:
    """Registry of the notebook's settings, addressed by dotted string keys.

    Calling the instance with a key returns the stored value; ``check`` asserts
    that a group of keys is present before a cell starts using them.
    """

    def __init__(self) -> None:
        self._params = {
            'url.dataset.train': 'https://github.com/RussianNLP/RuCoLA/blob/main/data/in_domain_train.csv?raw=true',
            'url.dataset.test': 'https://github.com/RussianNLP/RuCoLA/blob/main/data/in_domain_dev.csv?raw=true',
            
            'path.dataset.train': './train_dataset.csv',
            'path.dataset.test': './test_dataset.csv',
            
            'name.train': 'TRAIN',
            'name.test': 'TEST',
  
            'dataframe.train.names': ['id', 'sentence', 'acceptable', 'error_type', 'detailed_source'],
#             'dataframe.train.usecols': ['sentence', 'acceptable'],
            'dataframe.test.names': ['id', 'sentence', 'acceptable', 'error_type', 'detailed_source'],
#             'dataframe.test.usecols': ['sentence', 'acceptable'],
            
            'bert.train-size': 0.9,
            'bert.batch-size': 32
        }
    
    def __call__(self, *args, **kwargs):
        """Return the value stored under ``args[0]``.

        Returns ``None`` when called without positional arguments or when the
        key is not configured. Extra positional and keyword arguments are ignored.
        """
        if not args:
            return None
        return self._params.get(args[0])
    
    def check(self, *args):
        """Assert that every string in ``args`` is a configured key.

        Non-string arguments are skipped. Raises ``AssertionError`` listing the
        missing keys; they are sorted so the message is the same on every run
        (iterating a set of strings has no stable order).
        """
        missing = sorted({
            arg for arg in args
            if isinstance(arg, str) and arg not in self._params
        })
        assert not missing, 'Absence params: ' + ', '.join(missing)
        
        
conf = Configurator()

print('Configuration block is done.')

Configuration block is done.


## Conditions

In [11]:
class Conditions:
    """Tracks which stages of the notebook pipeline have been completed.

    A cell calls ``set`` when its stage is finished; later cells call ``check``
    to assert that the stages they rely on were executed.
    """

    # Marker strings; they appear verbatim in assertion messages, so the
    # misspelt values ('device_deviced', 'tikenizer-models-created') were corrected.
    DEVICE_DEFINED = 'device-defined'
    DATASET_DOWNLOADED = 'dataset-downloaded'
    DATASET_LOADED = 'dataset-loaded'
    DATASET_PREPARED = 'dataset-prepared'
    TOKENIZERS_MODELS_CREATED = 'tokenizers-models-created'
    # TODO: collect CONDITIONS automatically
    CONDITIONS = [
        DATASET_DOWNLOADED,
        DATASET_LOADED,
        DATASET_PREPARED,
        TOKENIZERS_MODELS_CREATED,
        DEVICE_DEFINED
    ]
    
    def __init__(self) -> None:
        # Maps a satisfied condition to True; unsatisfied conditions have no entry.
        self._conditions = {}
        
    def set(self, *conditions):
        """Mark each given condition as satisfied.

        Values that are not listed in ``CONDITIONS`` are ignored.
        """
        for condition in conditions:
            if condition in self.CONDITIONS:
                self._conditions[condition] = True
    
    def check(self, *conditions):
        """Assert that every given known condition has been set.

        Values that are not listed in ``CONDITIONS`` are ignored. Raises
        ``AssertionError`` naming the unsatisfied conditions, sorted so the
        message is the same on every run.
        """
        absence = sorted({
            condition for condition in conditions
            if condition in self.CONDITIONS and condition not in self._conditions
        })
        assert not absence, 'Absence conditions: ' + ', '.join(absence)
    
    
conds = Conditions()

print('Conditions block is done.')

Conditions block is done.


## Define device

In [33]:
# Select the compute device: CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')
    device = torch.device('cuda')
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device('cpu')

# Record the stage: the model-creation cell asserts Conditions.DEVICE_DEFINED,
# which would otherwise never be set and fail on a fresh Restart & Run All.
conds.set(Conditions.DEVICE_DEFINED)

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3080 Laptop GPU


## Models & tokenizers creation

In [12]:
# This stage may only run after the device-selection stage has been recorded.
conds.check(Conditions.DEVICE_DEFINED)

# The tokenizer and the masked-LM model are loaded from the same pretrained checkpoint.
checkpoint_name = "ai-forever/ruBert-base"
bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
bert_model = AutoModelForMaskedLM.from_pretrained(checkpoint_name)

conds.set(Conditions.TOKENIZERS_MODELS_CREATED)

print('Models & tokenizers creation is done.')

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Models & tokenizers creation is done.


## Downloading datasets on disk

In [13]:
# TODO: optimize checking
# Every setting read by the download step (URL, local path, display name for
# both splits) must be configured before anything is fetched.
required_keys = [
    template.format(split)
    for split in ('train', 'test')
    for template in ('url.dataset.{}', 'path.dataset.{}', 'name.{}')
]
conf.check(*required_keys);
conds.check(Conditions.TOKENIZERS_MODELS_CREATED)

def load_dataset(url: str, path: str, name: str):
    """Ensure the dataset file exists at ``path``, fetching it from ``url`` if absent.

    Args:
        url: Address the file is downloaded from when it is not on disk.
        path: Local file path that is checked and, if missing, written to.
        name: Label used in the printed status message.

    Prints one status line in either case and returns nothing.
    """
    already_present = os.path.exists(path)
    if not already_present:
        wget.download(url, path)
    if already_present:
        prefix, status = 'Dataset "', 'already downloaded'
    else:
        # The leading space presumably separates the message from wget's
        # progress output on the same line -- confirm.
        prefix, status = ' Dataset "', 'downloaded'
    print(prefix + name + '" is ' + status + '.')
        
# Fetch the train split first, then the test split, each with its own URL/path/label.
for split in ('train', 'test'):
    load_dataset(
        conf(f'url.dataset.{split}'),
        conf(f'path.dataset.{split}'),
        conf(f'name.{split}'),
    )
conds.set(Conditions.DATASET_DOWNLOADED)

Dataset "TRAIN" is already downloaded.
Dataset "TEST" is already downloaded.


## Loading datasets from disk

In [49]:
# Validate only the settings this cell actually reads. The 'dataframe.*.usecols'
# keys are commented out in the configuration, so requiring them here made the
# check fail unconditionally.
conf.check(
    'path.dataset.train',
    'path.dataset.test',
    'dataframe.train.names',
    'dataframe.test.names'
)
conds.check(Conditions.DATASET_DOWNLOADED)

# NOTE(review): `names` is passed without `header=0`, so a header line in the CSV
# (if present) is read as the first data row. The preparation cell drops row 0
# with `[1:]`; keep the two cells in sync if header handling is changed here.
train_dataframe = pd.read_csv(
    conf('path.dataset.train'),
    names=conf('dataframe.train.names')
)

test_dataframe = pd.read_csv(
    conf('path.dataset.test'),
    names=conf('dataframe.test.names')
)
conds.set(Conditions.DATASET_LOADED)

print('Datasets are loaded from disk.')

Datasets are loaded from disk.


## [bert] Datasets preparation, create dataloaders

In [50]:
conds.check(Conditions.DATASET_LOADED)

# Row 0 of every column is skipped; the remaining rows are kept as NumPy arrays.
after_first_row = slice(1, None)
train_sentences = train_dataframe['sentence'].values[after_first_row]
train_acceptables = train_dataframe['acceptable'].values[after_first_row]
test_sentences = test_dataframe['sentence'].values[after_first_row]
test_acceptables = test_dataframe['acceptable'].values[after_first_row]


def define_raw_max_length_by_bert(sentences, raw_max_length):
    """Return the longest tokenized length seen so far.

    Each sentence is encoded with ``bert_tokenizer`` (special tokens included)
    and the number of resulting ids is compared with ``raw_max_length``.

    Args:
        sentences: Iterable of sentences to encode.
        raw_max_length: Running maximum carried over from earlier calls.

    Returns:
        The larger of ``raw_max_length`` and the longest encoding in ``sentences``.
    """
    token_counts = (
        len(bert_tokenizer.encode(text, add_special_tokens=True))
        for text in sentences
    )
    return max(raw_max_length, max(token_counts, default=raw_max_length))


# Fold both splits into one running maximum, starting from zero.
raw_max_length = 0
for split_sentences in (train_sentences, test_sentences):
    raw_max_length = define_raw_max_length_by_bert(split_sentences, raw_max_length)


def define_max_length(raw_max_length, threshold):
    """Double ``threshold`` until it is at least ``raw_max_length`` and return it.

    With ``threshold=1`` this yields the smallest power of two that is not
    below ``raw_max_length``.

    Args:
        raw_max_length: Length that the result must cover.
        threshold: Starting value; returned unchanged if it already covers
            ``raw_max_length``.

    Returns:
        The first value in ``threshold, 2*threshold, 4*threshold, ...`` that is
        ``>= raw_max_length``.

    Raises:
        ValueError: If ``threshold`` is not positive and does not already cover
            ``raw_max_length``; doubling can then never reach the target (the
            recursive original ran until ``RecursionError``).
    """
    if threshold >= raw_max_length:
        return threshold
    if threshold <= 0:
        raise ValueError(
            f'threshold must be positive to reach {raw_max_length}, got {threshold}'
        )
    while threshold < raw_max_length:
        threshold *= 2
    return threshold


max_length = define_max_length(raw_max_length, 1)

def create_bert_dataset(sentences, acceptables, max_length):
    """Tokenize sentences with ``bert_tokenizer`` and bundle them with their labels.

    Args:
        sentences: Iterable of sentences to encode.
        acceptables: Labels aligned with ``sentences``. They may be integers or
            integer-like strings (the loaded column holds values such as ``'1'``)
            and are converted to ``int64``.
        max_length: Length every encoding is padded to; longer encodings are
            truncated so that all rows can be concatenated.

    Returns:
        A ``TensorDataset`` of ``(input_ids, attention_masks, labels)``.
    """
    # Imported here because the notebook's import cell does not provide TensorDataset.
    from torch.utils.data import TensorDataset

    input_ids = []
    attention_masks = []
    for sentence in sentences:
        encoded_dict = bert_tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    # The original stored this tensor under a different name (`attention_mask`)
    # and then passed the un-concatenated Python list to TensorDataset.
    attention_masks = torch.cat(attention_masks, dim=0)
    # torch.tensor() cannot build a tensor from an array of strings, so the
    # labels are converted to integers first.
    labels = torch.tensor(np.asarray(acceptables).astype(np.int64))
    
    return TensorDataset(input_ids, attention_masks, labels)

# Debug output: the first training label, its Python type, and the container type.
first_label = train_acceptables[0]
for probe in (first_label, type(first_label), type(train_acceptables)):
    print(probe)

# train_val_dataset = create_bert_dataset(train_sentences, train_acceptables, max_length)
# print(train_val_dataset)





#     print_header('create 90/10 train/validation split')
#     train_size = int(0.9 * len(dataset))
#     val_size = len(dataset) - train_size
#     print(f'training samples: {train_size}, validation samples: {val_size}')
#     train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

#     print_header('set batch size')
#     batch_size = 32
#     print(f'Batch size = {batch_size}')

#     print_header('Training & validation dataloaders creation')
#     train_dataloader = DataLoader(
#         train_dataset,
#         sampler=RandomSampler(train_dataset),
#         batch_size=batch_size
#     )
#     validation_dataloader = DataLoader(
#         val_dataset,
#         sampler=SequentialSampler(val_dataset),
#         batch_size=batch_size
#     )

#             'bert.train-size': 0.9,
#             'bert.batch-size': 32


# -----------------------------------------------

#     input_ids = []
#     attention_masks = []
#     for sentence in sentences:
#         encoded_dict = tokenizer.encode_plus(
#             sentence,
#             add_special_tokens=True,
#             max_length=64,
#             # pad_to_max_length=True,
#             padding='max_length',
#             return_attention_mask=True,
#             return_tensors='pt'
#         )
#         input_ids.append(encoded_dict['input_ids'])
#         attention_masks.append(encoded_dict['attention_mask'])

#     print_header('Convert the lists into tensors.')
#     input_ids = torch.cat(input_ids, dim=0)
#     attention_masks = torch.cat(attention_masks, dim=0)
#     labels = torch.tensor(labels)

#     print_header('Set the batch size.')
#     batch_size = 32
#     print(f'batch size: {batch_size}')

#     print_header('Create the DataLoader')
#     prediction_data = TensorDataset(input_ids, attention_masks, labels)
#     prediction_sampler = SequentialSampler(prediction_data)
#     prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)



# !!! conds.check(Conditions.DATASET_PREPARED)

1
<class 'str'>
<class 'numpy.ndarray'>
