## Import

In [1]:
import random
import torch
import os
import time
import datetime

import numpy as np
import pandas as pd
from IPython.display import HTML

from torch.optim import AdamW
from torch.utils.data import Dataset as TorchDataset
from torch.utils.data import (
    TensorDataset,
    DataLoader,
    RandomSampler,
    SequentialSampler
)
from transformers import (
    get_linear_schedule_with_warmup,
    T5Tokenizer,
    T5ForConditionalGeneration
)

print('Import is done.')

  from .autonotebook import tqdm as notebook_tqdm


Import is done.


## Config

In [2]:
RANDOM_SEED = 42

TRAIN_KEY = 'train'
TEST_KEY = 'test'
VAL_KEY = 'val'

DATA_DIRECTORY = '.\\data'
DANETQA_INPUT = os.path.join(DATA_DIRECTORY, 'danetqa_paths.json')

PREPARED_BERT_PATH = 'ai-forever/ruBert-base'
PREPARED_T5_BASE_PATH = 'ai-forever/ruT5-base'

LR = 2e-5
EPS = 1e-8
TRAIN_FRAC = 0.9
VAL_FRAC = 0.9
# BATCH_SIZE = 16
BATCH_SIZE = 8
EPOCHS = 1

SENTENCE_COLS = ['question', 'passage']
USE_RAW_MAX_LENGTH = True

print('Constants are initialized.')

Constants are initialized.


## Set random seed

In [3]:
random.seed(RANDOM_SEED) # check needness
np.random.seed(RANDOM_SEED) # check needness
torch.manual_seed(RANDOM_SEED) # check needness
torch.cuda.manual_seed(RANDOM_SEED) # check needness

print('Random seed is set.')

Random seed is set.


## Define device

In [4]:
if torch.cuda.is_available():
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')
    device = torch.device('cuda')
else:
    print('No GPU available, using the GPU instead.')
    device = torch.device('cpu')
    
print('Device is defined.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3080 Laptop GPU
Device is defined.


## Model & tokenizer & optimizer creation

In [5]:
tokenizer = T5Tokenizer.from_pretrained(PREPARED_T5_BASE_PATH, use_fast=False)
model = T5ForConditionalGeneration.from_pretrained(PREPARED_T5_BASE_PATH)
optimizer = AdamW(model.parameters(), lr=LR, eps=EPS)


print('Model, tokenizer, optimizer are created.')

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


Model, tokenizer, optimizer are created.


## Loading & preparation of dataframes

In [6]:
paths_df = pd.read_json(DANETQA_INPUT, lines=True)

train_file_path = paths_df.get(TRAIN_KEY).values[0]
test_file_path = paths_df.get(TEST_KEY).values[0]
val_file_path = paths_df.get(VAL_KEY).values[0]

original_train_df = pd.read_json(train_file_path, lines=True)
original_val_df = pd.read_json(val_file_path, lines=True)
original_test_df = pd.read_json(test_file_path, lines=True)

output_df = pd.DataFrame(
    [
        [TRAIN_KEY, train_file_path, len(original_train_df)],
        [TEST_KEY, test_file_path, len(original_val_df)],
        [VAL_KEY, val_file_path, len(original_test_df)]
    ],
    columns=['Key', 'Path', 'Size']
)
print('Original datasets:')
display(HTML(output_df.to_html(index=False)))

def enrich_df(df) -> None:
    df['sentence'] = df[SENTENCE_COLS].apply(lambda row: ''.join(row).lower(), axis=1)
    df['l'] = df[['label']].apply(lambda label: 1 if label[0] else 0, axis=1)

random_index = original_train_df.sample(frac=TRAIN_FRAC, random_state=RANDOM_SEED).index
test_df0 = original_train_df[~original_train_df.index.isin(random_index)]
train_df = original_train_df[original_train_df.index.isin(random_index)].reset_index(drop=True)

random_index = original_val_df.sample(frac=VAL_FRAC, random_state=RANDOM_SEED).index
test_df1 = original_val_df[~original_val_df.index.isin(random_index)]
val_df = original_val_df[original_val_df.index.isin(random_index)].reset_index(drop=True)

test_df = pd.concat([test_df0, test_df1], ignore_index=True)

enrich_df(train_df)
enrich_df(val_df)
enrich_df(test_df)

output_df = pd.DataFrame(
    [
        [TRAIN_KEY, len(train_df)],
        [TEST_KEY, len(val_df)],
        [VAL_KEY, len(test_df)]
    ],
    columns=['Key', 'Size']
)
print('Used datasets:')
display(HTML(output_df.to_html(index=False)))

print('Loading & preparation of dataframes are done.')

Original datasets:


Key,Path,Size
train,.\data\DaNetQA\train.jsonl,1749
test,.\data\DaNetQA\test.jsonl,821
val,.\data\DaNetQA\val.jsonl,805


Used datasets:


Key,Size
train,1574
test,739
val,257


Loading & preparation of dataframes are done.


## Define max_length

In [7]:
def define_raw_max_length_by_bert(sentences, raw_max_length):
    for sentence in sentences:
        input_ids = tokenizer.encode(sentence, add_special_tokens=True)
        raw_max_length = max(raw_max_length, len(input_ids))    
    return raw_max_length

def define_max_length(raw_max_length, threshold):
    return threshold if threshold >= raw_max_length else define_max_length(raw_max_length, threshold * 2)

raw_max_length = define_raw_max_length_by_bert(train_df.sentence.values, 0)
raw_max_length = define_raw_max_length_by_bert(val_df.sentence.values, raw_max_length)
raw_max_length = define_raw_max_length_by_bert(test_df.sentence.values, raw_max_length)

binary_based_max_length = define_max_length(raw_max_length, 1)
max_length = raw_max_length if USE_RAW_MAX_LENGTH else binary_based_max_length

output_df = pd.DataFrame(
    [
        ['Raw', raw_max_length],
        ['Binary-based', binary_based_max_length],
        ['Used', max_length]
    ],
    columns=['Type', 'Size']
)
print('Definition of max_length:')
display(HTML(output_df.to_html(index=False)))

print('Maximum length of sentences is defined.')

Definition of max_length:


Type,Size
Raw,838
Binary-based,1024
Used,838


Maximum length of sentences is defined.


## Datasets & dataloaders creation

In [8]:
class T5TestDataset(TorchDataset):
    def __init__(self, text, tokenizer, length, device):
        self._text = text.reset_index(drop=True)
        self._tokenizer = tokenizer
        self._length = length
        self._device = device

    def __len__(self):
        return self._text.shape[0]

    def __getitem__(self, item):
        output = self._tokenize(self._text[item])
        return {k: v.reshape(-1).to(self._device) for k, v in output.items()}

    def _tokenize(self, text):
        return self._tokenizer(text,
                               return_tensors='pt',
                               padding='max_length',
                               truncation=True,
                               max_length=self._length)


class T5TrainDataset(TorchDataset):
    POS_LABEL = 'верно'
    NEG_LABEL = 'неверно'

    def __init__(self, text, label, tokenizer, length, device):
        self._text = text.reset_index(drop=True)
        self._label = label.reset_index(drop=True)
        self._tokenizer = tokenizer
        self._length = length
        self._device = device

    def __len__(self):
        return self._label.shape[0]

    def __getitem__(self, item):
        output = self._tokenize(self._text[item], self._length)
        output = {k: v.reshape(-1).to(self._device) for k, v in output.items()}

        label = self.POS_LABEL if self._label[item] == 1 else self.NEG_LABEL
        label = self._tokenize(label, length=2).input_ids.reshape(-1).to(self._device)

        output.update({'labels': label})
        return output

    def _tokenize(self, text, length):
        return self._tokenizer(text,
                               return_tensors='pt',
                               padding='max_length',
                               truncation=True,
                               max_length=length)

    
train_dataset = T5TrainDataset(train_df['sentence'], train_df['l'], tokenizer, max_length, device)
val_dataset = T5TrainDataset(val_df['sentence'], val_df['l'], tokenizer, max_length, device)
test_dataset = T5TestDataset(test_df['sentence'], tokenizer, max_length, device)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

print('Datasets & dataloaders creation is done')

Datasets & dataloaders creation is done


## Scheduler creation

In [9]:
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * EPOCHS
)

print('Scheduler is created')

Scheduler is created


## Training

In [None]:
model.to(device)
# n_epochs = conf('train.epochs.t5')

dl_length = len(train_dataloader)

# <
print(f'dataloader len: {dl_length}')

for epoch in range(EPOCHS):
    print(f'\n======= Epoch {epoch + 1} / {EPOCHS} =======\n')

    print('Training...\n')
    model.train()
    
    for batch_id, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        # < del
        print(f'{batch_id} {datetime.datetime.now()}')
        if (((batch_id + 1) % 10 == 0) and not batch_id == 0) or (batch_id == dl_length - 1):
            print(f'\t\tBatch {batch_id+1} of {dl_length}, loss : {loss.item():.3f}')
        
    print('Validation...')
    model.eval()
    
    with torch.no_grad():
        eval_loss = [model(**batch).loss.item() for batch in eval_dataloader]
        
    print(f'\tValidation loss: {np.sum(eval_loss)/len(eval_loss)}')
    
print('Trainig complete!')

dataloader len: 197


Training...

0 2023-08-20 17:14:04.336524
1 2023-08-20 17:14:37.974944
2 2023-08-20 17:15:10.658004
3 2023-08-20 17:15:43.804539
4 2023-08-20 17:16:18.112430
5 2023-08-20 17:16:50.570007
6 2023-08-20 17:17:23.995431
7 2023-08-20 17:17:56.435516
8 2023-08-20 17:18:29.899453
9 2023-08-20 17:19:02.346536
		Batch 10 of 197, loss : 7.695
10 2023-08-20 17:19:35.819505
11 2023-08-20 17:20:08.259295
12 2023-08-20 17:20:41.686737
13 2023-08-20 17:21:14.102651
14 2023-08-20 17:21:47.547972
15 2023-08-20 17:22:20.013483
16 2023-08-20 17:22:53.441986
17 2023-08-20 17:23:25.874492
18 2023-08-20 17:23:59.306712
19 2023-08-20 17:24:31.740938
		Batch 20 of 197, loss : 2.888
20 2023-08-20 17:25:05.303006
21 2023-08-20 17:25:37.799561
22 2023-08-20 17:26:11.187247
23 2023-08-20 17:26:43.683505
24 2023-08-20 17:27:17.126828
25 2023-08-20 17:27:49.575983
26 2023-08-20 17:28:23.003659
27 2023-08-20 17:28:55.484950
28 2023-08-20 17:29:29.128305
29 2023-08-20 17:30:01.584497
		Batch 30 

## Testing

In [42]:

## test

# test_acceptable = test_dataframe.acceptable
# print(f'Positive samples: {test_acceptable.sum()} of {len(test_acceptable)} ({100.0*test_acceptable.sum()/len(test_acceptable):.2f}%)')

## Summary

In [43]:
###

## Saving

In [None]:
###