## Import

In [17]:
import random
import torch
import os
import time
import datetime
import shutil

import numpy as np
import pandas as pd
from IPython.display import HTML

from torch.optim import AdamW
from torch.utils.data import Dataset as TorchDataset
from torch.utils.data import (
    TensorDataset,
    DataLoader,
    RandomSampler,
    SequentialSampler
)
from transformers import (
    get_linear_schedule_with_warmup,
    T5Tokenizer,
    T5ForConditionalGeneration
)
from sklearn.metrics import (
    f1_score,
    matthews_corrcoef
)

print('Import is done.')

Import is done.


## Config

In [32]:
RANDOM_SEED = 42

TRAIN_KEY = 'train'
TEST_KEY = 'test'
VAL_KEY = 'val'

DATA_DIRECTORY = '.\\data'
DANETQA_INPUT = os.path.join(DATA_DIRECTORY, 'danetqa_paths.json')

SAVED_MODEL_PATH = os.path.join(DATA_DIRECTORY, 'saved')

PREPARED_BERT_PATH = 'ai-forever/ruBert-base'
PREPARED_T5_BASE_PATH = 'ai-forever/ruT5-base'

LR = 2e-5
EPS = 1e-8
TRAIN_FRAC = 0.9
VAL_FRAC = 0.9
BATCH_SIZE = 8
EPOCHS = 1

SENTENCE_COLS = ['question', 'passage']

USE_RAW_MAX_LENGTH = True
USE_BY_THRESHOLD = False
TOKEN_SIZE_THRESHOLD = 300

TESTING_FROM_FILES = True

print('Constants are initialized.')

Constants are initialized.


## Set random seed

In [25]:
random.seed(RANDOM_SEED) # check needness
np.random.seed(RANDOM_SEED) # check needness
torch.manual_seed(RANDOM_SEED) # check needness
torch.cuda.manual_seed(RANDOM_SEED) # check needness

print('Random seed is set.')

Random seed is set.


## Define device

In [26]:
if torch.cuda.is_available():
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')
    device = torch.device('cuda')
else:
    print('No GPU available, using the GPU instead.')
    device = torch.device('cpu')
    
print('Device is defined.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3080 Laptop GPU
Device is defined.


## Model & tokenizer & optimizer creation

In [27]:
tokenizer = T5Tokenizer.from_pretrained(PREPARED_T5_BASE_PATH, use_fast=False)
model = T5ForConditionalGeneration.from_pretrained(PREPARED_T5_BASE_PATH)
optimizer = AdamW(model.parameters(), lr=LR, eps=EPS)


print('Model, tokenizer, optimizer are created.')

Model, tokenizer, optimizer are created.


## Loading & preparation of dataframes

In [28]:
paths_df = pd.read_json(DANETQA_INPUT, lines=True)

train_file_path = paths_df.get(TRAIN_KEY).values[0]
test_file_path = paths_df.get(TEST_KEY).values[0]
val_file_path = paths_df.get(VAL_KEY).values[0]

original_train_df = pd.read_json(train_file_path, lines=True)
original_val_df = pd.read_json(val_file_path, lines=True)
original_test_df = pd.read_json(test_file_path, lines=True)

output_df = pd.DataFrame(
    [
        [TRAIN_KEY, train_file_path, len(original_train_df)],
        [TEST_KEY, test_file_path, len(original_val_df)],
        [VAL_KEY, val_file_path, len(original_test_df)]
    ],
    columns=['Key', 'Path', 'Size']
)
print('Original datasets:')
display(HTML(output_df.to_html(index=False)))

def enrich_df(df) -> None:
    df['sentence'] = df[SENTENCE_COLS].apply(lambda row: ''.join(row).lower(), axis=1)
    df['l'] = df[['label']].apply(lambda label: 1 if label[0] else 0, axis=1)

random_index = original_train_df.sample(frac=TRAIN_FRAC, random_state=RANDOM_SEED).index
test_df0 = original_train_df[~original_train_df.index.isin(random_index)]
train_df = original_train_df[original_train_df.index.isin(random_index)].reset_index(drop=True)

random_index = original_val_df.sample(frac=VAL_FRAC, random_state=RANDOM_SEED).index
test_df1 = original_val_df[~original_val_df.index.isin(random_index)]
val_df = original_val_df[original_val_df.index.isin(random_index)].reset_index(drop=True)

test_df = pd.concat([test_df0, test_df1], ignore_index=True)

enrich_df(train_df)
enrich_df(val_df)
enrich_df(test_df)

output_df = pd.DataFrame(
    [
        [TRAIN_KEY, len(train_df)],
        [TEST_KEY, len(val_df)],
        [VAL_KEY, len(test_df)]
    ],
    columns=['Key', 'Size']
)
print('Used datasets:')
display(HTML(output_df.to_html(index=False)))

print('Loading & preparation of dataframes are done.')

Original datasets:


Key,Path,Size
train,.\data\DaNetQA\train.jsonl,1749
test,.\data\DaNetQA\test.jsonl,821
val,.\data\DaNetQA\val.jsonl,805


Used datasets:


Key,Size
train,1574
test,739
val,257


Loading & preparation of dataframes are done.


## Define max_length

In [29]:
def define_raw_max_length_by_bert(sentences, raw_max_length):
    for sentence in sentences:
        input_ids = tokenizer.encode(sentence, add_special_tokens=True)
        raw_max_length = max(raw_max_length, len(input_ids))    
    return raw_max_length

def define_max_length(raw_max_length, threshold):
    return threshold if threshold >= raw_max_length else define_max_length(raw_max_length, threshold * 2)

raw_max_length = define_raw_max_length_by_bert(train_df.sentence.values, 0)
raw_max_length = define_raw_max_length_by_bert(val_df.sentence.values, raw_max_length)
raw_max_length = define_raw_max_length_by_bert(test_df.sentence.values, raw_max_length)

binary_based_max_length = define_max_length(raw_max_length, 1)
if USE_BY_THRESHOLD:
    max_length = TOKEN_SIZE_THRESHOLD
else:
    max_length = raw_max_length if USE_RAW_MAX_LENGTH else binary_based_max_length

output_df = pd.DataFrame(
    [
        ['Raw', raw_max_length],
        ['Binary-based', binary_based_max_length],
        ['By-threshold', TOKEN_SIZE_THRESHOLD],
        ['Used', max_length]
    ],
    columns=['Type', 'Size']
)
print('Definition of max_length:')
display(HTML(output_df.to_html(index=False)))

print('Maximum length of sentences is defined.')

Definition of max_length:


Type,Size
Raw,838
Binary-based,1024
By-threshold,300
Used,838


Maximum length of sentences is defined.


## Datasets & dataloaders creation

In [30]:
class T5TestDataset(TorchDataset):
    def __init__(self, text, tokenizer, length, device):
        self._text = text.reset_index(drop=True)
        self._tokenizer = tokenizer
        self._length = length
        self._device = device

    def __len__(self):
        return self._text.shape[0]

    def __getitem__(self, item):
        output = self._tokenize(self._text[item])
        return {k: v.reshape(-1).to(self._device) for k, v in output.items()}

    def _tokenize(self, text):
        return self._tokenizer(text,
                               return_tensors='pt',
                               padding='max_length',
                               truncation=True,
                               max_length=self._length)


class T5TrainDataset(TorchDataset):
    POS_LABEL = 'верно'
    NEG_LABEL = 'неверно'

    def __init__(self, text, label, tokenizer, length, device):
        self._text = text.reset_index(drop=True)
        self._label = label.reset_index(drop=True)
        self._tokenizer = tokenizer
        self._length = length
        self._device = device

    def __len__(self):
        return self._label.shape[0]

    def __getitem__(self, item):
        output = self._tokenize(self._text[item], self._length)
        output = {k: v.reshape(-1).to(self._device) for k, v in output.items()}

        label = self.POS_LABEL if self._label[item] == 1 else self.NEG_LABEL
        label = self._tokenize(label, length=2).input_ids.reshape(-1).to(self._device)

        output.update({'labels': label})
        return output

    def _tokenize(self, text, length):
        return self._tokenizer(text,
                               return_tensors='pt',
                               padding='max_length',
                               truncation=True,
                               max_length=length)

    
train_dataset = T5TrainDataset(train_df['sentence'], train_df['l'], tokenizer, max_length, device)
val_dataset = T5TrainDataset(val_df['sentence'], val_df['l'], tokenizer, max_length, device)
test_dataset = T5TestDataset(test_df['sentence'], tokenizer, max_length, device)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

print('Datasets & dataloaders creation is done')

Datasets & dataloaders creation is done


## Scheduler creation

In [31]:
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * EPOCHS
)

print('Scheduler is created')

Scheduler is created


## Training

In [10]:
model.to(device)
# n_epochs = conf('train.epochs.t5')

dl_length = len(train_dataloader)

# <
print(f'dataloader len: {dl_length}')

for epoch in range(EPOCHS):
    print(f'\n======= Epoch {epoch + 1} / {EPOCHS} =======\n')

    print('Training...\n')
    model.train()
    
    for batch_id, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        # < del
        print(f'{batch_id} {datetime.datetime.now()}')
        if (((batch_id + 1) % 10 == 0) and not batch_id == 0) or (batch_id == dl_length - 1):
            print(f'\t\tBatch {batch_id+1} of {dl_length}, loss : {loss.item():.3f}')
        
    print('Validation...')
    model.eval()
    
    with torch.no_grad():
        eval_loss = [model(**batch).loss.item() for batch in val_dataloader]
        
    print(f'\tValidation loss: {np.sum(eval_loss)/len(eval_loss)}')
    
print('Trainig complete!')

dataloader len: 197


Training...

0 2023-08-21 19:05:46.600186
1 2023-08-21 19:06:20.228428
2 2023-08-21 19:06:52.904405
3 2023-08-21 19:07:26.064761
4 2023-08-21 19:08:00.359646
5 2023-08-21 19:08:32.811278
6 2023-08-21 19:09:06.240639
7 2023-08-21 19:09:38.657371
8 2023-08-21 19:10:12.074097
9 2023-08-21 19:10:44.506522
		Batch 10 of 197, loss : 7.695
10 2023-08-21 19:11:17.909089
11 2023-08-21 19:11:50.313613
12 2023-08-21 19:12:23.717609
13 2023-08-21 19:12:56.143276
14 2023-08-21 19:13:29.539047
15 2023-08-21 19:14:01.962787
16 2023-08-21 19:14:35.340379
17 2023-08-21 19:15:07.766043
18 2023-08-21 19:15:41.178362
19 2023-08-21 19:16:13.618604
		Batch 20 of 197, loss : 2.888
20 2023-08-21 19:16:47.024691
21 2023-08-21 19:17:19.440448
22 2023-08-21 19:17:52.833131
23 2023-08-21 19:18:25.258452
24 2023-08-21 19:18:58.659251
25 2023-08-21 19:19:31.072009
26 2023-08-21 19:20:04.475222
27 2023-08-21 19:20:36.896101
28 2023-08-21 19:21:10.287093
29 2023-08-21 19:21:42.690193
		Batch 30 

## Testing

In [35]:
def test(model, tokenizer, dataloader):
    pos_label = tokenizer(T5TrainDataset.POS_LABEL,
                          return_tensors='pt',
                          padding='max_length',
                          truncation=True,
                          max_length=2)['input_ids'][0][0].item()
    model.eval()

    result = np.array([])
    # <
    l = len(dataloader)
    # <
    for batch_id, batch in enumerate(dataloader):
        # <
        print(f'{batch_id} {datetime.datetime.now()}')
        # <
        tokens = model.generate(**batch)
        tokens = [1 if pos_label in token else 0 for token in tokens]
        result = np.hstack([result, tokens])
    
    return result

if TESTING_FROM_FILES:
    tokenizer_from_disk = T5Tokenizer.from_pretrained(SAVED_MODEL_PATH, use_fast=False)
    model_from_disk = T5ForConditionalGeneration.from_pretrained(SAVED_MODEL_PATH)
    model_from_disk.to(device)
    result = test(model_from_disk, tokenizer_from_disk, test_dataloader)
else:
    result = test(model, tokenizer, test_dataloader)

f1 = f1_score(result, test_df['l'])

print(f'T5 testing is done, F1-score: {f1:.3f}')

0 2023-08-22 21:55:46.496699




1 2023-08-22 21:55:47.620293
2 2023-08-22 21:55:47.962932
3 2023-08-22 21:55:48.303195
4 2023-08-22 21:55:48.644253
5 2023-08-22 21:55:48.985396
6 2023-08-22 21:55:49.327234
7 2023-08-22 21:55:49.671743
8 2023-08-22 21:55:50.019898
9 2023-08-22 21:55:50.362977
10 2023-08-22 21:55:50.706875
11 2023-08-22 21:55:51.052055
12 2023-08-22 21:55:51.394132
13 2023-08-22 21:55:51.735009
14 2023-08-22 21:55:52.080592
15 2023-08-22 21:55:52.429671
16 2023-08-22 21:55:52.775660
17 2023-08-22 21:55:53.119738
18 2023-08-22 21:55:53.462799
19 2023-08-22 21:55:53.805876
20 2023-08-22 21:55:54.152954
21 2023-08-22 21:55:54.500035
22 2023-08-22 21:55:54.844032
23 2023-08-22 21:55:55.188110
24 2023-08-22 21:55:55.534187
25 2023-08-22 21:55:55.879266
26 2023-08-22 21:55:56.226312
27 2023-08-22 21:55:56.571396
28 2023-08-22 21:55:56.915487
29 2023-08-22 21:55:57.261575
30 2023-08-22 21:55:57.604571
31 2023-08-22 21:55:57.951648
32 2023-08-22 21:55:58.287828
T5 testing is done, F1-score: 0.652


## Saving

In [16]:

if os.path.isdir(SAVED_MODEL_PATH):
    for file_name in os.listdir(SAVED_MODEL_PATH):
        file_path = os.path.join(SAVED_MODEL_PATH, file_name)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(e)

tokenizer.save_pretrained(SAVED_MODEL_PATH)
tokenizer.save_vocabulary(SAVED_MODEL_PATH)
model.save_pretrained(SAVED_MODEL_PATH)

print('Saving is done.')

Saving is done.


## End