## Import

In [1]:
import random
import torch
import os

import numpy as np
import pandas as pd
from IPython.display import HTML

from torch.optim import AdamW
from transformers import (
    BertTokenizer,
    BertForSequenceClassification
)

print('Import is done.')

  from .autonotebook import tqdm as notebook_tqdm


Import is done.


## Config

In [13]:
RANDOM_SEED = 42

TRAIN_KEY = 'train'
TEST_KEY = 'test'
VAL_KEY = 'val'

DATA_DIRECTORY = '.\\data'
DANETQA_INPUT = os.path.join(DATA_DIRECTORY, 'danetqa_paths.json')

PREPARED_BERT_PATH = 'ai-forever/ruBert-base'
LR = 2e-5
EPS = 1e-8
TRAIN_FRAC = 0.9
VAL_FRAC = 0.9

SENTENCE_COLS = ['passage', 'question']

print('Constants are initialized.')

Constants are initialized.


## Set random seed

In [3]:
random.seed(RANDOM_SEED) # check needness
np.random.seed(RANDOM_SEED) # check needness
torch.manual_seed(RANDOM_SEED) # check needness
torch.cuda.manual_seed(RANDOM_SEED) # check needness

print('Random seed is set.')

Random seed is set.


## Define device

In [4]:
if torch.cuda.is_available():
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')
    device = torch.device('cuda')
else:
    print('No GPU available, using the GPU instead.')
    device = torch.device('cpu')
    
print('Device is defined.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3080 Laptop GPU
Device is defined.


## Model & tokenizer & optimizer creation

In [5]:
tokenizer = BertTokenizer.from_pretrained(PREPARED_BERT_PATH)
model = BertForSequenceClassification.from_pretrained(
    PREPARED_BERT_PATH,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

optimizer = AdamW(model.parameters(), lr=LR, eps=EPS)

print('Model, tokenizer, optimizer are created.')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model, tokenizer, optimizer are created.


## Loading & preparation of dataframes

In [16]:
paths_df = pd.read_json(DANETQA_INPUT, lines=True)

train_file_path = paths_df.get(TRAIN_KEY).values[0]
test_file_path = paths_df.get(TEST_KEY).values[0]
val_file_path = paths_df.get(VAL_KEY).values[0]

original_train_df = pd.read_json(train_file_path, lines=True)
original_val_df = pd.read_json(val_file_path, lines=True)
original_test_df = pd.read_json(test_file_path, lines=True)

output_df = pd.DataFrame(
    [
        [TRAIN_KEY, train_file_path, len(original_train_df)],
        [TEST_KEY, test_file_path, len(original_val_df)],
        [VAL_KEY, val_file_path, len(original_test_df)]
    ],
    columns=['Key', 'Path', 'Size']
)
print('Original datasets:')
display(HTML(output_df.to_html(index=False)))

random_index = original_train_df.sample(frac=TRAIN_FRAC, random_state=RANDOM_SEED).index
test_df0 = original_train_df[~original_train_df.index.isin(random_index)]
train_df = original_train_df[original_train_df.index.isin(random_index)].reset_index(drop=True)
train_df['sentence'] = train_df[SENTENCE_COLS].apply(lambda row: ''.join(row).lower(), axis=1)

random_index = original_val_df.sample(frac=VAL_FRAC, random_state=RANDOM_SEED).index
test_df1 = original_val_df[~original_val_df.index.isin(random_index)]
val_df = original_val_df[original_val_df.index.isin(random_index)].reset_index(drop=True)
val_df['sentence'] = val_df[SENTENCE_COLS].apply(lambda row: ''.join(row).lower(), axis=1)

test_df = pd.concat([test_df0, test_df1], ignore_index=True)
test_df['sentence'] = test_df[SENTENCE_COLS].apply(lambda row: ''.join(row).lower(), axis=1)

output_df = pd.DataFrame(
    [
        [TRAIN_KEY, len(train_df)],
        [TEST_KEY, len(val_df)],
        [VAL_KEY, len(test_df)]
    ],
    columns=['Key', 'Size']
)
print('Used datasets:')
display(HTML(output_df.to_html(index=False)))



Original datasets:


Key,Path,Size
train,.\data\DaNetQA\train.jsonl,1749
test,.\data\DaNetQA\test.jsonl,821
val,.\data\DaNetQA\val.jsonl,805


Used datasets:


Key,Size
train,1574
test,739
val,257
