## Import

In [11]:
import random
import torch
import os

import numpy as np
import pandas as pd
from IPython.display import HTML

from torch.optim import AdamW
from transformers import (
    BertTokenizer,
    BertForSequenceClassification
)

print('Import is done.')

Import is done.


## Config

In [6]:
# INPUT_PATHS = 'paths.json'
RANDOM_SEED = 42

TRAIN_KEY = 'train.jsonl'
TEST_KEY = 'test.jsonl'
VAL_KEY = 'val.jsonl'

DATA_DIRECTORY = '.\\data'
# RUSSE_INPUT = os.path.join(DATA_DIRECTORY, 'russe_paths.json') #<
DANETQA_INPUT = os.path.join(DATA_DIRECTORY, 'danetqa_paths.json')

PREPARED_BERT_PATH = 'ai-forever/ruBert-base'
LR = 2e-5
EPS = 1e-8

print('Constants are initialized.')

Constants are initialized.


## Set random seed

In [3]:
random.seed(RANDOM_SEED) # check needness
np.random.seed(RANDOM_SEED) # check needness
torch.manual_seed(RANDOM_SEED) # check needness
torch.cuda.manual_seed(RANDOM_SEED) # check needness

print('Random seed is set.')

Random seed is set.


## Define device

In [4]:
if torch.cuda.is_available():
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')
    device = torch.device('cuda')
else:
    print('No GPU available, using the GPU instead.')
    device = torch.device('cpu')
    
print('Device is defined.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3080 Laptop GPU
Device is defined.


## Model & tokenizer & optimizer creation

In [9]:
tokenizer = BertTokenizer.from_pretrained(PREPARED_BERT_PATH)
model = BertForSequenceClassification.from_pretrained(
    PREPARED_BERT_PATH,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

optimizer = AdamW(model.parameters(), lr=LR, eps=EPS)

print('Model, tokenizer, optimizer are created.')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model, tokenizer, optimizer are created.


## Loading original dataframes from disk

In [21]:
paths_df = pd.read_json(DANETQA_INPUT, lines=True)
# paths_df = pd.read_json('./data/danetqa_paths.json', lines=True)

train_file_path = paths_df.get(TRAIN_KEY).values[0]
test_file_path = paths_df.get(TEST_KEY).values[0]
val_file_path = paths_df.get(VAL_KEY).values[0]

original_train_df = pd.read_json(train_file_path, lines=True)
original_val_df = pd.read_json(val_file_path, lines=True)
original_test_df = pd.read_json(test_file_path, lines=True)

output_df = pd.DataFrame(
    [
        [TRAIN_KEY, train_file_path, len(original_train_df)],
        [TEST_KEY, test_file_path, len(original_val_df)],
        [VAL_KEY, val_file_path, len(original_test_df)]
    ],
    columns=['Key', 'Path', 'size']
)
print('Datasets:')
HTML(output_df.to_html(index=False))

# original_train_df = pd.read_json(train_file_path, lines=True)
# original_val_df = pd.read_json(val_file_path, lines=True)
# original_test_df = pd.read_json(val_test_path, lines=True)

# output_df = pd.DataFrame(
#     [[TRAIN_KEY, len(original_train_df)], [TEST_KEY, test_file_path], [VAL_KEY, val_file_path]],
#     columns=['Key', 'Path']
# )
# print('Received next paths:')
# HTML(output_df.to_html(index=False))


# print(type(original_train_df))
# print(len(original_train_df))
# print(original_train_df.shape)

# df = pd.DataFrame(data, columns=['Type', 'F1'])
# HTML(df.to_html(index=False))

# train_eval_dataframe = pd.read_csv(conf('dataset.path.train'), usecols=conf('dataset.usecols'))
# test_dataframe = pd.read_csv(conf('dataset.path.test'), usecols=conf('dataset.usecols'))

# random_index = train_eval_dataframe.sample(frac=conf('train.size'), random_state=conf('random.seed')).index
# eval_dataframe = train_eval_dataframe[~train_eval_dataframe.index.isin(random_index)]
# train_dataframe = train_eval_dataframe[train_eval_dataframe.index.isin(random_index)]

# print('Dataframes are loaded.')
# print(f'Train dataframe size: {len(train_dataframe)}')
# print(f'Evaluate dataframe size: {len(eval_dataframe)}')
# print(f'Test dataframe size: {len(test_dataframe)}')

Datasets:


Key,Path,size
train.jsonl,.\data\DaNetQA\train.jsonl,1749
test.jsonl,.\data\DaNetQA\test.jsonl,821
val.jsonl,.\data\DaNetQA\val.jsonl,805
