In [96]:
from transformers import HerbertTokenizer, RobertaModel
import torch
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

In [27]:
# Report whether PyTorch can use a CUDA device in this environment.
torch.cuda.is_available()

True

In [28]:
# Load the pretrained tokenizer and encoder by checkpoint name.
# NOTE(review): on load, transformers warns that this tokenizer checkpoint is
# registered as 'XLMTokenizer' while it is being loaded through
# 'HerbertTokenizer', and that this "may result in unexpected tokenization".
# TODO confirm against the checkpoint's model card which class should be used.
tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMTokenizer'. 
The class this function is called from is 'HerbertTokenizer'.


In [69]:
# Read the dataset; fields in this file are separated by '$' rather than ','.
# Later cells use the 'user_comment' (text) and 'user_rate' (target) columns.
df = pd.read_csv('../dataset/dataset.csv', sep='$', encoding='utf-8')

# Tokenization

In [70]:
# Encode every comment as a fixed-length sequence of 256 token ids.
encoded_corpus = tokenizer(
    text=df['user_comment'].tolist(),
    # Length policy: pad short comments and truncate long ones to max_length.
    max_length=256,
    padding='max_length',
    truncation='longest_first',
    # Extras to include in the encoding.
    add_special_tokens=True,
    return_attention_mask=True,
)

In [71]:
# Pull the token ids and the matching attention masks out of the encoding.
input_ids, attention_mask = (
    encoded_corpus[field] for field in ('input_ids', 'attention_mask')
)

# Split data

In [88]:
# Features: the token ids as a 2-D array, one row per comment.
X = np.array(input_ids)
# Targets: the 'user_rate' column as a NumPy array, one value per comment.
y = df['user_rate'].to_numpy()

In [73]:
# Sanity check: X and y must agree on the number of rows.
print(X.shape, y.shape, sep='\n')

(4281, 256)
(4281,)


In [78]:
# random_state passed to every train_test_split call so the splits are reproducible.
seed = 32

In [89]:
# Hold out 20% of the rows, stratified by rating. Token ids, attention masks
# and labels go through ONE call so the three stay row-aligned by
# construction, rather than relying on two independent calls that happen to
# share the same seed and stratify argument.
X_train, X_test, train_mask, test_mask, y_train, y_test = train_test_split(
    X, attention_mask, y, test_size=0.2, stratify=y, random_state=seed
)
# Aliases kept so code that reads the split labels under these names keeps working.
train_mask_y, test_mask_y = y_train, y_test

In [86]:
# Number of rows that ended up in the training split.
len(train_mask)

3424

In [90]:
# Split the 20% hold-out in half into test and validation sets, stratified by
# rating. Ids, masks and labels are passed to a single call so they remain
# row-aligned.
# NOTE: this cell overwrites X_test / test_mask / y_test with their halves, so
# it is not idempotent: re-run the previous split cell before re-running it.
X_test, X_valid, test_mask, valid_mask, y_test, y_valid = train_test_split(
    X_test, test_mask, y_test, test_size=0.5, stratify=y_test, random_state=seed
)

In [94]:
# Display the test-split labels to eyeball the rating values.
y_test

array([ 4,  9,  5,  8,  8,  7,  9,  4,  8,  7,  3,  7,  6,  2,  8,  6,  7,
        7,  7,  7,  6,  7,  6,  7,  7,  9, 10,  9,  8,  2,  7,  7,  8,  7,
        8,  8,  7,  7,  6,  3,  8,  9,  5, 10,  8,  8, 10,  9,  7,  8,  1,
        7,  8,  7,  8,  8,  6,  5,  8,  6,  7,  5,  6,  7,  9,  8,  5,  7,
        7,  7,  8,  5,  8,  2,  9,  8,  7,  9,  9,  5,  8,  9,  3,  7,  7,
        8,  8,  5,  6,  7,  4,  8,  5,  6,  6,  8,  0,  9,  6,  8,  4,  5,
        5,  6,  1,  6, 10, 10,  6,  7,  6,  6,  2, 10,  5,  3,  3,  7,  7,
        0,  5,  8,  5,  7,  7,  6,  7,  8,  7, 10,  5,  6,  7,  7,  9,  9,
        8,  6,  8,  5,  6,  2,  7,  6,  7,  3,  9,  6,  6,  8,  8,  5,  4,
        6,  3,  6,  8,  4,  0,  6,  7,  7,  8,  5,  3,  5,  8,  8,  4,  9,
        7,  3,  7,  1, 10,  8,  7,  4,  8,  5,  7,  7,  4,  4,  7,  6,  2,
        7,  3,  3,  8,  4,  0,  7,  7,  8,  7,  6,  8,  4,  7,  8,  5,  7,
        5,  7,  6,  8,  6,  9,  8,  9,  8,  3,  8,  6,  7,  5,  6,  8,  8,
        7,  8,  5,  7,  9

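Consider rescaling the target values before training: the ratings shown above range from 0 to 10 and could, for example, be mapped to the [0, 1] interval.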

# Dataset preparation for PyTorch

In [97]:

def prepare_dataloader(inputs, masks, labels, batch_size, shuffle=True):
    """Wrap token ids, attention masks and labels in a batched ``DataLoader``.

    Args:
        inputs: Array-like of token ids, one row per example.
        masks: Array-like of attention masks, row-aligned with ``inputs``.
        labels: Array-like of targets, one entry per example.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the examples on every pass.
            Defaults to ``True`` (the previously hard-coded behaviour); pass
            ``False`` for validation/test loaders to get a fixed order.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` whose batches are
        ``(inputs, masks, labels)`` tensors.
    """
    # TensorDataset requires all tensors to share the same first dimension,
    # so mismatched row counts fail here rather than during training.
    dataset = TensorDataset(
        torch.tensor(inputs),
        torch.tensor(masks),
        torch.tensor(labels),
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Batch size shared by all three loaders.
BATCH_SIZE = 32

# One loader per split.
train_dataloader = prepare_dataloader(X_train, train_mask, y_train, BATCH_SIZE)
validation_dataloader = prepare_dataloader(X_valid, valid_mask, y_valid, BATCH_SIZE)
# The test split gets its own name; assigning it to `train_dataloader` would
# silently replace the training loader with test data.
test_dataloader = prepare_dataloader(X_test, test_mask, y_test, BATCH_SIZE)