In [6]:
from transformers import AutoTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader

import json

In [32]:
class InstructionDataset(Dataset):
    """Map-style dataset that renders instruction records into prompt strings.

    Every record of ``ds`` is formatted once, in ``__init__``, into a single
    string built from its ``'instruction'``, optional ``'input'`` and
    ``'output'`` fields, followed by the tokenizer's EOS token. Tokenization
    happens lazily in ``__getitem__``.

    Args:
        tokenizer: Callable tokenizer exposing ``eos_token`` (and optionally
            ``pad_token``); it is called with ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keyword arguments.
        ds: Iterable of mappings with the keys ``'instruction'``, ``'input'``
            and ``'output'``.
        max_length: Length every sample is padded / truncated to.

    Side effects:
        If the tokenizer has no pad token but does have an EOS token, the
        tokenizer's ``pad_token`` is set to its ``eos_token``; otherwise
        padding to ``max_length`` could not work at all.
    """

    def __init__(self, tokenizer, ds, max_length: int=1024):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ds = ds

        # __getitem__ pads to max_length, which needs a pad token. GPT-style
        # tokenizers frequently ship without one, so fall back to EOS instead
        # of failing on the first sample. The padded positions are still
        # marked 0 in the returned attention_mask.
        if getattr(tokenizer, 'pad_token', None) is None and tokenizer.eos_token is not None:
            tokenizer.pad_token = tokenizer.eos_token

        # A missing EOS token must not be rendered as the literal text "None".
        eos_token = tokenizer.eos_token or ""
        self.formatted_data = [self._format_record(item, eos_token) for item in ds]

    @staticmethod
    def _format_record(item, eos_token: str) -> str:
        """Render one record into the prompt string used for training."""
        # NOTE(review): the sections are concatenated without any newline or
        # space separator, and "Ответ:" has no "### " prefix unlike the other
        # two headers. Kept byte-for-byte because inference-time prompts
        # elsewhere may rely on this exact template — confirm before changing.
        text = f"### Инструкция: {item['instruction']}"
        extra_input = item['input']
        # Truthiness check skips both "" and None (len(None) would raise).
        if extra_input:
            text += f"### Запрос: {extra_input}"
        text += f"Ответ: {item['output']}{eos_token}"
        return text

    def __len__(self):
        """Return the number of formatted samples."""
        return len(self.formatted_data)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its ids and attention mask.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'``, each with the
            leading batch axis produced by ``return_tensors='pt'`` removed.
        """
        encoding = self.tokenizer(
            self.formatted_data[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # drop the sequence axis whenever max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

In [33]:
# Fixed seed so the train/test partition is the same on every
# Restart Kernel -> Run All; without it the held-out set changes per run.
SPLIT_SEED = 42

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruGPT-3.5-13B")

# Alternative dataset id left by the author: "ai-bond/ru-alpaca-grandpro"
raw_ds = load_dataset('freQuensy23/ru-alpaca-cleaned')

# Hold out 15% of the 'train' split; `ds` then exposes 'train' and 'test'.
ds = raw_ds['train'].train_test_split(test_size=0.15, seed=SPLIT_SEED)

In [34]:
# Shared settings for both splits, named once instead of repeated inline.
MAX_LENGTH = 256   # tokens per sample after padding / truncation
BATCH_SIZE = 2
NUM_WORKERS = 4

train_dataset = InstructionDataset(
    tokenizer=tokenizer,
    ds=ds['train'],
    max_length=MAX_LENGTH,
)

test_dataset = InstructionDataset(
    tokenizer=tokenizer,
    ds=ds['test'],
    max_length=MAX_LENGTH,
)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)

# Evaluation data is iterated in a fixed order so that per-batch results are
# comparable between runs; shuffling the held-out split adds nothing.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)