In [1]:
import os
from tqdm import tqdm
import json
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader

In [None]:
# Load the fast tokenizer implementation for the ruGPT checkpoint.
# NOTE(review): the path has no leading "/", so it is a relative path —
# confirm this is intended and not a typo for an absolute location.
tokenizer = AutoTokenizer.from_pretrained("home/user/models/rugpt", use_fast=True)

In [None]:
# Download the dataset and carve a small held-out slice (0.05% of the rows)
# off its 'train' split, using a fixed seed so the split is reproducible.
# NOTE(review): trust_remote_code=True lets the dataset repository run its own
# loading code — confirm the source is trusted.
ds = load_dataset(
    'freQuensy23/ru-alpaca-cleaned',
    num_proc=8,
    trust_remote_code=True,
)['train'].train_test_split(test_size=0.0005, seed=2357, shuffle=True)

# The held-out part comes back under the 'test' key; expose it as 'val'.
ds['val'] = ds.pop('test')

In [2]:
class InstructionDataset(Dataset):
    """Map-style dataset that renders instruction records as prompt strings.

    Every record of ``ds`` is formatted into one string at construction time
    (instruction, optional input, output, then the tokenizer's EOS token).
    Tokenization happens lazily in ``__getitem__``.

    Args:
        tokenizer: Callable tokenizer exposing ``eos_token``. It is invoked
            with ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` keyword arguments.
        ds: Iterable of mappings with ``'instruction'``, ``'input'`` and
            ``'output'`` keys.
        max_length: Length every sample is padded or truncated to.

    Raises:
        ValueError: If ``tokenizer.eos_token`` is ``None``.
    """

    def __init__(self, tokenizer, ds, max_length: int=1024):
        eos_token = tokenizer.eos_token
        if eos_token is None:
            # Interpolating None into the f-string would append the literal
            # text "None" to every sample instead of an end-of-sequence marker.
            raise ValueError(
                "tokenizer.eos_token is None; set an EOS token before "
                "building InstructionDataset"
            )

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ds = ds
        self.formatted_data = [
            self._format_example(item, eos_token) for item in self.ds
        ]

    @staticmethod
    def _format_example(item, eos_token: str) -> str:
        """Render one record as a single training string ending in EOS."""
        # The sections are concatenated with no separator between them.
        # NOTE(review): confirm this matches the prompt built at inference time.
        parts = [f"### Инструкция: {item['instruction']}"]
        # Truthiness instead of len() so a None input is skipped like an
        # empty string rather than raising TypeError.
        if item['input']:
            parts.append(f"### Запрос: {item['input']}")
        parts.append(f"Ответ: {item['output']}{eos_token}")
        return "".join(parts)

    def __len__(self):
        """Return the number of formatted samples."""
        return len(self.formatted_data)

    def __getitem__(self, idx):
        """Tokenize sample ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` entries taken
            from the tokenizer output, each with the leading batch axis
            removed.
        """
        # NOTE(review): with truncation=True, over-long samples presumably
        # lose their trailing EOS token — confirm this is acceptable.
        encoding = self.tokenizer(
            self.formatted_data[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # squeeze(0) removes only the leading batch axis; a bare squeeze()
        # would also drop the sequence axis when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

In [7]:
# Build the training and validation datasets with identical settings
# (same tokenizer, samples padded/truncated to 512 tokens); only the
# split differs. Construction order is train first, then val.
train_dataset, val_dataset = (
    InstructionDataset(tokenizer=tokenizer, ds=ds[split_name], max_length=512)
    for split_name in ('train', 'val')
)

In [None]:
# Training batches: shuffle=True asks the loader to draw samples in a
# shuffled order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=8
)

# Validation batches are read in a fixed order: shuffling adds nothing to
# evaluation and makes per-batch results differ between runs.
val_dataloader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=8
)