In [15]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

In [97]:
class StoryDataset(Dataset):
    def __init__(self, finals, backstories, tokenizer, max_len=128):
        self.final = finals
        self.backstory = backstories
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.final)

    def __getitem__(self, idx):
        final = self.final[idx]
        backstory = self.backstory[idx]

        # Токенизация
        inputs = self.tokenizer.encode_plus(
            f"predict_backstory: {final}",
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        targets = self.tokenizer.encode_plus(
            backstory,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': targets['input_ids'].flatten()
        }

In [99]:
def load_data(csv_file):
    df = pd.read_csv(csv_file, sep=';')
    finals = df['Final'].values
    backstories = df['Backstory'].values
    print('finals: ', finals)
    print("Образец данных:")
    for i, row in df.iterrows():
        print(f"Final: {row['Final']}")
        print(f"Backstory: {row['Backstory']}")
        if i > 5:
            break
    
    return finals, backstories

In [101]:
def train_model(csv_file, model_name='t5-small', epochs=3, batch_size=8, max_len=128):
    # Загрузка данных
    finals, backstories = load_data(csv_file)
    train_finals, val_finals, train_backstories, val_backstories = train_test_split(
        finals, backstories, test_size=0.1, random_state=42
    )

    # Инициализация токенизатора и модели
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    inputs = tokenizer(
        f"predict_backstory: {final}",
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )

    model = T5ForConditionalGeneration.from_pretrained(model_name)
    device = torch.device('cpu')
    model.to(device)

    # Создание датасетов и загрузчиков
    train_dataset = StoryDataset(train_finals, train_backstories, tokenizer, max_len)
    val_dataset = StoryDataset(val_finals, val_backstories, tokenizer, max_len)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Оптимизатор
    optimizer = AdamW(model.parameters(), lr=3e-4)

    # Цикл обучения
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            # inputs = {
            #     'input_ids': batch['input_ids'].to(device),
            #     'attention_mask': batch['attention_mask'].to(device),
            #     'labels': batch['labels'].to(device)
            # }
            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}")

        # Валидация
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                inputs = {
                    'input_ids': batch['input_ids'].to(device),
                    'attention_mask': batch['attention_mask'].to(device),
                    'labels': batch['labels'].to(device)
                }
                outputs = model(**inputs)
                val_loss += outputs.loss.item()
        print(f"Validation Loss: {val_loss / len(val_loader):.4f}")

    # Сохранение модели
    model.save_pretrained('story_generator_model')
    tokenizer.save_pretrained('story_generator_model')
    print("Модель успешно сохранена.")
    
    print("Проверка весов модели...")
    for name, param in model.named_parameters():
        print(name, param.shape)


In [103]:
# 4. Генерация текста
def generate_backstory(final, model_path='story_generator_model'):
    # Загрузка модели и токенизатора
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    model = T5ForConditionalGeneration.from_pretrained(model_path).to('cpu')

    # Токенизация входных данных
    inputs = tokenizer(
        f"predict_backstory: {final}",
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512
    ).to('cpu')

    # Генерация текста
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=256,
        num_beams=5,
        early_stopping=True,
        temperature=0.7,
        do_sample=True
    )

    # Декодирование текста
    decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Отладочные выводы
    print("Input IDs:", inputs['input_ids'])
    print("Attention Mask:", inputs['attention_mask'])
    print("Generated output IDs:", outputs)
    print("Decoded text:", decoded_text)

    return decoded_text

In [105]:
# Обучение модели
train_model('story.csv', epochs=25, batch_size=10)
# Генерация текста
final = "He returned home and smiled."
backstory = generate_backstory(final)
print("Сгенерированная основа истории:", backstory)

finals:  ['He returned home and smiled.' 'She dreamed of adventure.'
 'They celebrated their victory.' 'The sun set behind the mountains.'
 'She hugged him tightly.' 'The door slammed shut.'
 'He woke up in a cold sweat.' 'The crowd cheered loudly.'
 'The letter brought tears to her eyes.' 'The storm passed, leaving calm.']
Образец данных:
Final: He returned home and smiled.
Backstory: He went to the store. He bought milk. He met an old friend on the way.
Final: She dreamed of adventure.
Backstory: She read a book. She fell asleep.
Final: They celebrated their victory.
Backstory: They trained hard for months. They won the championship game.
Final: The sun set behind the mountains.
Backstory: They hiked all day. They reached the summit just in time.
Final: She hugged him tightly.
Backstory: He surprised her with flowers. She hadn’t seen him in years.
Final: The door slammed shut.
Backstory: She heard footsteps behind her. She ran as fast as she could.
Final: He woke up in a cold sweat.


ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds