In [94]:
!pip install --upgrade torchvision



In [96]:
import re
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split
from rouge import Rouge
from tqdm import tqdm
import sentencepiece

In [98]:
def preprocess_text(text):
    # Удаление технических логов и шума
    text = re.sub(r'Heap \d+ bytes reserved', '', text)
    text = re.sub(r'Timestamp : \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', '', text)
    text = re.sub(r'<newline>', '\n', text)  # Замена тегов на переносы строк
    text = re.sub(r'\s+', ' ', text)         # Удаление лишних пробелов
    return text.strip()


In [102]:
def split_into_stories(raw_text):
    # Разделение на истории по разделителю "..."
    stories = [preprocess_text(part) for part in raw_text.split("...") if part.strip()]
    return [story for story in stories if len(story.split()) > 50]  # Фильтрация слишком коротких текстов

In [104]:
def extract_final_and_backstory(text):
    # Разделение на финал (последние 2 предложения) и предысторию
    sentences = [s.strip() for s in re.split(r'[.!?]', text) if s.strip()]
    if len(sentences) < 3:
        return None, None
    final = '. '.join(sentences[-2:])
    backstory = '. '.join(sentences[:-2])
    return final, backstory

In [106]:
class WritingPromptsDataset(Dataset):
    def __init__(self, stories, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data = []
        
        # Извлечение пар "финал-предыстория"
        for story in stories:
            final, backstory = extract_final_and_backstory(story)
            if final and backstory:
                self.data.append((final, backstory))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        final, backstory = self.data[idx]
        
        # Токенизация
        inputs = self.tokenizer.encode_plus(
            f"generate_prehistory: {final}",
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        targets = self.tokenizer.encode_plus(
            backstory,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': targets['input_ids'].flatten()
        }

In [114]:
def train_model(csv_file, model_name='t5-small', epochs=1, batch_size=32, max_len=256):
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    device = torch.device('cpu')  # Вместо 'cuda'
    model.to(device)
    
    with open(csv_file, 'r') as f:
        raw_text = f.read()
    stories = split_into_stories(raw_text)[:1000]  # Используем только 1000 примеров
    
    dataset = WritingPromptsDataset(stories, tokenizer, max_len=max_len)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    optimizer = AdamW(model.parameters(), lr=3e-4)
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
                'labels': batch['labels'].to(device)
            }
            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        
        print(f"Loss: {total_loss/len(train_loader):.4f}")
    
    try:
        print("Проверка весов модели...")
        for name, param in model.named_parameters():
            print(name, param.shape)
        
        model.save_pretrained('wp_reverse_model')
        tokenizer.save_pretrained('wp_reverse_model')
        print("Модель успешно сохранена.")
    except Exception as e:
        print(f"Ошибка при сохранении модели: {e}")

In [110]:
def generate_prehistory(final_part, model_path='wp_reverse_model'):
    print("Загрузка модели...")
    try:
        tokenizer = T5Tokenizer.from_pretrained(model_path)
        model = T5ForConditionalGeneration.from_pretrained(model_path).cpu()
        print("Модель загружена.")
    except Exception as e:
        print(f"Ошибка при загрузке модели: {e}")
        return None

    if not final_part.strip():
        raise ValueError("Входное предложение пустое!")

    print("Токенизация входных данных...")
    inputs = tokenizer(
        f"generate_prehistory: {final_part}",
        return_tensors='pt'
    ).to('cpu')

    try:
        print("Генерация текста...")
        outputs = model.generate(
            **inputs,
            max_length=150,  # Уменьшено для ускорения
            num_beams=3,     # Уменьшено для ускорения
            early_stopping=True
        )
        print("Генерация завершена.")
    except Exception as e:
        print(f"Ошибка при генерации: {e}")
        return None

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [112]:
train_model('valid.txt', epochs=1)
    
# Генерация предыстории
final = "Clancy slits Rob's throat and returns to HQ."
print("Сгенерированная предыстория:", generate_prehistory(final))

Epoch 1: 100%|██████████████████████████████████| 32/32 [12:20<00:00, 23.15s/it]


Loss: 4.3886
Проверка весов модели...
shared.weight torch.Size([32128, 512])
encoder.block.0.layer.0.SelfAttention.q.weight torch.Size([512, 512])
encoder.block.0.layer.0.SelfAttention.k.weight torch.Size([512, 512])
encoder.block.0.layer.0.SelfAttention.v.weight torch.Size([512, 512])
encoder.block.0.layer.0.SelfAttention.o.weight torch.Size([512, 512])
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight torch.Size([32, 8])
encoder.block.0.layer.0.layer_norm.weight torch.Size([512])
encoder.block.0.layer.1.DenseReluDense.wi.weight torch.Size([2048, 512])
encoder.block.0.layer.1.DenseReluDense.wo.weight torch.Size([512, 2048])
encoder.block.0.layer.1.layer_norm.weight torch.Size([512])
encoder.block.1.layer.0.SelfAttention.q.weight torch.Size([512, 512])
encoder.block.1.layer.0.SelfAttention.k.weight torch.Size([512, 512])
encoder.block.1.layer.0.SelfAttention.v.weight torch.Size([512, 512])
encoder.block.1.layer.0.SelfAttention.o.weight torch.Size([512, 512])
encoder.

NameError: name 'model' is not defined

In [10]:
import torch
print(torch.cuda.is_available())

False


In [76]:
# Пример использования
final = "Clancy slits Rob's throat and returns to HQ."
print("Сгенерированная предыстория:", generate_prehistory(final))
a = generate_prehistory(final)
with open('text1.txt', 'w') as file:
    file.write(a)
    

Load...
Load finaly
Сгенерированная предыстория: 
Load...
Load finaly


In [80]:
from transformers.utils import logging
logging.set_verbosity_warning()
print('1')

1


NameError: name 'df' is not defined