<a href="https://colab.research.google.com/github/Gryzly73/Xsum/blob/main/summarization_project_Xsum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#   Проект: Генерация и анализ саммари новостных текстов (XSum + T5)

##   Цель проекта:
Построить пайплайн, который:
1. Загружает новостные тексты (XSum dataset)
2. Генерирует краткие саммари с помощью модели T5
3. Сравнивает оригинал и саммари
4. Извлекает ключевые факты
5. Классифицирует тексты по категориям
6. Выделяет теги (ключевые слова)

##  Установка зависимостей

In [1]:
!rm -rf ~/.cache/huggingface/datasets
!pip install "datasets<4.0.0" evaluate transformers rouge-score nltk keybert sentence-transformers
!apt install git-lfs

Collecting datasets<4.0.0
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->s

In [2]:
import datasets

print("✅ Версия datasets:", datasets.__version__)

✅ Версия datasets: 3.6.0


##   Авторизация в Hugging Face

In [3]:
# import os
from huggingface_hub import login, whoami
from google.colab import userdata

# Получаем токен из переменной окружения (секрета)
# hf_token = os.environ.get("HF_TOKEN")
hf_token = userdata.get('HF_TOKEN')

if not hf_token:
    raise ValueError("❌ Переменная окружения HF_TOKEN не установлена.")

# Авторизация
login(token=hf_token)

# Проверка
print(whoami())

{'type': 'user', 'id': '65d22b6adf205f2d8c932d65', 'name': 'gryzly', 'fullname': 'Maksim', 'isPro': False, 'avatarUrl': '/avatars/bc167fa6d467b979ee2c6aa6f046e229.svg', 'orgs': [{'type': 'org', 'id': '660ad2ae2c198b9518dff959', 'name': 'gryzly73', 'fullname': 'gryazly73', 'avatarUrl': 'https://www.gravatar.com/avatar/bf994f819e42517fa164cecb56904fc8?d=retro&size=100', 'isEnterprise': False}], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'For Samary Project', 'role': 'fineGrained', 'createdAt': '2025-08-05T23:22:01.393Z', 'fineGrained': {'canReadGatedRepos': False, 'global': [], 'scoped': [{'entity': {'_id': '65d22b6adf205f2d8c932d65', 'type': 'user', 'name': 'gryzly'}, 'permissions': []}]}}}}


Еще раз почистим кэш

In [4]:
import shutil
import os

cache_dir = "/root/.cache/huggingface"
if os.path.exists(cache_dir):
    shutil.rmtree(cache_dir)
    print("✅ Кэш HuggingFace удалён полностью.")
else:
    print("⚠️ Кэш HuggingFace не найден.")

✅ Кэш HuggingFace удалён полностью.


## Загружаем DataSet

In [5]:
from datasets import load_dataset
import requests
from io import BytesIO
from datasets import DatasetDict, Dataset
import pandas as pd

try:
    raw_datasets = load_dataset("EdinburghNLP/xsum")
    print(f"✅ XSum загружен успешно. Примеров в обучающей выборке: {len(raw_datasets['train'])}")

except Exception as e:
    print(f"Ошибка при загрузке XSum: {e}\nПробуем альтернативные методы...")

    try:
        # Альтернативный метод через Parquet
        raw_datasets = load_dataset(
            'parquet',
            data_files={
                'train': 'https://huggingface.co/datasets/EdinburghNLP/xsum/resolve/main/train.parquet',
                'validation': 'https://huggingface.co/datasets/EdinburghNLP/xsum/resolve/main/validation.parquet',
                'test': 'https://huggingface.co/datasets/EdinburghNLP/xsum/resolve/main/test.parquet'
            }
        )
        print(f"XSum загружен через Parquet. Примеров в тренировочном наборе: {len(raw_datasets['train'])}")

    except Exception as e:
        print(f"Ошибка при загрузке через Parquet: {e}\nИспользуем датасет CNN/DailyMail...")

        # Резервный вариант
        raw_datasets = load_dataset("cnn_dailymail", "3.0.0")
        raw_datasets = raw_datasets.rename_columns({
            'article': 'document',
            'highlights': 'summary'
        })
        print(f"CNN/DailyMail загружен. Примеров в тренировочном наборе: {len(raw_datasets['train'])}")

# Проверка данных
sample = raw_datasets['train'][0]
print("\nПример документа:")
print(sample['document'][:200] + "...")
print("\nПример саммари:")
print(sample['summary'])
print(len(raw_datasets["train"]))


README.md: 0.00B [00:00, ?B/s]

xsum.py: 0.00B [00:00, ?B/s]

0000.parquet:   0%|          | 0.00/304M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/17.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

✅ XSum загружен успешно. Примеров в обучающей выборке: 204045

Пример документа:
The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing w...

Пример саммари:
Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.
204045


## Загрузка предобученной модели и токенизатора (T5-small)

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Срежем датасет

In [7]:
print("Было:\n")
print(len(raw_datasets["train"]))
print(" документов")
raw_datasets["train"] = raw_datasets["train"].select(range(20000))
print("\nСтало:\n")
print(len(raw_datasets["train"]))
print(" документов")

Было:

204045
 документов

Стало:

20000
 документов


## Предобработка

In [8]:
max_input_length = 1024
max_target_length = 128
prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/11332 [00:00<?, ? examples/s]

Map:   0%|          | 0/11334 [00:00<?, ? examples/s]

## Обучение модели

In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import DatasetDict
import nltk
nltk.download('punkt')

# 1. Загрузка модели и токенизатора
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


 !! Запустить если ваш датасет не загружается !!  Создание тестовых данных

In [None]:

train_data = [{"document": "This is a sample document.", "summary": "Sample summary"}]*1000
val_data = [{"document": "Validation document.", "summary": "Val summary"}]*100

tokenized_datasets = DatasetDict({
    "train": Dataset.from_dict({"document": [d["document"] for d in train_data],
                              "summary": [d["summary"] for d in train_data]}),
    "validation": Dataset.from_dict({"document": [d["document"] for d in val_data],
                                   "summary": [d["summary"] for d in val_data]})
})


Токенизация данных

In [13]:
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["document"]]
    targets = examples["summary"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = tokenized_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/11332 [00:00<?, ? examples/s]

Map:   0%|          | 0/11334 [00:00<?, ? examples/s]

Аргументы обучения

In [14]:
args = Seq2SeqTrainingArguments(
    output_dir="t5-small-finetuned-xsum",
    # evaluation_strategy="epoch",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    report_to="none"
)

## Функция метрик

In [15]:
import evaluate
import numpy as np

rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Распаковываем и декодируем
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Преобразуем -100 в pad_token_id для корректного декодирования
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Убираем пробелы по краям
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # Рассчитываем Rouge
    result = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Умножим на 100 для читаемости
    result = {k: round(v * 100, 4) for k, v in result.items()}
    return result

Downloading builder script: 0.00B [00:00, ?B/s]

Создание Trainer

In [17]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


## Запуск обучения

In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.9494,2.673138,24.6439,5.3668,18.9357,18.9345


TrainOutput(global_step=1250, training_loss=3.008470458984375, metrics={'train_runtime': 826.1143, 'train_samples_per_second': 24.21, 'train_steps_per_second': 1.513, 'total_flos': 2706836029440000.0, 'train_loss': 3.008470458984375, 'epoch': 1.0})

Оценка модели после обучения

In [19]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 2.673137664794922, 'eval_rouge1': 24.6439, 'eval_rouge2': 5.3668, 'eval_rougeL': 18.9357, 'eval_rougeLsum': 18.9345, 'eval_runtime': 381.973, 'eval_samples_per_second': 29.667, 'eval_steps_per_second': 1.856, 'epoch': 1.0}


## Генерация саммари + сравнение

In [20]:
# ✅ Проверка CUDA
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"💻 Используется устройство: {device}")

print("\n📌 Примеры генерации после обучения:\n")
for i in range(3):
    input_ids = tokenizer.encode(
        prefix + raw_datasets["validation"][i]["document"],
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length
    ).to(device)

    output_ids = model.generate(input_ids, max_length=max_target_length)
    pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print(f"\n🔹 Пример {i + 1}")
    print("📄 Документ:", raw_datasets["validation"][i]["document"][:300], "...")
    print("✅ Оригинальное саммари:", raw_datasets["validation"][i]["summary"])
    print("📝 Сгенерированное саммари:", pred)

💻 Используется устройство: cuda

📌 Примеры генерации после обучения:


🔹 Пример 1
📄 Документ: The ex-Reading defender denied fraudulent trading charges relating to the Sodje Sports Foundation - a charity to raise money for Nigerian sport.
Mr Sodje, 37, is jointly charged with elder brothers Efe, 44, Bright, 50 and Stephen, 42.
Appearing at the Old Bailey earlier, all four denied the offence. ...
✅ Оригинальное саммари: Former Premier League footballer Sam Sodje has appeared in court alongside three brothers accused of charity fraud.
📝 Сгенерированное саммари: The former defender and his brother are accused of trading with the Sodje Sports Foundation.

🔹 Пример 2
📄 Документ: Voges was forced to retire hurt on 86 after suffering the injury while batting during the County Championship draw with Somerset on 4 June.
Middlesex hope to have the Australian back for their T20 Blast game against Hampshire at Lord's on 3 August.
The 37-year-old has scored 230 runs in four first-c ...
✅ Оригинальн

In [21]:
from IPython.display import display, HTML

def display_comparison(example):
    input_text = example["document"]
    summary_gt = example["summary"]
    input_ids = tokenizer(prefix + input_text, return_tensors="pt", truncation=True).input_ids
    generated_ids = model.generate(input_ids, max_length=max_target_length)
    generated_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    html = f'''
    <h4>📰 Оригинал</h4><p>{input_text}</p>
    <h4>✅ Эталонное саммари</h4><p>{summary_gt}</p>
    <h4>🤖 Сгенерированное саммари</h4><p>{generated_summary}</p>
    '''
    display(HTML(html))

## Извлечение ключевых фактов (через prompt к T5)

In [22]:
from transformers import pipeline
t5_pipe = pipeline("text2text-generation", model="t5-small", tokenizer="t5-small")

def extract_key_facts(text):
    prompt = f"extract facts: {text}"
    result = t5_pipe(prompt, max_length=80, do_sample=False)
    return result[0]["generated_text"]

Device set to use cuda:0


## Классификация и теги

In [23]:
from transformers import pipeline
from keybert import KeyBERT

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
kw_model = KeyBERT()

def classify_text(text):
    labels = ["Politics", "Health", "Technology", "Sports", "Business", "Entertainment", "Science", "World"]
    result = classifier(text, labels)
    return result["labels"][:3]

def extract_tags(text, top_n=5):
    return [kw[0] for kw in kw_model.extract_keywords(text, top_n=top_n)]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Полная демонстрация (пример)

In [24]:
example = raw_datasets["train"][3]
display_comparison(example)

print("🔍 Факты:")
print(extract_key_facts(example["document"]))

print("🏷 Категории:", classify_text(example["document"]))
print("🔑 Теги:", extract_tags(example["document"]))

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)