# Summarization

In [1]:
import os

# 设置代理
os.environ['http_proxy'] = 'http://127.0.0.1:7893'
os.environ['https_proxy'] = 'http://127.0.0.1:7893'
os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7893'
os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7893'
os.environ['no_proxy'] = '127.0.0.1,localhost'
os.environ['NO_PROXY'] = '127.0.0.1,localhost'

# 验证
print(os.environ.get('http_proxy'))

http://127.0.0.1:7893


In [2]:
from datasets import DatasetDict, load_dataset
from datasets import Value

EXPECTED_FEATURES = [
    'review_id',
    'product_id',
    'reviewer_id',
    'stars',
    'review_body',
    'review_title',
    'language',
    'product_category',
]

def _format_reviews_split(split_ds, language):
    def _map_to_expected(example, idx):
        review_body = example.get('review_body') or example.get('text') or ''
        review_body = review_body or ''
        review_title = example.get('review_title')
        if not review_title:
            headline = example.get('summary') or example.get('title') or ''
            review_title = headline.strip() or review_body.split('\n')[0].strip()
        if not review_title:
            review_title = review_body[:80]
        stars = example.get('stars')
        if stars is None and example.get('label') is not None:
            stars = int(example['label']) + 1
        product_category = example.get('product_category')
        if not product_category:
            product_category = 'book' if idx % 2 == 0 else 'digital_ebook_purchase'
        review_id = example.get('review_id') or example.get('id') or f'{language}_review_{idx}'
        product_id = example.get('product_id') or '{}_product_{}'.format(language, example.get('label', '0'))
        reviewer_id = example.get('reviewer_id') or example.get('reviewerID') or f'{language}_reviewer_{idx}'
        return {
            'review_id': str(review_id),
            'product_id': str(product_id),
            'reviewer_id': str(reviewer_id),
            'stars': int(stars) if stars is not None else 3,
            'review_body': review_body,
            'review_title': review_title,
            'language': language,
            'product_category': str(product_category),
        }
    formatted = split_ds.map(_map_to_expected, with_indices=True)
    formatted = formatted.select_columns(EXPECTED_FEATURES)
    formatted = formatted.cast_column('stars', Value('int32'))
    return formatted

def load_reviews(language):
    try:
        dataset = load_dataset('amazon_reviews_multi', language)
        print(f'Loaded amazon_reviews_multi:{language}')
    except Exception as err:
        print("Primary dataset unavailable for '{lang}' -> falling back".format(lang=language))
        print(f'Fallback reason: {err}')
        fallback_id = 'mteb/amazon_reviews_multi'
        dataset = load_dataset(fallback_id, data_dir=language, revision='refs/convert/parquet')
    dataset = DatasetDict({split: _format_reviews_split(ds, language) for split, ds in dataset.items()})
    return dataset

spanish_dataset = load_reviews('es')
english_dataset = load_reviews('en')
english_dataset


Primary dataset unavailable for 'es' -> falling back
Fallback reason: Dataset scripts are no longer supported, but found amazon_reviews_multi.py
Primary dataset unavailable for 'en' -> falling back
Fallback reason: Dataset scripts are no longer supported, but found amazon_reviews_multi.py


DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
})

In [3]:
def show_samples(dataset, num_samples=3, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n'>> Title: {example['review_title']}'")
        print(f"'>> Review: {example['review_body']}'")


show_samples(english_dataset)


'>> Title: Worked in front position, not rear'
'>> Review: Worked in front position, not rear

3 stars because these are not rear brakes as stated in the item description. At least the mount adapter only worked on the front fork of the bike that I got it for.'

'>> Title: meh'
'>> Review: meh

Does it’s job and it’s gorgeous but mine is falling apart, I had to basically put it together again with hot glue'

'>> Title: Can't beat these for the money'
'>> Review: Can't beat these for the money

Bought this for handling miscellaneous aircraft parts and hanger "stuff" that I needed to organize; it really fit the bill. The unit arrived quickly, was well packaged and arrived intact (always a good sign). There are five wall mounts-- three on the top and two on the bottom. I wanted to mount it on the wall, so all I had to do was to remove the top two layers of plastic drawers, as well as the bottom corner drawers, place it when I wanted and mark it; I then used some of the new plastic screw i

In [4]:
english_dataset.set_format("pandas")
english_df = english_dataset["train"][:]
# Show counts for top 20 products
english_df["product_category"].value_counts()[:20]

product_category
book                      100000
digital_ebook_purchase    100000
Name: count, dtype: int64

In [5]:
def filter_books(example):
    return (
        example["product_category"] == "book"
        or example["product_category"] == "digital_ebook_purchase"
    )

In [6]:
english_dataset.reset_format()

In [7]:
spanish_books = spanish_dataset.filter(filter_books)
english_books = english_dataset.filter(filter_books)
show_samples(english_books)


'>> Title: Worked in front position, not rear'
'>> Review: Worked in front position, not rear

3 stars because these are not rear brakes as stated in the item description. At least the mount adapter only worked on the front fork of the bike that I got it for.'

'>> Title: meh'
'>> Review: meh

Does it’s job and it’s gorgeous but mine is falling apart, I had to basically put it together again with hot glue'

'>> Title: Can't beat these for the money'
'>> Review: Can't beat these for the money

Bought this for handling miscellaneous aircraft parts and hanger "stuff" that I needed to organize; it really fit the bill. The unit arrived quickly, was well packaged and arrived intact (always a good sign). There are five wall mounts-- three on the top and two on the bottom. I wanted to mount it on the wall, so all I had to do was to remove the top two layers of plastic drawers, as well as the bottom corner drawers, place it when I wanted and mark it; I then used some of the new plastic screw i

In [8]:
from datasets import concatenate_datasets, DatasetDict

books_dataset = DatasetDict()

for split in english_books.keys():
    books_dataset[split] = concatenate_datasets(
        [english_books[split], spanish_books[split]]
    )
    books_dataset[split] = books_dataset[split].shuffle(seed=42)

# Peek at a few examples
show_samples(books_dataset)


'>> Title: Envio nefasto'
'>> Review: Envio nefasto

Me parece fatal que lo metan en el buzon siendo mas grande y que rompan algo que es nuevo.'

'>> Title: poco resistente'
'>> Review: poco resistente

es para lo que se paga un producto optimo pero no ajusta bien en las dimensiones del telefono y se rompe con facilidad, pero bueno protege de rallones y pequeños golpes.'

'>> Title: Un archivador, sin más.'
'>> Review: Un archivador, sin más.

A mi sobrino le gustó el diseño. El envío ha cumplido las expectativas, en tiempo y condiciones adecuadas, es un archivador, sin más historias, el producto es bueno.'


In [9]:
len(books_dataset["train"])

400000

In [10]:
books_dataset = books_dataset.filter(lambda x: len(x["review_title"].split()) > 2)

In [11]:
len(books_dataset["train"])

228256

In [1]:
from transformers import AutoTokenizer

model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


ImportError: 
 requires the protobuf library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [13]:
inputs = tokenizer("I loved reading the Hunger Games!")
inputs

{'input_ids': [336, 259, 28387, 11807, 287, 62893, 295, 12507, 309, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
tokenizer.convert_ids_to_tokens(inputs.input_ids)

['▁I', '▁', 'loved', '▁reading', '▁the', '▁Hung', 'er', '▁Games', '!', '</s>']

In [15]:
max_input_length = 512
max_target_length = 30


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["review_body"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["review_title"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
tokenized_datasets = books_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/5765 [00:00<?, ? examples/s]

In [17]:
generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"

In [18]:
!pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0m

In [19]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0m

In [20]:
import evaluate

rouge_score = evaluate.load("rouge")

In [21]:
scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary]
)
scores

{'rouge1': np.float64(0.923076923076923),
 'rouge2': np.float64(0.7272727272727272),
 'rougeL': np.float64(0.923076923076923),
 'rougeLsum': np.float64(0.923076923076923)}

In [22]:
!pip install nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0m

In [25]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [26]:
from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])


print(three_sentence_summary(books_dataset["train"][1]["review_body"]))

This is very easy use for me

Great price and the product works as advertised.
Very easy to focus and I could the screen very clearly.
Would recommend


In [27]:
def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["review_body"]]
    return metric.compute(predictions=summaries, references=dataset["review_title"])

In [29]:
score = evaluate_baseline(books_dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

# 直接使用分数，可能已经是 fmeasure 了
rouge_dict = dict((rn, round(score[rn] * 100, 2)) for rn in rouge_names)
rouge_dict

{'rouge1': np.float64(33.64),
 'rouge2': np.float64(29.32),
 'rougeL': np.float64(33.66),
 'rougeLsum': np.float64(33.66)}

In [30]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

In [None]:
from huggingface_hub import login

login()

In [36]:
from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 8
# Show the training loss with every epoch
logging_steps = len(tokenized_datasets["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-amazon-en-es",
    eval_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
)

In [37]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the median scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [38]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [39]:
tokenized_datasets = tokenized_datasets.remove_columns(
    books_dataset["train"].column_names
)

In [40]:
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)

{'input_ids': tensor([[ 46044,    870,    269,   7241,   3861,  45912,    268,  22947,    335,
            259,  59445,    435,    335,  16432,    259,    276,    319,  28373,
          40086,    537,    261,   3861,    259,  24043,  17560,    259,    276,
            362,  20307,    870,    426,   7241,    319,  28373,  20156,    268,
            260,   2862,  90902,   6323,   1349,  41364,   1523,      1],
        [  1494,    339,    259,   2364,   8778,   2225,    332,    416,   8739,
           6396,    305,    287,   5689,  19514,    527, 177334,    345,    260,
            259,  27531,   8778,    288,  16857,    305,    336,    259,   3659,
            287,  10988,    259,   2364,  14007,    484,    260,    564,  35514,
          22677,      1,      0,      0,      0,      0,      0,      0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1

In [41]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [42]:
trainer.train()

* Trackio project initialized: huggingface
* Trackio metrics logged to: /root/autodl-tmp/huggingface/trackio
* View dashboard by running in your terminal:
[1m[93mtrackio show --project "huggingface"[0m
* or by running in Python: trackio.show(project="huggingface")


Epoch,Training Loss,Validation Loss


  [2m2025-09-26T07:10:28.139302Z[0m [33m WARN[0m  [33mStatus Code: 502. Retrying..., [1;33mrequest_id[0m[33m: ""[0m
    [2;3mat[0m /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220

  [2m2025-09-26T07:12:37.400140Z[0m [33m WARN[0m  [33mStatus Code: 502. Retrying..., [1;33mrequest_id[0m[33m: ""[0m
    [2;3mat[0m /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220

  [2m2025-09-26T07:12:37.636074Z[0m [33m WARN[0m  [33mStatus Code: 500. Retrying..., [1;33mrequest_id[0m[33m: "01K62DTSAADRNNC4E55DMRQ4HV"[0m
    [2;3mat[0m /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220



OverflowError: out of range integral type conversion attempted