# 微调中文和日文的双语摘要任务模型

确保我们的模型不会过度拟合单一语言

### 预处理

In [1]:
from datasets import load_dataset

raw_dataset = load_dataset('csv',data_files='./data/amazon_reviews_multi/train.csv')
raw_dataset = raw_dataset.rename_column('Unnamed: 0','id')
raw_chinese_dataset = raw_dataset.filter(lambda x:x['language']=='zh')
raw_japanese_dataset = raw_dataset.filter(lambda x:x['language']=='ja')

split_chinese_dataset = raw_chinese_dataset['train'].train_test_split(train_size=0.9)
test_and_valid = split_chinese_dataset['test'].train_test_split(train_size=0.5)
chinese_dataset = split_chinese_dataset
chinese_dataset['validation'] = test_and_valid.pop('train')
chinese_dataset['test'] = test_and_valid.pop('test')

split_japanese_dataset = raw_japanese_dataset['train'].train_test_split(train_size=0.9)
test_and_valid = split_japanese_dataset['test'].train_test_split(train_size=0.5)
japanese_dataset = split_japanese_dataset
japanese_dataset['validation'] = test_and_valid.pop('train')
japanese_dataset['test'] = test_and_valid.pop('test')
print(chinese_dataset)
print(japanese_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 180000
    })
    test: Dataset({
        features: ['id', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['id', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 10000
    })
})
DatasetDict({
    train: Dataset({
        features: ['id', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 180000
    })
    test: Dataset({
        features: ['id', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 10000
    })
    

In [2]:
def show_samples(dataset, num_samples=3, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n'>> Title: {example['review_title']}'")
        print(f"'>> Review: {example['review_body']}'")


show_samples(chinese_dataset)
show_samples(japanese_dataset)

chinese_dataset.set_format("pandas")
ch_df = chinese_dataset["train"][:]
# 显示前 20 个产品的数量
ch_df["product_category"].value_counts()[:20]


japanese_dataset.set_format("pandas")
ja_df = japanese_dataset["train"][:]
# 显示前 20 个产品的数量
ja_df["product_category"].value_counts()[:20]


'>> Title: 一两的时间就坏了一个发热管'
'>> Review: 去年买的，现在已经坏了一个，只剩下二个发热管了。。。。本来一直很信任亚马逊的东西'

'>> Title: 书有损'
'>> Review: 新书的价格，然而书的封皮及部分页面弄湿过，有明显褶皱。尽管不影响阅读，但是让人很不满。'

'>> Title: 一般般'
'>> Review: 儒勒·凡尔纳原版小说《海底两万里》，其实可以当做一部海洋百科全书来看的，但不清楚是不是作者或者编辑的原因，相关的海洋生物介绍删减了不少，有些失望，继续在网上逛逛买书吧。。。'

'>> Title: 違う商品でした'
'>> Review: ＋ラブレでない通常のラクトフェリンが送付されてきました。'

'>> Title: がっくし'
'>> Review: 説明書が中国語でした。中国製と表記してほしかったです。'

'>> Title: 別々でもいいかな'
'>> Review: 毎回箸とスプーンを使わないので、洗う手間が増えた気がします。耐熱がスプーンと箸が140度、ケースが100度で食洗機を使う時にケースを入れられないから不便。箸、スプーンとも持ちやすく、持ち心地は問題ないと思います。'


product_category
home                16830
wireless            14697
sports              14235
pc                  13408
kitchen             12003
automotive          11744
drugstore            9220
electronics          8918
shoes                8073
toy                  7691
beauty               7631
apparel              7370
home_improvement     6050
grocery              5779
other                5212
book                 5044
pet_products         4369
camera               3942
office_product       3811
video_games          2314
Name: count, dtype: int64

In [3]:
def filter_books(example):
    return (
        example["product_category"] == "book"
        or example["product_category"] == "digital_ebook_purchase"
    )
chinese_dataset.reset_format()
chinese_dataset.reset_format()
japanese_dataset.reset_format()

ch_books = chinese_dataset.filter(filter_books)
ja_books = japanese_dataset.filter(filter_books)
show_samples(ch_books)

Filter:   0%|          | 0/180000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/180000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]


'>> Title: 比较一般'
'>> Review: 看到一半就猜到结局了，比起心理罪系列逊色不少啊'

'>> Title: 简评'
'>> Review: 这么好的书，竟然给出版社给糟蹋了，一页没有几行，看起来实在不便。出版社考虑经济利益的情况下，能不能为读者考虑考虑?'

'>> Title: 伪作'
'>> Review: 根本不是严歌苓的作品，文字和内容极其粗劣，不忍卒读。强烈抗议无良出版社的无耻行径！'


In [4]:
# 将中文和日文的评论作为单个DatasetDict对象组合起来
#  Datasets 提供了一个方便的 concatenate_datasets() 函数
from datasets import concatenate_datasets,DatasetDict

books_dataset = DatasetDict()

for split in ch_books.keys():
    books_dataset[split] = concatenate_datasets(
        [ch_books[split],ja_books[split]]
    )
    books_dataset[split] = books_dataset[split].shuffle(seed=100)

show_samples(books_dataset)


'>> Title: 素材がよかっただけに残念'
'>> Review: 素材は素晴らしいと思いますが写真がエフェクトがかかっているみたいで何これ？という感じです。なんで初の写真集のようにしなかったのが悔やまれて仕方がありません。'

'>> Title: 书内容破损！'
'>> Review: 第一次在亚马逊买东西，书里面有破损，很失望！'

'>> Title: 小建议'
'>> Review: １、建议发货时最好发带塑料膜包装的新书，因为有很多书是送人或者需要收藏的； ２：包装一定要靠，杜绝发一本就简陋包装的情况，很多单本发的书出现质量问题都是因为包装简陋的原因，谁也不愿意二次退货，麻烦。'


In [5]:
books_dataset = books_dataset.filter(lambda x:x['review_title'] is not None)
books_dataset = books_dataset.map(lambda x:{"title_length":len(x['review_title'])})
books_dataset = books_dataset.filter(lambda x:x['title_length']>2)

Filter:   0%|          | 0/80898 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4477 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4498 [00:00<?, ? examples/s]

Map:   0%|          | 0/80897 [00:00<?, ? examples/s]

Map:   0%|          | 0/4477 [00:00<?, ? examples/s]

Map:   0%|          | 0/4498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/80897 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4477 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4498 [00:00<?, ? examples/s]

In [8]:
books_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category', 'title_length'],
        num_rows: 69862
    })
    test: Dataset({
        features: ['id', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category', 'title_length'],
        num_rows: 3865
    })
    validation: Dataset({
        features: ['id', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category', 'title_length'],
        num_rows: 3903
    })
})

### 模型

这是一种基于 T5 的有趣架构，在文本到文本任务中进行了预训练。在 T5 中，每个 NLP 任务都是以任务前缀（如 summarize: ）的形式定义的，模型根据不同的任务生成不同的文本

In [9]:
from transformers import AutoTokenizer

model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
inputs = tokenizer("我喜欢读饥饿游戏")
print(tokenizer.convert_ids_to_tokens(inputs['input_ids']))

['▁', '我', '喜欢', '读', '饥', '饿', '游戏', '</s>']


In [11]:
# 评论和标题的最大长度 因为使用T5模型，评论和标题要一起给，可能会超长
max_input_length = 512
max_target_length = 30


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["review_body"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["review_title"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = books_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/69862 [00:00<?, ? examples/s]

Map:   0%|          | 0/3865 [00:00<?, ? examples/s]

Map:   0%|          | 0/3903 [00:00<?, ? examples/s]

### 评估

一种方法是计算两段摘要的重叠单词的数量

ROUGE算法

召回率= 重叠词数量/参考摘要的总词数  召回率越高越好
 
精确度= 重叠词数量/生成摘要总次数

In [12]:
# !pip install rouge_score
# !pip install rouge_chinese
# !pip install jieba

from rouge_chinese import Rouge
import jieba # you can use any other word cutting library

generated_summary = ' '.join(jieba.cut("我特别特别喜欢读饥饿游戏这本书"))
reference_summary = ' '.join(jieba.cut("我喜欢饥饿游戏这本书"))

rouge = Rouge()
scores = rouge.get_scores(generated_summary, reference_summary)
scores

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\tassa\AppData\Local\Temp\jieba.cache
Loading model cost 0.461 seconds.
Prefix dict has been built successfully.


[{'rouge-1': {'r': 1.0, 'p': 0.75, 'f': 0.8571428522448981},
  'rouge-2': {'r': 0.6, 'p': 0.375, 'f': 0.4615384568047337},
  'rouge-l': {'r': 1.0, 'p': 0.6666666666666666, 'f': 0.7999999952000001}}]

In [13]:
from nltk.tokenize import sent_tokenize
import nltk
nltk.download("punkt")

def three_sentence_summary(text):
    print(sent_tokenize(text))
    return "\n".join(sent_tokenize(text)[:3])

print(three_sentence_summary("I grew up reading Koontz, and years ago, I stopped,convinced i had outgrown him. Still,when a friend was looking for something suspenseful too read, I suggested Koontz. She found Strangers."))

['I grew up reading Koontz, and years ago, I stopped,convinced i had outgrown him.', 'Still,when a friend was looking for something suspenseful too read, I suggested Koontz.', 'She found Strangers.']
I grew up reading Koontz, and years ago, I stopped,convinced i had outgrown him.
Still,when a friend was looking for something suspenseful too read, I suggested Koontz.
She found Strangers.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tassa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# !pip install nltk
import nltk
# 使用NLTK的英文句子分割
from nltk.tokenize import sent_tokenize
import re
from typing import List
nltk.download("punkt_tab")  # 下载英文句子分割模型
nltk.download("punkt")      # 确保基础分词模型已安装

def chinese_sent_tokenize(text: str) -> List[str]:
    """自定义中文句子分割函数"""
    # 识别中文句子边界的正则表达式
    sentence_endings = r'[。。，，！？…\.\?!?](?:\s|$)'
    sentences = re.split(sentence_endings, text)
    # 过滤空句子并添加句号
    sentences = [s.strip() + '\n' for s in sentences if s.strip()]
    return sentences

def three_sentence_summary(text: str, language: str = "chinese") -> str:
    """提取文本的前三句作为摘要，支持中英文"""
    if language == "chinese":
        sentences = chinese_sent_tokenize(text)
    else:
        sentences = sent_tokenize(text)
    
    # 提取前三句并拼接
    return "\n".join(sentences[:3])

chinese_text = "实在是搞不清楚这本书是太好还是太糟了，也搞不清楚是原作者的文笔就是这么跳跃，还是翻译的过程中丢失了什么隐含含义。 每个字都认识，每个逗号之前还都懂，到了句号经常就无法理解了，更不要提段落和章节了。 整本书读完的感受是，我得等什么时间这本书的内容再从我脑海里自己蹦出来的时候再去读一遍试试。"
print(three_sentence_summary(chinese_text, language="chinese"))

# # 如果需要处理英文文本
# english_text = "This book is amazing. I couldn't put it down. The author's style is brilliant."
# print(three_sentence_summary(english_text, language="english"))    

实在是搞不清楚这本书是太好还是太糟了，也搞不清楚是原作者的文笔就是这么跳跃，还是翻译的过程中丢失了什么隐含含义

每个字都认识，每个逗号之前还都懂，到了句号经常就无法理解了，更不要提段落和章节了

整本书读完的感受是，我得等什么时间这本书的内容再从我脑海里自己蹦出来的时候再去读一遍试试



[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tassa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tassa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [52]:
def evaluate_baseline(dataset, rouge):
    summaries = [three_sentence_summary(text, language="chinese") for text in dataset["review_body"]]
    references = [' '.join(jieba.cut(title)) for title in dataset["review_title"]]
    summaries = [' '.join(jieba.cut(summary)) for summary in summaries]
    return rouge.get_scores(summaries, references)

# 来计算验证集上的 ROUGE 分数

rouge_scores = evaluate_baseline(books_dataset['validation'],rouge)


In [53]:
rouge_scores

[{'rouge-1': {'r': 0.75, 'p': 0.15789473684210525, 'f': 0.26086956234404535},
  'rouge-2': {'r': 0.6666666666666666, 'p': 0.1, 'f': 0.1739130412098299},
  'rouge-l': {'r': 0.75, 'p': 0.14285714285714285, 'f': 0.23999999731200003}},
 {'rouge-1': {'r': 1.0, 'p': 0.3076923076923077, 'f': 0.4705882316955018},
  'rouge-2': {'r': 1.0, 'p': 0.21428571428571427, 'f': 0.3529411735640139},
  'rouge-l': {'r': 1.0, 'p': 0.26666666666666666, 'f': 0.4210526282548477}},
 {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}},
 {'rouge-1': {'r': 1.0, 'p': 0.4375, 'f': 0.6086956479395086},
  'rouge-2': {'r': 1.0, 'p': 0.4, 'f': 0.5714285673469389},
  'rouge-l': {'r': 1.0, 'p': 0.4375, 'f': 0.6086956479395086}},
 {'rouge-1': {'r': 0.36363636363636365,
   'p': 0.3076923076923077,
   'f': 0.3333333283680556},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.36363636363636365,
   'p': 0.26666666666666666,
   'f'

In [51]:
rouge.get_scores(['写的内容很好，但印制质量较差，后面有几页印花了，只是勉强可以看清\n', '还好的一本书，以一个纪录片的形式进行叙述，还可以\n']
                 ,['内容 很 好 ！', '还好 的 一 本书'])

[{'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}},
 {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}}]