In [1]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
from datasets import DatasetDict, Dataset
import os


# Container that will hold one Dataset per language pair
dataset_dict = DatasetDict()
# Paths of the bilingual test files, keyed by language pair
data_files = {
    "en-zh": "../xfdata/多语言机器翻译挑战赛数据集更新（以此测试集提交得分为准）/test/中文/en-zh.txt",
    "de-zh": "../xfdata/多语言机器翻译挑战赛数据集更新（以此测试集提交得分为准）/test/中文/de-zh.txt",
    "ru-zh": "../xfdata/多语言机器翻译挑战赛数据集更新（以此测试集提交得分为准）/test/中文/ru-zh.txt",
    "es-zh": "../xfdata/多语言机器翻译挑战赛数据集更新（以此测试集提交得分为准）/test/中文/es-zh.txt",
    "ja-zh": "../xfdata/多语言机器翻译挑战赛数据集更新（以此测试集提交得分为准）/test/中文/ja-zh.txt",
    "kk-zh": "../xfdata/多语言机器翻译挑战赛数据集更新（以此测试集提交得分为准）/test/中文/kk-zh.txt",
}
# Load the tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("../user_data/bart_tokenizer")

# Build one Dataset per file. Every line (blank ones included, so the row
# count matches the file) is stripped and wrapped in the "<zh> ... </s>"
# special tokens that mark the source as Chinese.
for pair_name, path in data_files.items():
    with open(path, "r", encoding="utf-8") as handle:
        records = [{"source": f"<zh> {raw_line.strip()} </s>"} for raw_line in handle]
    dataset_dict[pair_name] = Dataset.from_list(records)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the "source" column into fixed-length model inputs.

    Args:
        examples: a mapping with a "source" entry. With ``batched=True``
            (as used below) this is a list of source strings.

    Returns:
        A plain dict of the tokenizer outputs, each sequence padded or
        truncated to 128 tokens.
    """
    # No return_tensors here: the result is stored as Python lists anyway, and
    # asking for "pt" tensors on a single example adds a leading batch
    # dimension, so every stored row would be a nested [[...]] list.
    model_inputs = tokenizer(examples["source"], max_length=128, truncation=True, padding="max_length")
    # Convert to a plain dict so the datasets library can consume it
    return dict(model_inputs)

# batched=True passes a list of source strings per call, giving one flat
# sequence per row.
tokenized_dataset_dict = dataset_dict.map(tokenize_function, batched=True)

  from .autonotebook import tqdm as notebook_tqdm
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
Map: 100%|██████████| 500/500 [00:00<00:00, 3153.98 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 3154.19 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 3213.74 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 3215.46 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 3240.13 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 3077.80 examples/s]


In [2]:
# Settings
# NOTE: batch_size is not read by translate_batch below.
batch_size = 32
beam_size = 8


# Translation function
def translate_batch(batch):
    """Translate one batch of source sentences with beam search.

    Reads the module-level ``tokenizer``, ``model``, ``device`` and
    ``beam_size``; they must be defined before this function is called.

    Args:
        batch: a mapping whose 'source' entry holds the source strings.

    Returns:
        The decoded translations, with special tokens removed.
    """
    encoded = tokenizer(
        batch['source'],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    ).to(device)
    # Forward only input_ids and attention_mask; anything else the tokenizer
    # produced (e.g. token_type_ids) is dropped.
    generated = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=128,
        num_beams=beam_size,
        early_stopping=True,
        decoder_start_token_id=model.config.bos_token_id,  # start decoding from the BOS token
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

In [3]:
import torch
from tqdm import tqdm 
from torch.utils.data import DataLoader


# Use the GPU when torch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("en-zh")
dataloader = DataLoader(dataset_dict["en-zh"], batch_size=100)
# Load the checkpoint, switch it to eval mode and move it to the chosen device
model = BartForConditionalGeneration.from_pretrained("../user_data/step1/en/results/checkpoint-154690").eval().to(device)
print("model is done!")

# Translate batch by batch, collecting outputs in the order the loader yields them
predictions = [
    text
    for batch in tqdm(dataloader)
    for text in translate_batch(batch)
]
# Write one translation per line
with open("../prediction_result/submit/en-zh.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in predictions)
# Drop the reference to the model now that this language pair is finished
del model

en-zh
model is done!


100%|██████████| 5/5 [01:18<00:00, 15.62s/it]


In [13]:
import torch
from tqdm import tqdm 
from torch.utils.data import DataLoader


# Use the GPU when torch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# Progress label for this cell: it processes the de-zh pair
print("de-zh")
dataloader = DataLoader(dataset_dict["de-zh"], batch_size=100)
# Load the checkpoint, switch it to eval mode and move it to the chosen device
model = BartForConditionalGeneration.from_pretrained("../user_data/step1/de/results/checkpoint-15470").eval().to(device)
print("model is done!")

# Translate batch by batch, collecting outputs in the order the loader yields them
predictions = []
for batch in tqdm(dataloader):
    translated_texts = translate_batch(batch)
    predictions.extend(translated_texts)
# Write one translation per line
with open("../prediction_result/submit/de-zh.txt", "w", encoding="utf-8") as f:
    for line in predictions:
        f.write(f"{line}\n")
# Drop the reference to the model now that this language pair is finished
del model

en-zh
model is done!


100%|██████████| 5/5 [00:42<00:00,  8.50s/it]


In [14]:
import torch
from tqdm import tqdm 
from torch.utils.data import DataLoader


# Use the GPU when torch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# Progress label for this cell: it processes the es-zh pair
print("es-zh")
dataloader = DataLoader(dataset_dict["es-zh"], batch_size=100)
# Load the checkpoint, switch it to eval mode and move it to the chosen device
model = BartForConditionalGeneration.from_pretrained("../user_data/step1/es/results/checkpoint-15470").eval().to(device)
print("model is done!")

# Translate batch by batch, collecting outputs in the order the loader yields them
predictions = []
for batch in tqdm(dataloader):
    translated_texts = translate_batch(batch)
    predictions.extend(translated_texts)
# Write one translation per line
with open("../prediction_result/submit/es-zh.txt", "w", encoding="utf-8") as f:
    for line in predictions:
        f.write(f"{line}\n")
# Drop the reference to the model now that this language pair is finished
del model

en-zh
model is done!


100%|██████████| 5/5 [00:40<00:00,  8.18s/it]


In [15]:
import torch
from tqdm import tqdm 
from torch.utils.data import DataLoader


# Run on CUDA if it is available, else on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("ja-zh")
dataloader = DataLoader(dataset_dict["ja-zh"], batch_size=100)
# Checkpoint is loaded, put in eval mode, then moved to the selected device
model = BartForConditionalGeneration.from_pretrained("../user_data/step1/ja/results/checkpoint-120").eval().to(device)
print("model is done!")

# Accumulate the translations of every batch in loader order
predictions = []
for batch in tqdm(dataloader):
    predictions += translate_batch(batch)
# Emit the whole submission in one write, one translation per line
with open("../prediction_result/submit/ja-zh.txt", "w", encoding="utf-8") as f:
    f.write("".join(f"{line}\n" for line in predictions))
# Release the model reference before the next language pair is loaded
del model

ja-zh
model is done!


100%|██████████| 5/5 [00:34<00:00,  6.81s/it]


In [16]:
import torch
from tqdm import tqdm 
from torch.utils.data import DataLoader


# Use the GPU when torch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("kk-zh")
dataloader = DataLoader(dataset_dict["kk-zh"], batch_size=100)
# Load the checkpoint, switch it to eval mode and move it to the chosen device
model = BartForConditionalGeneration.from_pretrained("../user_data/step1/kk/results/checkpoint-120").eval().to(device)
print("model is done!")

# Translate batch by batch, collecting outputs in the order the loader yields them
predictions = [
    text
    for batch in tqdm(dataloader)
    for text in translate_batch(batch)
]
# Write one translation per line
with open("../prediction_result/submit/kk-zh.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in predictions)
# Drop the reference to the model now that this language pair is finished
del model

kk-zh
model is done!


100%|██████████| 5/5 [00:44<00:00,  8.82s/it]


In [17]:
import torch
from tqdm import tqdm 
from torch.utils.data import DataLoader


# Run on CUDA if it is available, else on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("ru-zh")
dataloader = DataLoader(dataset_dict["ru-zh"], batch_size=100)
# Checkpoint is loaded, put in eval mode, then moved to the selected device
model = BartForConditionalGeneration.from_pretrained("../user_data/step1/ru/results/checkpoint-30940").eval().to(device)
print("model is done!")

# Accumulate the translations of every batch in loader order
predictions = []
for batch in tqdm(dataloader):
    predictions += translate_batch(batch)
# Emit the whole submission in one write, one translation per line
with open("../prediction_result/submit/ru-zh.txt", "w", encoding="utf-8") as f:
    f.write("".join(f"{line}\n" for line in predictions))
# Release the model reference now that this language pair is finished
del model

ru-zh
model is done!


100%|██████████| 5/5 [00:40<00:00,  8.10s/it]


In [18]:
# Normalize every submission file: strip leading/trailing whitespace from each
# line and rewrite the file in place.
file_paths = [
    "../prediction_result/submit/en-zh.txt",
    "../prediction_result/submit/ru-zh.txt",
    "../prediction_result/submit/ja-zh.txt",
    "../prediction_result/submit/kk-zh.txt",
    "../prediction_result/submit/de-zh.txt",
    "../prediction_result/submit/es-zh.txt"
]

for submit_path in file_paths:
    # Load the whole file first, since the same path is reopened for writing
    with open(submit_path, "r", encoding="utf-8") as src:
        stripped_lines = [raw_line.strip() for raw_line in src]

    # Overwrite the original; joining with "\n" leaves no newline after the
    # last line
    with open(submit_path, "w", encoding="utf-8") as dst:
        dst.write("\n".join(stripped_lines))