In [1]:
# Report the GPU, driver and CUDA versions visible to this runtime.
!nvidia-smi

Tue Aug 24 04:05:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
!pip install jieba transformers sentencepiece



In [8]:
import torch
import re
import jieba.posseg
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Part-of-speech flags, as produced by jieba.posseg, whose words are treated as
# keywords: BackTranslateManager.back_translate reports every such source word
# that a paraphrase no longer contains.
# NOTE(review): the per-tag glosses below follow jieba's ICTCLAS-style tag set
# as commonly documented — confirm against the jieba documentation.
valid_pos = {
    "n",   # common noun
    "vd",  # adverbial verb
    "vn",  # nominal verb
    "nr",  # person name
    "ns",  # place name
    "nt",  # organization name
    "nz",  # other proper noun
    "j",   # abbreviation
}

class BackTranslateManager:
    """Paraphrase Chinese sentences by round-tripping them through English.

    A Chinese->English and an English->Chinese seq2seq checkpoint are loaded
    once, moved to the GPU when CUDA is available, and put in eval mode.
    ``back_translate`` then samples several Chinese paraphrases of a sentence
    and tags each one with the source keywords it failed to keep.
    """

    # Runs of characters in the U+4E00..U+9FA5 range; used to measure how much
    # Chinese text a candidate paraphrase contains.
    _CHINESE_RUN = re.compile("[\u4e00-\u9fa5]+")

    def __init__(self, model_path_zh_en, model_path_en_zh):
        """Load tokenizer and model for both translation directions.

        Args:
            model_path_zh_en: Identifier passed to ``from_pretrained`` for the
                Chinese->English tokenizer and model.
            model_path_en_zh: Identifier passed to ``from_pretrained`` for the
                English->Chinese tokenizer and model.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer_zh_en, self.model_zh_en = self._load(model_path_zh_en)
        self.tokenizer_en_zh, self.model_en_zh = self._load(model_path_en_zh)

    def _load(self, model_path):
        """Return ``(tokenizer, model)`` for ``model_path``; the model is on ``self.device`` in eval mode."""
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        model.to(self.device)
        model.eval()
        return tokenizer, model

    def _translate(self, tokenizer, model, sentence, **generate_kwargs):
        """Translate one sentence and return every decoded output sequence as a list of strings."""
        # truncation=True keeps over-long inputs within the tokenizer's
        # configured maximum length instead of feeding the model more
        # positions than it may support.
        encoded = tokenizer([sentence], return_tensors='pt', truncation=True)
        encoded = {k: v.to(self.device) for k, v in encoded.items()}
        generated = model.generate(**encoded, **generate_kwargs)
        return tokenizer.batch_decode(generated, skip_special_tokens=True)

    @classmethod
    def _chinese_length(cls, text):
        """Count the characters of ``text`` that fall in the U+4E00..U+9FA5 range."""
        return sum(len(run) for run in cls._CHINESE_RUN.findall(text))

    def translate_zh_en(self, sentence_zh: str):
        """Translate a Chinese sentence to English.

        Args:
            sentence_zh: Source sentence.

        Returns:
            The first decoded sequence, as a string.
        """
        return self._translate(self.tokenizer_zh_en, self.model_zh_en, sentence_zh)[0]

    def translate_en_zh(self, sentence_en: str, num_return_sequences: int = 5, temperature: float = 0.9):
        """Sample several Chinese translations of an English sentence.

        Args:
            sentence_en: Source sentence.
            num_return_sequences: How many sequences to request from ``generate``.
            temperature: Sampling temperature passed to ``generate``.

        Returns:
            A list of decoded strings, one per returned sequence. Sampling is
            enabled (``do_sample=True``), so the list may contain duplicates
            and differs between runs unless the RNG is seeded by the caller.
        """
        return self._translate(
            self.tokenizer_en_zh,
            self.model_en_zh,
            sentence_en,
            num_return_sequences=num_return_sequences,
            temperature=temperature,
            do_sample=True,
        )

    def back_translate(self, sentence_zh: str, min_chinese_chars: int = 6):
        """Produce paraphrases of ``sentence_zh`` via zh -> en -> zh translation.

        Args:
            sentence_zh: Source sentence.
            min_chinese_chars: Minimum number of characters in the
                U+4E00..U+9FA5 range a candidate must contain to be kept.

        Returns:
            A list of distinct candidates in first-seen order. Candidates equal
            to the source or with too little Chinese text are dropped. If a
            candidate is missing source words whose jieba POS flag is in the
            module-level ``valid_pos``, those words are appended to it as
            ``"<candidate>:<word1>_<word2>..."``. Empty or whitespace-only
            input yields ``[]``.
        """
        # Nothing meaningful to paraphrase; skip both model calls.
        if not sentence_zh or not sentence_zh.strip():
            return []

        sentence_en = self.translate_zh_en(sentence_zh)
        candidates = self.translate_en_zh(sentence_en)

        # dict.fromkeys removes duplicates while keeping first-seen order; a
        # set would make the output order depend on string hashing.
        kept = [
            candidate
            for candidate in dict.fromkeys(candidates)
            if candidate != sentence_zh
            and self._chinese_length(candidate) >= min_chinese_chars
        ]

        # Source words whose part-of-speech flag marks them as keywords.
        keywords = [
            word for word, flag in jieba.posseg.cut(sentence_zh) if flag in valid_pos
        ]

        result = []
        for candidate in kept:
            missing = [word for word in keywords if word not in candidate]
            if missing:
                candidate += ":" + "_".join(missing)
            result.append(candidate)
        return result


if __name__ == '__main__':
    # Demo: paraphrase two sample headlines and print every surviving candidate.
    texts = [
        "韩华将向三星SDI西安工厂供应电池套件",
        "ICinsights：增长19%！三星再次成为全球最大半导体供应商"
    ]

    # The en->zh step samples (do_sample=True), so seed PyTorch's RNG to make
    # the printed paraphrases repeatable across "Restart & Run All" on the same
    # hardware/software stack.
    torch.manual_seed(42)

    manager = BackTranslateManager("Helsinki-NLP/opus-mt-zh-en", "Helsinki-NLP/opus-mt-en-zh")
    for sentence_zh in texts:
        back_translated_zh = manager.back_translate(sentence_zh)
        print("----------")
        print(sentence_zh)
        print("\n")
        for x in back_translated_zh:
            print(x)


----------
韩华将向三星SDI西安工厂供应电池套件


韩华会为三星SDIXian工厂提供电池包:西安_供应_套件
汉华会向三星SDIXian工厂提供电池包:韩华_西安_供应_套件
韩华会向三星SDIXian工厂提供电池包:西安_供应_套件
----------
ICinsights：增长19%！三星再次成为全球最大半导体供应商


三星再次成为世界最大的半导体供应商:全球
INISights: 19%的增长!三星再次成为世界上最大的半导体供应商。:全球
INISights: 增加19%!三星再次成为世界上最大的半导体供应商:全球
ININSights: 增加19%!三星再次成为世界上最大的半导体供应商:全球
