In [1]:
# 基础绘图库
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
# Miscellaneous plot styling: font sizes for titles, labels, ticks and legends.
large = 22; med = 16; small = 12
params = {
    'legend.fontsize': med,
    'figure.figsize': (16, 10),
    'axes.labelsize': med,
    # A dict keeps only the last value for a repeated key, so the axes title size is
    # stated exactly once, with the value that is actually applied (`med`).
    'axes.titlesize': med,
    'xtick.labelsize': med,
    'ytick.labelsize': med,
    'figure.titlesize': large,
}
plt.rcParams.update(params)
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names; try the new name first and fall back for older installs.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')
sns.set_style("white")
plt.rc('font', **{
    'family': 'Microsoft YaHei, SimHei'})  # enable fonts that can render Chinese text
# sns.set(font='SimHei')  # fixes Chinese text in Seaborn, but also adds a grey grid background
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign '-' from rendering as a box in saved figures

# ===================== 热力图 ====================
def heatmap(data, method='pearson', camp='RdYlGn', figsize=(10 ,8), ax=None):
    """
    Draw an annotated heatmap of ``data`` with the colour scale centred on 0.

    data: 2-D values handed to ``sns.heatmap`` as-is (no correlation is computed here)
    method: defaults to 'pearson'; accepted for backward compatibility but currently unused
    camp: colormap name, default 'RdYlGn' (red-yellow-green); 'YlGnBu' (yellow-green-blue)
          and 'Blues'/'Greens' are also reasonable choices
    figsize: size of the figure that is created when ``ax`` is None; default (10, 8)
    ax: optional Axes to draw into; when given, no new figure is created
    """
    ## To hide the duplicated triangle of a symmetric matrix, build a mask:
    #     mask = np.zeros_like(df2.corr())
    #     mask[np.tril_indices_from(mask)] = True
    # Only open a new figure when the caller did not supply an Axes; otherwise a
    # stray empty figure would be created next to the caller's own figure.
    if ax is None:
        plt.figure(figsize=figsize, dpi=80)
    sns.heatmap(data, cmap=camp,
                center=0, annot=True, ax=ax)
    # To keep only one triangle, pass mask=mask to sns.heatmap above.

In [2]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import shap
import torch
import matplotlib.pyplot as plt

In [3]:
# Load the tokenizer and seq2seq model from the local "models/opus-mt-en-zh" directory.
tokenizer = AutoTokenizer.from_pretrained("models/opus-mt-en-zh")
# .cpu() places the model on the CPU.
model = AutoModelForSeq2SeqLM.from_pretrained("models/opus-mt-en-zh").cpu()


In [4]:
# English source sentence (note the trailing space inside the string literal).
en = ['The invaders spread their language all over the country. ']
# Encode the sentence as PyTorch tensors, truncating at 512 tokens.
tokenized = tokenizer(en[0],padding=False,truncation=True,max_length=512,return_tensors="pt")
# Generate the translation without tracking gradients.
with torch.no_grad():
    outputs = model.generate(**tokenized)  
# Decode the generated ids to text; unlike later cells, skip_special_tokens is not set here.
cn_sam = tokenizer.batch_decode(outputs)

In [5]:
def tokenize_by_index(tokenizer, seq, index=None, no_flat=False):
    """Tokenize ``seq`` word by word and optionally locate one word's token span.

    ``seq`` is split on single spaces. Every word is tokenized on its own (all
    words except the first are tokenized with a leading space), and the pieces
    are framed by ``[tokenizer.bos_token_id]`` and ``[tokenizer.eos_token_id]``.

    Args:
        tokenizer: object exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``bos_token_id`` and ``eos_token_id``.
        seq: sentence whose words are separated by single spaces.
        index: 0-based position of the target word in ``seq.split(' ')``, or
            None when no span is wanted.
        no_flat: when True the ids stay grouped per word (list of lists);
            otherwise they are flattened into one list.

    Returns:
        ``(ids, [start, end])`` when ``index`` is given and ``index + 1`` is a
        valid position in the framed list; ``start``/``end`` are offsets into
        the flattened ids (the leading bos entry counts as position 0).
        In every other case just ``ids``.
    """
    words = seq.split(' ')
    pieces = [[tokenizer.bos_token_id]]
    for pos, word in enumerate(words):
        # Non-initial words get a leading space so they are tokenized as mid-sentence words.
        text = ' ' + word if pos else word
        pieces.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)))
    pieces.append([tokenizer.eos_token_id])

    # Flatten in linear time; sum(list_of_lists, []) copies the accumulator on every step.
    ids = pieces if no_flat else [tok_id for piece in pieces for tok_id in piece]

    if index is not None:
        start = 0  # flat offset where the current piece begins
        for pos, piece in enumerate(pieces):
            # +1 skips the bos entry that sits in front of the first word.
            if pos == index + 1:
                return ids, [start, start + len(piece)]
            start += len(piece)

    return ids

In [6]:
# Alternative sample sentences (swap one in to inspect a different example):
# en = ['The invaders spread their language all over the country.']
#en = ['The cancer cells are attacking his liver.	']
en = ['The drug smugglers landed the heroin on the beach of the island.']
index = 3  # 0-based position of the target word in the space-split sentence
# A real name is used for the ids: binding `_` would shadow IPython's last-output variable.
token_ids, idx = tokenize_by_index(tokenizer, en[0], index=index)

# Show the first token of the target word.
word = tokenizer.decode(token_ids[idx[0]])
display(word)
explainer = shap.Explainer(model, tokenizer)
shap_values = explainer(en, fixed_context=1)

# tokenize_by_index counts a leading bos placeholder; drop it to index the SHAP rows.
idx = idx[0]-1
shap_atten = shap_values.values[0]
# Column with the largest SHAP value in the target token's row.
cor_idx = shap_atten[idx].argmax()

# NOTE(review): tokenizer and model are reloaded after the explainer ran — presumably
# because the explainer may alter or move the model; confirm before removing.
tokenizer = AutoTokenizer.from_pretrained("models/opus-mt-en-zh")
model = AutoModelForSeq2SeqLM.from_pretrained("models/opus-mt-en-zh")
tokenized = tokenizer(en[0],padding=False,truncation=True,max_length=512,return_tensors="pt")
outputs = model.generate(**tokenized)
cn_sam = tokenizer.batch_decode(outputs, skip_special_tokens=True)
display(cn_sam)
display(outputs)
# +1 because the generated ids carry one extra leading token (65000 in the printed tensor).
cor_cn_sam = tokenizer.decode(outputs[0][cor_idx+1], skip_special_tokens=True)
display(cor_cn_sam)
#display(shap_values)
shap.plots.text(shap_values)
#heatmap(shap_values.values[0])


'landed'

['毒品走私者把海洛因落在岛上的海滩上。']

tensor([[65000,     8,  4189, 14403,   369,   514, 24778, 26623, 21880,    12,
         35071,   158,    10,     0]])

'把'

Unnamed: 0_level_0,The,drug,smugglers,landed,the,heroin,on,the,beach,of,the,island,.,Unnamed: 14_level_0
Unnamed: 0_level_1,0.728,0.116,0.731,-0.158,-0.214,-0.086,-0.193,-0.131,-0.147,-0.119,-0.098,-0.042,-0.047,0.0
毒品,1.032,7.603,0.146,1.238,0.973,1.359,1.09,1.064,0.151,0.052,-0.004,0.173,0.321,-0.0
走私,-1.081,-0.471,10.456,-0.348,-0.483,-0.073,-0.202,-0.354,-0.03,0.071,0.007,0.126,0.202,-0.0
者,-0.196,-0.37,4.363,-0.231,-0.379,0.105,-0.407,-0.308,-0.057,0.026,-0.074,0.128,0.066,-0.0
把,-0.58,-0.598,-0.378,5.636,1.385,1.907,0.69,-0.255,0.124,0.181,0.282,0.435,0.438,-0.0
海洛因,-0.463,-0.275,-0.622,2.827,-1.236,11.595,0.351,-0.319,-0.175,-0.165,-0.268,-0.21,0.022,-0.0
落在,0.237,0.074,0.802,3.872,0.207,0.354,1.305,0.382,0.092,-0.089,0.017,-0.007,0.158,0.0
岛上,-0.114,-0.159,0.06,-0.619,-0.129,0.999,-1.109,0.513,-2.66,2.265,1.378,8.246,-0.049,0.0
的,0.074,-0.092,0.11,0.022,-0.224,-0.093,-0.045,-0.035,2.465,0.482,-0.317,0.321,-0.238,-0.0
海滩,-0.288,-0.318,-0.064,0.17,-0.224,-0.151,-0.616,0.092,8.536,0.468,0.643,1.396,0.052,0.0
上,0.029,-0.017,0.131,-0.232,-0.106,-0.104,0.212,0.049,-0.165,-0.218,-0.016,0.575,0.544,0.0
。,0.458,-0.142,0.465,0.047,-0.091,-0.94,0.153,-0.522,-0.039,-0.394,-0.385,0.407,2.053,-0.0


In [7]:
# Load the tokenizer and seq2seq model from the local "models/opus-mt-zh-en" directory
# (used below to translate the Chinese sentence back into English).
zh_tokenizer = AutoTokenizer.from_pretrained("models/opus-mt-zh-en")
zh_model = AutoModelForSeq2SeqLM.from_pretrained("models/opus-mt-zh-en")

In [8]:
# Chinese sentence to translate back into English.
zh = ['入侵者在全国各地散布他们的语言。']
# NOTE(review): zh_idx is taken from `cor_idx`, which an earlier cell computed for
# `en[0]` — a different sentence from `zh[0]`. Confirm this index is the intended one.
zh_idx = cor_idx
tokenized = zh_tokenizer(zh[0], padding=True,truncation=True,max_length=512,return_tensors="pt")

display(tokenized)
display(zh_idx)
# Show the input token sitting at position zh_idx.
word = zh_tokenizer.decode(tokenized['input_ids'][0][zh_idx])
display(word)
zh_explainer = shap.Explainer(zh_model, zh_tokenizer)
shap_values = zh_explainer(zh, fixed_context=1)

idx = zh_idx
shap_atten = shap_values.values[0]
# Column with the largest SHAP value in row `idx`.
cor_zh_idx = shap_atten[idx].argmax()

# NOTE(review): tokenizer and model are reloaded after the explainer ran — presumably
# because the explainer may alter or move the model; confirm before removing.
zh_tokenizer = AutoTokenizer.from_pretrained("models/opus-mt-zh-en")
zh_model = AutoModelForSeq2SeqLM.from_pretrained("models/opus-mt-zh-en")
tokenized = zh_tokenizer(zh[0],padding=False,truncation=True,max_length=512,return_tensors="pt")
outputs = zh_model.generate(**tokenized)  
en_sam = zh_tokenizer.batch_decode(outputs, skip_special_tokens=True)
display(en_sam)
display(outputs)
# +1 because the generated ids carry one extra leading token (65000 in the printed tensor).
cor_en_sam = zh_tokenizer.decode(outputs[0][cor_zh_idx+1], skip_special_tokens=True)
display(cor_en_sam)

shap.plots.text(shap_values)

{'input_ids': tensor([[    7,  9996,   369, 30055, 32931,  2580,  3415,     9,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

4

'散布'

['The invaders spread their language throughout the country.']

tensor([[65000,    24, 54233,  2133,  6401,    61,  2650,  2146,     3,   210,
             5,     0]])

'spread'

Unnamed: 0_level_0,Unnamed: 1_level_0,入侵,者,在全国各地,散布,他们的,语言,。,Unnamed: 9_level_0
The,1.434,1.49,0.778,0.838,1.012,-0.185,-0.018,0.162,-0.27
invad,-0.575,9.866,2.344,-0.511,0.3,0.076,0.254,0.032,-0.135
ers,-0.589,-0.681,3.59,-0.015,0.252,0.403,0.035,0.078,-0.027
spread,0.608,1.518,0.415,0.455,6.148,0.11,0.646,-0.218,-0.033
their,-0.064,0.001,0.08,0.203,0.998,2.372,1.083,-0.172,-0.22
language,-0.1,0.35,-0.026,-0.386,-1.21,-0.418,9.884,-1.261,-0.978
throughout,-0.809,-0.683,-0.594,8.281,-0.028,-0.208,0.391,0.197,-0.708
the,-0.188,-0.176,-0.184,1.989,0.043,-0.02,0.089,0.17,-0.084
country,-0.003,-0.076,-0.214,3.726,0.087,0.061,0.241,0.301,-0.213
.,-0.073,-0.018,-0.121,0.219,-0.069,-0.133,-0.126,1.999,-0.173


In [9]:
import pandas as pd
# Read the tab-separated back-translation table for the metaphor split and show it.
meta = pd.read_csv('data/MOH-X/opus/bt_meta.tsv', delimiter='\t' )
meta

Unnamed: 0,target,idx,en,cn_target,cn_idx,cn,bt_target,bt_idx,bt,pass_bt
0,spread,2,The invaders spread their language all over th...,散布,4,入侵者在全国各地散布他们的语言。,spread,3,The invaders spread their language throughout ...,1
1,climbing,2,Sales were climbing after prices were lowered.,上升,6,价格下跌后销售额上升。,rose,1,Sales rose after prices fell.,0
2,glared,2,The moon glared back at itself from the lake's...,闪,8,月亮从湖面的表面闪闪发光,shine,2,The moon shines from the surface of the lake.,0
3,attacking,4,The cancer cells are attacking his liver.,攻击,4,癌细胞正在攻击他的肝脏,attacking,3,Cancer cells are attacking his liver.,1
4,attacked,7,The editors of the left- leaning paper attacke...,攻击,6,左倾报纸的编辑攻击了新上议院议长。,attacked,8,The editor of the left-hand newspaper attacked...,1
...,...,...,...,...,...,...,...,...,...,...
310,boost,4,The tax cut will boost the economy.,刺激,4,减税将刺激经济。,stimulate,3,Tax cuts will stimulate the economy.,0
311,melting,4,Hundreds of actors were melting into the scene.,融,8,成百上千的演员正在融化 进入现场。,melting,4,Hundreds of actors are melting into the scene.,1
312,wrestled,1,He wrestled all his life with his feeling of i...,摔,7,他一生都带着自卑的感觉摔跤,wrestling,4,He's been wrestling his whole life with a low ...,0
313,showered,2,He was showered with praise.,洗,5,他被赞美地洗了澡。,and,5,He was glorified and bathed.,0


In [10]:
# Read the tab-separated back-translation table for the literal split and show it.
liter = pd.read_csv('data/MOH-X/opus/bt_liter.tsv', delimiter='\t' )
liter

Unnamed: 0,target,idx,en,cn_target,cn_idx,cn,bt_target,bt_idx,bt,pass_bt
0,ruined,2,You have ruined my car by pouring sugar in the...,毁了,8,你把糖倒进罐子里 毁了我的车!,my,9,You poured sugar in a can and ruined my car!,0
1,tack,0,tack the notice on the board.,把,0,把通知写在黑板上,Write,0,Write the notice on the blackboard.,0
2,crawl.,12,European children learn the breast stroke; the...,爬,12,欧洲儿童学会了乳房中风;他们往往不会爬行。,climb,11,European children learn breast strokes; they o...,0
3,guard,1,Please guard my possessions while I'm away.,保护,4,请在我离开的时候保护我的财产,protect,1,Please protect my property while I'm gone.,0
4,spread.,2,the infection spread.,传播,2,感染传播。,spread,3,Incidence spreads.,1
...,...,...,...,...,...,...,...,...,...,...
327,grabbed,1,She grabbed the child's hand and ran out of th...,抓,1,她抓着孩子的手跑出房间,holding,6,She ran out of the room holding the child's hand.,0
328,spread,0,spread cheese on a piece of bread.,撒,2,把奶酪撒在面包上,cheese,2,Throw the cheese on the bread.,0
329,landed,3,The drug smugglers landed the heroin on the be...,把,4,毒品走私者把海洛因落在岛上的海滩上。,left,2,Drug smugglers left heroin on the island's bea...,0
330,ruffled,2,The wind ruffled the surface of the water.,动,3,风吹动了水的表面,touched,2,The wind touched the surface of the water.,0


In [11]:
# Show row 329 of the literal split, then its full English sentence.
display(liter.loc[329])
# Select the 'en' column by label: integer keys on a label-indexed Series
# (the former `liter.loc[329][2]`) are deprecated positional lookups in recent pandas.
liter.loc[329, 'en']

target                                                  landed
idx                                                          3
en           The drug smugglers landed the heroin on the be...
cn_target                                                    把
cn_idx                                                       4
cn                                          毒品走私者把海洛因落在岛上的海滩上。
bt_target                                                 left
bt_idx                                                       2
bt           Drug smugglers left heroin on the island's bea...
pass_bt                                                      0
Name: 329, dtype: object

'The drug smugglers landed the heroin on the beach of the island.'

In [12]:
# Reload the en->zh tokenizer and model (model placed on the CPU).
tokenizer = AutoTokenizer.from_pretrained("models/opus-mt-en-zh")
model = AutoModelForSeq2SeqLM.from_pretrained("models/opus-mt-en-zh").cpu()

# Translate en[0]; `en` is whatever an earlier cell last assigned.
tokenized = tokenizer(en[0],padding=True,truncation=True,max_length=512,return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(**tokenized)
display(outputs)
# Decode the generated ids, dropping special tokens.
cn_sam = tokenizer.batch_decode(outputs, skip_special_tokens=True)
cn_sam

tensor([[65000,     8,  4189, 14403,   369,   514, 24778, 26623, 21880,    12,
         35071,   158,    10,     0]])

['毒品走私者把海洛因落在岛上的海滩上。']

In [13]:
import os
import csv
class InputBTExample(object):
    """Plain record holding one back-translation example.

    The constructor stores every argument unchanged under these attributes:
    ``guid``; the English side (``en``, ``idx``, ``t``); the Chinese side
    (``zh``, ``zh_idx``, ``zh_t``); the back-translated side (``bt``,
    ``bt_idx``, ``bt_t``); and ``pass_bt``.
    """

    def __init__(
        self, guid, t, idx, en, zh_t, zh_idx, zh, bt_t, bt_idx, bt, pass_bt
    ):
        # Identifier of the example.
        self.guid = guid
        # Source sentence, target index and target word.
        self.en, self.idx, self.t = en, idx, t
        # Translation, with its own target index and target word.
        self.zh, self.zh_idx, self.zh_t = zh, zh_idx, zh_t
        # Back-translation, with its own target index and target word.
        self.bt, self.bt_idx, self.bt_t = bt, bt_idx, bt_t
        # Pass flag, stored exactly as given.
        self.pass_bt = pass_bt
        
class RawData(object):
    """Named container that exposes its ``data`` dict through ``obj[key]`` access."""

    def __init__(self, name):
        # Starts with a name and an empty mapping.
        self.name, self.data = name, {}

    def __getitem__(self, item):
        """Return the value stored under ``item`` (KeyError if absent)."""
        store = self.data
        return store[item]

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``, replacing any previous value."""
        self.data.update({key: value})

In [14]:
def _read_opus(data_dir, set_type):
    """Read ``<data_dir>.tsv`` and return its rows as ``InputBTExample`` objects.

    Args:
        data_dir: path of the TSV file without the ``.tsv`` extension.
        set_type: label used as the prefix of every example's guid
            (``"<set_type>-<row index>"``).

    Returns:
        List of ``InputBTExample``; all field values are kept as strings.
        An empty file yields an empty list and blank rows are skipped.

    Raises:
        IndexError: if a non-blank row has fewer than 10 columns.
    """
    file_path = data_dir + '.tsv'
    dataset = []
    # newline='' is the csv module's recommended way to open files so that
    # quoted fields containing line breaks are parsed correctly.
    with open(file_path, encoding='utf8', newline='') as f:
        rows = csv.reader(f, delimiter='\t')
        next(rows, None)  # skip the header row; tolerate a completely empty file
        for i, row in enumerate(rows):
            if not row:
                # csv yields [] for blank lines; skip them instead of crashing.
                continue
            # Column order: target, idx, en, cn_target, cn_idx, cn, bt_target, bt_idx, bt, pass_bt
            t, idx, en, zh_t, zh_idx, zh, bt_t, bt_idx, bt, pass_bt = (
                row[col] for col in range(10)
            )
            guid = "%s-%s" % (set_type, i)

            dataset.append(
                InputBTExample(guid, t, idx, en, zh_t, zh_idx, zh, bt_t, bt_idx, bt, pass_bt)
            )
    print(file_path, len(dataset))
    return dataset

def load_opus_bt(dataset_name='mohx', data_dir='data/MOH-X/opus'):
    """Load the back-translation splits into a ``RawData`` container.

    Args:
        dataset_name: name stored on the returned container; the splits are
            only read when it is 'trofi' or 'mohx'.
        data_dir: directory holding ``bt_meta.tsv`` and ``bt_liter.tsv``.

    Returns:
        ``RawData`` whose 'meta' and 'liter' entries hold the examples read by
        ``_read_opus``; for any other ``dataset_name`` the container is empty.
    """
    dataset = RawData(dataset_name)
    meta_path = os.path.join(data_dir, 'bt_meta')
    liter_path = os.path.join(data_dir, 'bt_liter')
    # Membership test: `name == 'trofi' or 'mohx'` is always truthy because the
    # bare string 'mohx' is evaluated on its own.
    if dataset_name in ('trofi', 'mohx'):
        dataset['meta'] = _read_opus(meta_path, 'meta')
        dataset['liter'] = _read_opus(liter_path, 'liter')
    return dataset

In [15]:
# Load both splits; _read_opus prints each file path with its row count.
bt_opus = load_opus_bt()

data/MOH-X/opus/bt_meta.tsv 315
data/MOH-X/opus/bt_liter.tsv 332


In [16]:
# Sanity check: show the Chinese translation stored on the first 'meta' example.
display(bt_opus['meta'][0].zh)

'入侵者在全国各地散布他们的语言。'

In [66]:
def count_pass(data):
    """Print, for every split in ``data.data``, how many samples have ``pass_bt == '1'``.

    One line is printed per split with the count, the split size and the ratio
    formatted to four decimals. Nothing is returned.
    """
    for split_name in data.data:
        samples = data[split_name]
        total = len(samples)
        # pass_bt is compared as the string '1'.
        n_pass = sum(1 for sam in samples if sam.pass_bt == '1')
        rate = f'{n_pass / total:.4f}'
        print(f'{n_pass} of {total} {split_name} pass back-translation. rate: {rate}')

In [67]:
# Report the back-translation pass rate of each split.
count_pass(bt_opus)

44 of 315 meta pass back-translation. rate: 0.1397
66 of 332 liter pass back-translation. rate: 0.1988


In [19]:
def align_shap(seq, index, bt=False):
    """Explain one translation with SHAP and show the output token aligned to a source token.

    Args:
        seq: sentence to translate.
        index: when ``bt`` is false, the 0-based position of the target word in
            the space-split sentence (resolved through ``tokenize_by_index``);
            when ``bt`` is true, a position in the tokenizer's ``input_ids``.
        bt: false uses "models/opus-mt-en-zh", true uses "models/opus-mt-zh-en".

    Displays the selected source token, the translation, the output token at
    the position with the largest SHAP value in the source token's row, and
    the SHAP text plot. Returns None.
    """
    model_dir = "models/opus-mt-zh-en" if bt else "models/opus-mt-en-zh"
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

    if bt:
        # `index` already addresses a token of the encoded sentence.
        encoded = tokenizer(seq, padding=True,truncation=True,max_length=512,return_tensors="pt")
        source_word = tokenizer.decode(encoded['input_ids'][0][index])
        shap_row = index
    else:
        # Map the word position to its first token; the span counts a leading
        # bos placeholder, hence the -1 for the SHAP row.
        flat_ids, span = tokenize_by_index(tokenizer, seq, index=index)
        source_word = tokenizer.decode(flat_ids[span[0]])
        shap_row = span[0]-1
    display(source_word)

    explainer = shap.Explainer(model, tokenizer)
    shap_values = explainer([seq], fixed_context=1)
    attributions = shap_values.values[0]
    cor_idx = attributions[shap_row].argmax()

    # NOTE(review): tokenizer and model are loaded again after the explainer ran —
    # presumably because the explainer may alter or move the model; confirm before removing.
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

    encoded = tokenizer(seq,padding=False,truncation=True,max_length=512,return_tensors="pt")
    outputs = model.generate(**encoded)

    translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    display(translation)
    # +1 skips the extra leading token of the generated sequence.
    aligned_token = tokenizer.decode(outputs[0][cor_idx+1], skip_special_tokens=True)
    display(aligned_token)
    shap.plots.text(shap_values)
    

In [68]:
# Run the SHAP alignment on the first 'meta' example.
sample = bt_opus['meta'][0]
# 0: explain the en->zh direction; a truthy value explains zh->en instead.
bt = 0
# Only the selected side is read and converted.
sentence, index = (
    (sample.zh, int(sample.zh_idx)) if bt else (sample.en, int(sample.idx))
)
align_shap(sentence, index, bt)

'spread'

['入侵者在全国各地散布他们的语言。']

'散布'

Unnamed: 0_level_0,The,invade,rs,spread,their,language,all,over,the,country,.,Unnamed: 12_level_0
Unnamed: 0_level_1,0.65,0.879,-0.063,-0.058,-0.205,-0.138,-0.24,-0.194,-0.197,-0.047,-0.028,-0.0
入侵,1.04,10.593,0.438,0.379,0.098,-0.113,-0.079,0.08,-0.099,0.054,0.11,-0.0
者,0.235,1.546,2.365,-0.016,0.078,-0.029,-0.032,-0.083,-0.075,0.129,0.1,0.0
在全国各地,-0.67,-0.514,0.433,2.1,-0.425,0.732,1.028,3.615,1.193,2.885,0.035,0.0
散布,0.267,0.607,-0.298,4.931,0.397,1.056,0.184,0.579,0.062,-0.292,0.184,-0.0
他们的,0.067,0.482,-0.104,0.446,6.255,0.539,-0.565,-0.372,-0.02,0.099,0.04,0.0
语言,-0.345,0.055,-0.012,-0.189,-0.875,12.011,-0.581,-0.641,-0.368,-0.135,-0.256,0.0
。,1.218,-0.479,0.514,0.717,0.024,-0.776,-1.241,0.8,-0.021,0.287,1.996,0.0


In [21]:
from nltk.translate.bleu_score import sentence_bleu
def bleu_score(reference, candidate, weights=(0,0,0,0)):
    """Sentence-level BLEU of ``candidate`` against a single ``reference``.

    Both strings are split on whitespace and passed to ``sentence_bleu``
    without a smoothing function.

    Note: the default ``weights`` gives every n-gram order a weight of 0;
    the callers in this notebook always pass explicit weights.
    """
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    # sentence_bleu expects a list of references, hence the wrapping list.
    return sentence_bleu(
        [reference_tokens], candidate_tokens, weights, smoothing_function=None
    )

In [22]:
# BLEU of the first 'meta' example: original English vs. its back-translation.
sample =  bt_opus['meta'][0]
reference = sample.en
candidate = sample.bt
# x0: equal weights on 1- to 4-grams; x1: unigrams only.
x0 = bleu_score(reference, candidate, (0.25,0.25,0.25,0.25))
x1 = bleu_score(reference, candidate, (1,0,0,0))
display(reference)
display(candidate)
display(x0)
x1

'The invaders spread their language all over the country.'

'The invaders spread their language throughout the country.'

0.5247357977607321

0.772184789761521

In [23]:
def bleu_sum(data, set_name):
    """Average BLEU-1 and BLEU-4 of back-translations over ``data``.

    Each sample's ``en`` is the reference and ``bt`` the candidate. The two
    means are formatted to three decimals, printed with ``set_name`` and
    returned as ``(bleu1, bleu4)`` strings.
    """
    total_b1 = 0
    total_b4 = 0
    for sam in data:
        ref, hyp = sam.en, sam.bt
        # 4-gram score uses equal weights; 1-gram score uses unigrams only.
        total_b4 += bleu_score(ref, hyp, (0.25,0.25,0.25,0.25))
        total_b1 += bleu_score(ref, hyp, (1,0,0,0))
    n_samples = len(data)
    sbs1 = f'{total_b1 / n_samples:.3f}'
    sbs4 = f'{total_b4 / n_samples:.3f}'
    print(F'BLEU-1 score: {sbs1}; BLEU-4 score: {sbs4} for dataset {set_name}')
    return sbs1, sbs4

In [24]:
# Mean BLEU per split; only the last call's return value is echoed as cell output.
bleu_sum(bt_opus['meta'], 'metaphor')
bleu_sum(bt_opus['liter'], 'non-meta')

BLEU-1 score: 0.492; BLEU-4 score: 0.135 for dataset metaphor
BLEU-1 score: 0.523; BLEU-4 score: 0.142 for dataset non-meta


('0.523', '0.142')

In [25]:
def sift_pass_bt(data):
    """Split ``data`` into samples whose ``pass_bt`` is ``'1'`` and all the others.

    Prints both group sizes and returns ``(passed, non_pass)`` as two lists,
    each keeping the input order.
    """
    groups = {True: [], False: []}
    for sam in data:
        # pass_bt is compared as the string '1'.
        groups[sam.pass_bt == '1'].append(sam)
    passed, non_pass = groups[True], groups[False]
    print(f'{len(passed)} passed, {len(non_pass)} no pass.')
    return passed, non_pass

In [26]:
# Split each set into passed (pb) and non-passed (npb) back-translations.
meta_pb, meta_npb = sift_pass_bt(bt_opus['meta'])
liter_pb, liter_npb = sift_pass_bt(bt_opus['liter'])

44 passed, 271 no pass.
66 passed, 266 no pass.


In [43]:
# Mean BLEU separately for samples that passed back-translation and those that did not.
display('passed bt')
bleu_sum(meta_pb, 'metaphor passed bt')
bleu_sum(liter_pb, 'literal passed bt')
display('no passed bt')
bleu_sum(meta_npb, 'metaphor no passed bt')
bleu_sum(liter_npb, 'literal no passed bt')

'passed bt'

BLEU-1 score: 0.697; BLEU-4 score: 0.418 for dataset metaphor passed bt
BLEU-1 score: 0.684; BLEU-4 score: 0.353 for dataset literal passed bt


'no passed bt'

BLEU-1 score: 0.458; BLEU-4 score: 0.090 for dataset metaphor no passed bt
BLEU-1 score: 0.483; BLEU-4 score: 0.089 for dataset literal no passed bt


('0.483', '0.089')

In [44]:
import nltk
# Fetch the 'omw-1.4' NLTK data package — presumably required by the METEOR scorer
# used in the following cells; TODO confirm.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/abdn/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [45]:
from nltk.translate.meteor_score import single_meteor_score
def meteor_sum(data, set_name):
    """Average METEOR score of back-translations over ``data``.

    Each sample's ``en`` is the reference and ``bt`` the candidate; both are
    split on whitespace before scoring. The mean is formatted to three
    decimals, printed with ``set_name`` and returned as a string.
    """
    total = 0
    for sam in data:
        ref_tokens = sam.en.split()
        hyp_tokens = sam.bt.split()
        total += single_meteor_score(ref_tokens, hyp_tokens)
    sms = f'{total / len(data):.3f}'
    print(F'METEOR score: {sms} for dataset {set_name}')
    return sms
    

In [46]:
# Mean METEOR per split; only the last call's return value is echoed as cell output.
meteor_sum(bt_opus['meta'], 'metaphor')
meteor_sum(bt_opus['liter'], 'literal')

METEOR score: 0.532 for dataset metaphor
METEOR score: 0.559 for dataset literal


'0.559'

In [47]:
# Mean METEOR separately for samples that passed back-translation and those that did not.
display('passed bt')
meteor_sum(meta_pb, 'metaphor passed bt')
meteor_sum(liter_pb, 'literal passed bt')
display('no passed bt')
meteor_sum(meta_npb, 'metaphor no passed bt')
meteor_sum(liter_npb, 'literal no passed bt')

'passed bt'

METEOR score: 0.740 for dataset metaphor passed bt
METEOR score: 0.709 for dataset literal passed bt


'no passed bt'

METEOR score: 0.498 for dataset metaphor no passed bt
METEOR score: 0.522 for dataset literal no passed bt


'0.522'

In [54]:
from rouge import Rouge
def rouge_sum(data, set_name, rpf='r'):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L of back-translations over ``data``.

    Args:
        data: samples with ``en`` (reference) and ``bt`` (candidate) strings.
        set_name: label used in the printed summary.
        rpf: which component to average: 'r' (recall), 'p' (precision),
            or 'f' / 'f1' (F-score).

    Returns:
        ``(rouge1, rouge2, rougeL)`` means as strings with three decimals.

    Raises:
        ValueError: if ``rpf`` is not one of the supported selectors, instead
            of silently reporting 0.000 for every score.
    """
    metric_keys = {'r': 'r', 'p': 'p', 'f': 'f', 'f1': 'f'}
    if rpf not in metric_keys:
        raise ValueError(
            f"rpf must be one of {sorted(metric_keys)}, got {rpf!r}"
        )
    metric = metric_keys[rpf]

    rouger = Rouge()
    rs1 = 0
    rs2 = 0
    rsl = 0
    for sam in data:
        # get_scores takes the candidate first, then the reference.
        scores = rouger.get_scores(sam.bt, sam.en)[0]
        rs1 += scores['rouge-1'][metric]
        rs2 += scores['rouge-2'][metric]
        rsl += scores['rouge-l'][metric]

    n_samples = len(data)
    srs1 = format(rs1/n_samples, '.3f')
    srs2 = format(rs2/n_samples, '.3f')
    srsl = format(rsl/n_samples, '.3f')
    print(F'based on {rpf}, Rouge-1 score: {srs1}, Rouge-2 score: {srs2}, Rouge-L score: {srsl} for dataset {set_name}')
    return srs1, srs2, srsl

In [55]:
# Mean ROUGE per split, first using recall (the default) and then the F-score.
rouge_sum(bt_opus['meta'], 'metaphor')
rouge_sum(bt_opus['liter'], 'literal')
rouge_sum(bt_opus['meta'], 'metaphor', 'f1')
rouge_sum(bt_opus['liter'], 'literal', 'f1')

based on r, Rouge-1 score: 0.547, Rouge-2 score: 0.322, Rouge-L score: 0.535 for dataset metaphor
based on r, Rouge-1 score: 0.576, Rouge-2 score: 0.334, Rouge-L score: 0.563 for dataset literal
based on f1, Rouge-1 score: 0.547, Rouge-2 score: 0.320, Rouge-L score: 0.535 for dataset metaphor
based on f1, Rouge-1 score: 0.579, Rouge-2 score: 0.335, Rouge-L score: 0.567 for dataset literal


('0.579', '0.335', '0.567')

In [57]:
# Mean ROUGE recall separately for samples that passed back-translation and those that did not.
display('passed bt')
rouge_sum(meta_pb, 'metaphor passed bt')
rouge_sum(liter_pb, 'literal passed bt')
display('no passed bt')
rouge_sum(meta_npb, 'metaphor no passed bt')
rouge_sum(liter_npb, 'literal no passed bt')

'passed bt'

based on r, Rouge-1 score: 0.755, Rouge-2 score: 0.592, Rouge-L score: 0.737 for dataset metaphor passed bt
based on r, Rouge-1 score: 0.726, Rouge-2 score: 0.527, Rouge-L score: 0.713 for dataset literal passed bt


'no passed bt'

based on r, Rouge-1 score: 0.513, Rouge-2 score: 0.278, Rouge-L score: 0.502 for dataset metaphor no passed bt
based on r, Rouge-1 score: 0.539, Rouge-2 score: 0.286, Rouge-L score: 0.526 for dataset literal no passed bt


('0.539', '0.286', '0.526')

In [38]:
from rouge import Rouge
# ROUGE of the first 'meta' example: back-translation (candidate) vs. original English (reference).
rouger = Rouge()
sam = bt_opus['meta'][0]
ref = sam.en
display(ref)
can = sam.bt
display(can)
# get_scores takes the candidate first, then the reference.
x=rouger.get_scores(can, ref)
x[0]

'The invaders spread their language all over the country.'

'The invaders spread their language throughout the country.'

{'rouge-1': {'r': 0.7777777777777778, 'p': 0.875, 'f': 0.823529406782007},
 'rouge-2': {'r': 0.625, 'p': 0.7142857142857143, 'f': 0.6666666616888889},
 'rouge-l': {'r': 0.7777777777777778, 'p': 0.875, 'f': 0.823529406782007}}

In [32]:
# Show the metaphor table grouped by its pass_bt value.
meta_group = meta.groupby('pass_bt')
for passed, group in meta_group:
    display(passed)
    display(group)

0

Unnamed: 0,target,idx,en,cn_target,cn_idx,cn,bt_target,bt_idx,bt,pass_bt
1,climbing,2,Sales were climbing after prices were lowered.,上升,6,价格下跌后销售额上升。,rose,1,Sales rose after prices fell.,0
2,glared,2,The moon glared back at itself from the lake's...,闪,8,月亮从湖面的表面闪闪发光,shine,2,The moon shines from the surface of the lake.,0
5,rotate,1,We rotate the crops so as to maximize the use ...,轮流,1,"我们轮流种植作物,以便最大限度地利用土壤。",rotation,6,Crops are grown on a rotational basis in order...,0
6,injected,1,He injected new life into the performance.,注入,3,他给表演注入了新的生命,breathe,1,He breathed a new life into the show.,0
7,gravitated,2,The conversation gravitated towards politics.,引,2,对话引向政治。,leads,1,Dialogue leads to politics.,0
...,...,...,...,...,...,...,...,...,...,...
309,precipitated,4,The bridge broke and precipitated the train in...,撞,7,"桥断了,把火车撞到下面的河里",hit,7,The bridge was broken and the train hit the ri...,0
310,boost,4,The tax cut will boost the economy.,刺激,4,减税将刺激经济。,stimulate,3,Tax cuts will stimulate the economy.,0
312,wrestled,1,He wrestled all his life with his feeling of i...,摔,7,他一生都带着自卑的感觉摔跤,wrestling,4,He's been wrestling his whole life with a low ...,0
313,showered,2,He was showered with praise.,洗,5,他被赞美地洗了澡。,and,5,He was glorified and bathed.,0


1

Unnamed: 0,target,idx,en,cn_target,cn_idx,cn,bt_target,bt_idx,bt,pass_bt
0,spread,2,The invaders spread their language all over th...,散布,4,入侵者在全国各地散布他们的语言。,spread,3,The invaders spread their language throughout ...,1
3,attacking,4,The cancer cells are attacking his liver.,攻击,4,癌细胞正在攻击他的肝脏,attacking,3,Cancer cells are attacking his liver.,1
4,attacked,7,The editors of the left- leaning paper attacke...,攻击,6,左倾报纸的编辑攻击了新上议院议长。,attacked,8,The editor of the left-hand newspaper attacked...,1
9,poison,0,poison someone's mind.,毒,1,毒害某人的心灵,poison,1,To poison someone's mind.,1
10,abuse,1,Don't abuse the system.,滥用,1,不要滥用系统。,abuse,3,Don't abuse the system.,1
12,hurt,3,Our business was hurt by the new competition.,伤害,8,我们的生意受到了新竞争对手的伤害,hurt,4,Our business has been hurt by the new competitor.,1
21,rotate,3,Interns have to rotate for a few months.,轮调,3,实习生必须轮调几个月。,rotate,3,Interns must rotate for several months.,1
25,salute,1,I salute your courage!,向你,1,我向你的勇气致敬!,salute,1,I salute your courage!,1
32,buy,2,I can't buy this story.,买,1,我不能买这个故事。,buy,4,I can't buy this story.,1
35,interrupted,1,She interrupted her pregnancy.,中断,1,她中断了妊娠。,interrupted,1,She interrupted her pregnancy.,1


In [None]:
# Show the literal table grouped by its pass_bt value.
liter_group = liter.groupby('pass_bt')
for passed, group in liter_group:
    display(passed)
    display(group)