# Data Preparing

In [1]:
import os
import csv
import random
import numpy as np
import pandas as pd

In [2]:
situation_list = ['apology','request','thanksgiving']
sen_type_list = ['query','res']
src_type = 'translated' #'translated'
ver_name = '600_culturize_all_both_lenpenalty20_direct'
save_dir = f'data/{ver_name}/'
data_dir = f'data/{ver_name}/'
label_orientation = 'direct'
intense_orientation = 'direct'

In [3]:
def get_data_as_list(path):
    data = []
    with open(path, 'r', encoding='utf-8-sig')as f:
        reader = csv.reader(f)
        for row in reader:
            data.append(row[0])
    return data


In [4]:
def get_datadf(situation_list,sen_type_list,src_type):
    df = pd.DataFrame(columns=["input_text", "target_text"])
    for situation in situation_list:
        for sen_type in sen_type_list:
            for corpus in ['mpdd','cejc']:
                if src_type == 'original': 
                    src_path = f'/nfs/nas-7.1/yamashita/LAB/giza-pp/data/{corpus}/{situation}/{src_type}_{sen_type}.csv'
                elif src_type == 'translated':     
                    src_path = f'/nfs/nas-7.1/yamashita/LAB/giza-pp/data/{corpus}/{situation}/{src_type}_{sen_type}.csv'     
                tgt_path = f'/nfs/nas-7.1/yamashita/LAB/giza-pp/data/{corpus}/{situation}/rewrited_{sen_type}.csv'
                
                src_data = get_data_as_list(src_path)
                tgt_data = get_data_as_list(tgt_path)
                
                tmp_df = pd.DataFrame([src_data,tgt_data],index=['input_text','target_text'],columns=[src_path[40:]]*len(src_data))
                tmp_df = tmp_df.T
                
                tmp_df['prefix'] = f'{situation} {sen_type}'
                
                df = pd.concat([df,tmp_df])
    df = df.reset_index().set_axis(['fname','input_text','target_text','prefix'],axis=1)
    return df


In [5]:
# tgt_list = ['ja','zh']
os.makedirs(save_dir, exist_ok=True)

data_df = get_datadf(situation_list,sen_type_list,src_type)

pureidx = np.arange(len(data_df))
val_idx = pureidx[5::10]
test_idx = pureidx[::10]

ind = np.ones(len(data_df), dtype=bool)
ind[val_idx] = False
ind[test_idx] = False
train_idx = pureidx[ind]
# print(len(data_df))
# print(train_idx.shape)
# print(test_idx.shape)
# print(val_idx.shape)

train_df = data_df.iloc[train_idx]
val_df = data_df.iloc[val_idx]
test_df = data_df.iloc[test_idx]

train_df.to_csv(save_dir+'train.csv', index=None, encoding='utf_8_sig')
val_df.to_csv(save_dir+'val.csv', index=None, encoding='utf_8_sig')
test_df.to_csv(save_dir+'test.csv', index=None, encoding='utf_8_sig')


# Finetune

In [6]:
# import logging
# import pandas as pd
# from simpletransformers.t5 import T5Model, T5Args

# logging.basicConfig(level=logging.INFO)
# transformers_logger = logging.getLogger("transformers")
# transformers_logger.setLevel(logging.WARNING)
# # 
# data_dir = f'data/{ver_name}/'
# train_df = pd.read_csv(f"{data_dir}train.csv").astype(str)
# eval_df = pd.read_csv(f"{data_dir}val.csv").astype(str)
# # train_df["prefix"] = ""
# # eval_df["prefix"] = ""
# train_df


In [7]:
# model_args = T5Args()
# model_args.length_penalty = 20
# model_args.max_seq_length = 256
# model_args.train_batch_size = 4
# model_args.eval_batch_size = 4
# model_args.num_train_epochs = 20
# model_args.evaluate_during_training = True
# model_args.evaluate_during_training_steps = 500
# model_args.use_multiprocessing = False
# model_args.fp16 = False
# model_args.early_stopping_metric = 'eval_loss'
# model_args.early_stopping_metric_minimize = True
# model_args.early_stopping_patience = 3
# model_args.use_early_stopping = True
# model_args.save_eval_checkpoints = True
# model_args.save_eval_checkpoints = False
# model_args.learning_rate = 3e-5
# model_args.best_model_dir = f'outputs/{ver_name}/best_model/'
# model_args.output_dir = f'outputs/{ver_name}/ckpt/'
# model_args.save_model_every_epoch = True
# model_args.save_steps = -1
# model_args.no_cache = True
# model_args.reprocess_input_data = True
# model_args.overwrite_output_dir = True
# model_args.preprocess_inputs = False
# model_args.num_return_sequences = 1
# model_args.wandb_project = ver_name

# model = T5Model("mt5", "google/mt5-base", args=model_args, cuda_device=1)
# # Train the model
# os.environ['WANDB_CONSOLE'] = 'off'
# model.train_model(train_df[['prefix','input_text','target_text']], eval_data=eval_df[['prefix','input_text','target_text']])


# Finetune with culturize label prefix

In [8]:
labeled_table_paths = ['JIWC_diff_reason_table.csv', 'CLIWC_diff_reason_table.csv']

# label_orientation_list = ["direct","intense","intense","intense","perspective"]
# intense_orientation_list =['','all','downgrader','specific','']

ja_sig_list=[   ['del','cejc','query','request','Trust'],
                ['del','cejc','query','thanksgiving','Trust'],
                ['del','cejc','res','request','Trust'],
                ['add','mpdd','query','apology','Disgust'],
                ['add','mpdd','query','request','Sadness'], 
                ['add','mpdd','query','request','Disgust'],
                ['add','mpdd','query','request','Joy'],
                ['add','mpdd','query','thanksgiving','Sadness'],
                ['add','mpdd','query','thanksgiving','Disgust'],
                ['add','mpdd','query','thanksgiving','Trust'], 
                ['add','mpdd','query','thanksgiving','Joy'],
                ['add','mpdd','res','request','Sadness'],
                ['add','mpdd','res','request','Disgust'],
                ['add','mpdd','res','request','Trust'],
                ['add','mpdd','res','request','Joy'],
                ['add','mpdd','res','thanksgiving','Sadness']]
zh_sig_list=[   ['del',	'mpdd',	'query',	'request',		'affect'],
                ['del',	'mpdd',	'query',	'request',		'negemo'],
                ['del',	'mpdd',	'query',	'request',		'anger'],
                ['del',	'mpdd',	'res',	'thanksgiving',	'affect'],
                ['add',	'cejc',	'query',	'apology',	    'affect'],
                ['add',	'cejc',	'query',	'apology',	    'posemo'],
                ['add',	'cejc',	'query',	'apology',	    'negemo'],
                ['add',	'cejc',	'query',	'apology',	    'anger'],
                ['add',	'cejc',	'query',	'request',	    'negemo'],
                ['add',	'cejc',	'res',	'request',	    'affect'],
                ['add',	'cejc',	'res',	'request',	    'posemo']]

MT_data_list,HT_data_list,prefix_list,columns_list = [],[],[],[]
for labeled_table_path in labeled_table_paths:
    columns_name=['diff_type','corpus','situation','sen_type','emotion','word','htmt','line','part','effect','direct','intense','perspective']
    df = pd.read_csv(f'/nfs/nas-7.1/yamashita/LAB/giza-pp/sentiment_analysis/{labeled_table_path}', names=columns_name)
    
    if label_orientation == "intense" and intense_orientation == "all":
        more =   ['lessdowngrader','moreupgrader','morespecific','lessrespectful','lesshumble','add_expect_sth_in_return','add_irony']
        less = ['moredowngrader','lessspecific','lessupgrader','morerespectful','morehumble','rmv_expect_sth_in_return','rmv_irony']
        for m, l in zip(more, less):
            df=df.replace(m,'moreintense')
            df=df.replace(l,'lessintense')
    elif label_orientation == "intense":
        pass
    
    if labeled_table_path == 'JIWC_diff_reason_table.csv':
        sig_list = ja_sig_list
    else:
        sig_list = zh_sig_list
        
    for s in sig_list:
        diff_type=s[0]
        corpus=s[1]
        sen_type=s[2]
        situation=s[3]
        emotion=s[4]
        # FILTER TABLE
        df = df.dropna(subset=[label_orientation])
        emo_cond = df['diff_type'].isin([diff_type]) & df['corpus'].isin([corpus]) & df['sen_type'].isin([sen_type]) & df['situation'].isin([situation]) & df['emotion'].isin([emotion]) & df['htmt'].isin(['HT'])
        gizamiss_cond = df['part'].isin(['gizamiss','labelmiss'])
        line_list = df[emo_cond&~gizamiss_cond]['line'].to_list()
        label_list = df[emo_cond&~gizamiss_cond][label_orientation].to_list()
#         print(label_list)
        # GET DATA
        MT_path = f'/nfs/nas-7.1/yamashita/LAB/giza-pp/data/{corpus}/{situation}/translated_{sen_type}.csv'
        HT_path = f'/nfs/nas-7.1/yamashita/LAB/giza-pp/data/{corpus}/{situation}/rewrited_{sen_type}.csv'
        MT_data = get_data_as_list(MT_path)
        HT_data = get_data_as_list(HT_path)

        for line,label in zip(line_list,label_list):
            MT_data_list.append(MT_data[line])
            HT_data_list.append(HT_data[line])
            prefix_list.append(label)
            columns_list.append(MT_path[40:])
# print(MT_data_list)
tmp_df = pd.DataFrame([prefix_list,MT_data_list,HT_data_list],index=['prefix','input_text','target_text'],columns=columns_list)
tmp_df = tmp_df.T
data_df = tmp_df.drop_duplicates() 
data_df = data_df.reset_index().set_axis(['fname','prefix','input_text','target_text',],axis=1)
display(data_df)
pureidx = np.arange(len(data_df))
val_idx = pureidx[5::10]
test_idx = pureidx[::10]

ind = np.ones(len(data_df), dtype=bool)
ind[val_idx] = False
ind[test_idx] = False
train_idx = pureidx[ind]
# print(len(data_df))
# print(train_idx.shape)
# print(test_idx.shape)
# print(val_idx.shape)

train_df = data_df.iloc[train_idx]
val_df = data_df.iloc[val_idx]
test_df = data_df.iloc[test_idx]

train_df.to_csv(save_dir+'train.csv', encoding='utf_8_sig')
val_df.to_csv(save_dir+'val.csv', encoding='utf_8_sig')
test_df.to_csv(save_dir+'test.csv', encoding='utf_8_sig')
# display(test_df)

Unnamed: 0,fname,prefix,input_text,target_text
0,cejc/request/translated_query.csv,moredirect,這樣的話，+如果我不在現場，耀世賣了，+也許我可以給耀世一些保證金。,如果真的要把工作交給耀西的話...能不能給他好一點的利潤啊？
1,cejc/request/translated_query.csv,lessdirect,是的，我知道。還有奶酪棒，謝謝。,好。那我要點一份炸起司條。
2,cejc/thanksgiving/translated_query.csv,moredirect,是的，先生。。謝謝你。。是的，我知道。對不起，我不知道。謝謝你。。好吧，那就+今天這款打九折...,好的。這裡為您結帳。今天打9折之後總共是800元。這裡收您1000元，請問您有本店的集點卡嗎？
3,cejc/thanksgiving/translated_query.csv,moredirect,我明白了。好的，先生。。那麼需要多長時間呢？。冒險課程和天幕課程。。是的，我知道。啊。。好吧...,原來如此，我知道了。那請問一下森林探險行程和露營行程差不多會花多少時間呢？好的，啊...這樣...
4,mpdd/apology/translated_query.csv,lessdirect,ごめんね！ ここ数日、家では色々あったんですが 伝えたかったのですが、家庭の事情で忘れてしま...,ごめんね。最近忙しくて。本当は言いたかったんだけど、手が回らなかったの。今日帰ったら絶対言うから。
...,...,...,...,...
103,cejc/apology/translated_query.csv,moredirect,啊。。對不起，我不知道。謝謝你。,阿，不好意思，麻煩你了。
104,cejc/apology/translated_query.csv,moredirect,啊。。是的，我知道。擦。。對不起，我不知道。謝謝你。。這已經是事實了，不是嗎？。所以說，髒點...,啊，不好意思麻煩你了。讓我擦一下。雖然說實在的髒髒的也沒關係啦...
105,cejc/apology/translated_query.csv,moredirect,And+the+kids+mikoshi+came+out+well+the+first+b...,誒，小朋友開始扛神轎的時候，誒...一開始是輪到石井休息。小朋友往前一點之後就換舞獅上場。這...
106,cejc/request/translated_query.csv,lessdirect,我想讓你現在就吃。,可是你不現在吃的話，就不好吃了...


In [9]:
import logging
import sacrebleu
import pandas as pd
from simpletransformers.t5 import T5Model, T5Args


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

data_dir = f'data/{ver_name}/'
train_df = pd.read_csv(f"{data_dir}train.csv").astype(str)
eval_df = pd.read_csv(f"{data_dir}val.csv").astype(str)
# train_df["prefix"] = ""
# eval_df["prefix"] = ""
display(eval_df)

model_args = T5Args()
model_args.length_penalty = 20
model_args.max_seq_length = 256
model_args.train_batch_size = 4
model_args.eval_batch_size = 4
model_args.num_train_epochs = 10
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 500
model_args.use_multiprocessing = False
model_args.fp16 = False
model_args.early_stopping_metric = 'eval_loss'
model_args.early_stopping_metric_minimize = True
model_args.early_stopping_patience = 3
model_args.use_early_stopping = True
model_args.save_eval_checkpoints = True
model_args.save_eval_checkpoints = False
# model_args.learning_rate = 3e-5
model_args.learning_rate = 3e-8
model_args.best_model_dir = f'outputs/{ver_name}/best_model/'
model_args.output_dir = f'outputs/{ver_name}/ckpt/'
model_args.save_model_every_epoch = True
model_args.save_steps = -1
model_args.no_cache = True
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.preprocess_inputs = False
model_args.num_return_sequences = 1
model_args.wandb_project = ver_name

model = T5Model("mt5", f'outputs/100_culturize_all_both_lenpenalty20/best_model/', args=model_args, cuda_device=1)
# Train the model
os.environ['WANDB_CONSOLE'] = 'off'
model.train_model(train_df[['prefix','input_text','target_text']], eval_data=eval_df[['prefix','input_text','target_text']])


Unnamed: 0.1,Unnamed: 0,fname,prefix,input_text,target_text
0,5,mpdd/apology/translated_query.csv,lessdirect,謝るのは当然のことです。 気持ちはわかるけど、...... 気持ちは2人の人事です、わかりますか？,謝るのは私の方だよ。そういう風に思わてるって知ってたんだから。でも、そういうのは二人のことだ...
1,15,mpdd/request/translated_query.csv,lessdirect,ジジュン、もう少し静かにしてくれないか？ 早朝のお喋りしか聞こえてこない! こんなに騒いでた...,静かにしてくれ。朝からギャーギャー言わないでくれ。こんなうるさい女を嫁にしたい男がいると思うか？
2,25,mpdd/request/translated_res.csv,lessdirect,情熱の瞬間に人を傷つけたいのか？ 鶯と魏が今どれだけ動揺して嫌われているか知っているのか？,切羽詰まったら何してもいいってこと？ どれだけ二人がつらい思いしたか想像できる？
3,35,mpdd/request/translated_res.csv,lessdirect,皆さん、本当にごめんなさい! カップルはここの結婚式場が綺麗でロマンチックだとは思っていませ...,特に出来の良い子が、いろいろと事情があって今日まで結婚式を挙げられなかったのですが……。はい...
4,45,mpdd/request/translated_res.csv,lessdirect,爺さんが話す必要はない、俺はマジュンと白鳩に話しかけてくる。 彼らの家族は林野局の出身者です...,工場長じゃなくて、馬軍、白鴿に言えばいいよ。みんな林業局の一家だし、聴いてもらえるんじゃないかな。
5,55,mpdd/request/translated_query.csv,lessdirect,先生のお母様、生徒一人一人の親として、対等であるべきです。 クラスの楊貴妃が劉延を追いかけて...,私たちみんなあなたの学生ですよね。楊さんは劉さんが好きみたいだけど、上手くいっていないらしい...
6,65,mpdd/request/translated_query.csv,lessdirect,今日、私、陳志明は、ここにいる皆さんに証言を求めます、私は、私のガールフレンドである張愛との...,誓いの言葉を聴いてください。私、陳子明は、恋人の張愛を一生大事にします。絶対に傷つけませんし...
7,75,mpdd/request/translated_query.csv,lessdirect,そう願いましょう。 もちろんシャオドンは下手くそではないし、スッピンなので、こんなゴタゴタを...,そうですね。冬さんは仕事もできるし、根性もあるし、この仕事を十分こなせるでしょう。良ければ、...
8,85,mpdd/thanksgiving/translated_res.csv,lessdirect,趙斌、こんなんじゃないよ？ 過去は水に流そう 俺たちはまだ友達だからな,昔のことだろ。
9,95,mpdd/thanksgiving/translated_res.csv,lessdirect,というか、国が古参にチャンスを与えてくれたんだから、それを大事にしないといけないよね。 この...,私もよくわかってないけど、国がチャンスをくれたのに、もったいないと思わない？落ちたっていいじ...


INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=86.0), HTML(value='')))

INFO:simpletransformers.t5.t5_model: Training started



Using Adafactor for T5


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

INFO:simpletransformers.t5.t5_model:   Starting fine-tuning.
[34m[1mwandb[0m: Currently logged in as: [33mnatsukinateyamashita[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.33 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=22.0, style=ProgressStyle(des…

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)
  exp_avg_sq_row.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-1))





INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=22.0, style=ProgressStyle(des…




INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=22.0, style=ProgressStyle(des…




INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=22.0, style=ProgressStyle(des…




INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=22.0, style=ProgressStyle(des…




INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=22.0, style=ProgressStyle(des…




INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=22.0, style=ProgressStyle(des…




INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=22.0, style=ProgressStyle(des…




INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=22.0, style=ProgressStyle(des…




INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=22.0, style=ProgressStyle(des…




INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))







INFO:simpletransformers.t5.t5_model: Training of outputs/100_culturize_all_both_lenpenalty20/best_model/ model complete. Saved to outputs/600_culturize_all_both_lenpenalty20_direct/ckpt/.


(220,
 {'global_step': [22, 44, 66, 88, 110, 132, 154, 176, 198, 220],
  'eval_loss': [3.103783130645752,
   3.1018369992574057,
   3.104524294535319,
   3.102001428604126,
   3.1018239657084146,
   3.1066388289133706,
   3.105625867843628,
   3.107701222101847,
   3.109158913294474,
   3.107808748881022],
  'train_loss': [2.792762279510498,
   2.4923272132873535,
   4.447074890136719,
   1.3848614692687988,
   3.189361572265625,
   2.800020933151245,
   2.674372911453247,
   2.827849864959717,
   2.412982225418091,
   2.2289912700653076]})

# Test

In [10]:

import logging
import sacrebleu
import pandas as pd
from simpletransformers.t5 import T5Model, T5Args


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)


model_args = T5Args()
model_args.max_length = 256
model_args.length_penalty = 1
model_args.num_beams = 10

model = T5Model("mt5", f"outputs/{ver_name}/best_model/", args=model_args, cuda_device=1)

In [11]:

eval_df = pd.read_csv(f"{data_dir}test.csv").astype(str)
display(eval_df)
to_ja_truth = [eval_df.loc[eval_df["fname"].str.contains("mpdd")]["target_text"].tolist()]
to_ja_input = eval_df.loc[eval_df["fname"].str.contains("mpdd")]["input_text"].tolist()

to_zh_truth = [eval_df.loc[eval_df["fname"].str.contains("cejc")]["target_text"].tolist()]
to_zh_input = eval_df.loc[eval_df["fname"].str.contains("cejc")]["input_text"].tolist()

Unnamed: 0.1,Unnamed: 0,fname,prefix,input_text,target_text
0,0,cejc/request/translated_query.csv,moredirect,這樣的話，+如果我不在現場，耀世賣了，+也許我可以給耀世一些保證金。,如果真的要把工作交給耀西的話...能不能給他好一點的利潤啊？
1,10,mpdd/request/translated_query.csv,lessdirect,頼むから父の革命家の顔のためにも 解放してくれよ！ よろしくお願いします！」と言っていました...,父の革命家としての顔を立てて、お願い出来ませんか。お願いいたします。
2,20,mpdd/request/translated_query.csv,moredirect,李華、礼節をわきまえてください、算数の問題を議論しているんですよ! 何を知ってるんだ！,今むずかしい数学の問題を解いてるんだよ。見ればわかるだろ？ 後にしてくれ。
3,30,mpdd/request/translated_res.csv,lessdirect,柯さん、外見は軟弱だけど、言葉のキレがすごいですね!,お話するまで、こんなに鋭い方だとは思いませんでした。
4,40,mpdd/request/translated_res.csv,moredirect,主催者変更申請の報告。 これは管理事務所のためのものです。 郭爺に見せればいいんだよ。,「主催者変更の申請レポート」これは管理所の仕事だよ。郭さんに見せてくれ。
5,50,mpdd/request/translated_res.csv,lessdirect,君は自分の仕事をして 私は急いでいない,お気になさらず。私は急ぎませんから。
6,60,mpdd/request/translated_query.csv,moredirect,これは、あまり便利ではありません。 一つには、正午に事務所で休まなければならないこと、もう一...,それは困ります。私はお昼はここで休みます。それに、男女が同じオフィスにいるのもあれでしょうう...
7,70,mpdd/request/translated_query.csv,lessdirect,劉さん、あなたの選択に敬意を表します！私の心の中では素晴らしい女性です！今後の幸せな結婚をお...,そうですね。僕が間違っていました。許してください。
8,80,mpdd/request/translated_query.csv,lessdirect,誰がずっと待ってたんだよｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗｗ...,何言ってんの？ 奥様に用事があってきただけだから、誤解しないでくれる？
9,90,mpdd/thanksgiving/translated_res.csv,lessdirect,あの日のあなたは茨に覆われたハリネズミのようで、今日のあなたはあの日とは別人のようです! 態...,ずいぶん雰囲気が柔らかくなりましたね。もうひとつお願いしてもよろしいですか？ 友人として付き...


In [12]:
# Predict
to_ja_preds = model.predict(to_ja_input)
to_ja_bleu = sacrebleu.corpus_bleu(to_ja_preds, to_ja_truth)
print("--------------------------")
print("to_ja_bleu: ", to_ja_bleu.score)

to_zh_preds = model.predict(to_zh_input)

to_zh_bleu = sacrebleu.corpus_bleu(to_zh_preds, to_zh_truth)
print("--------------------------")
print("to_zh_bleu: ", to_zh_bleu.score)

HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=2.0, style=ProgressStyle(descrip…






HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=9.0, style=ProgressStyle(descripti…


--------------------------
to_ja_bleu:  4.826217438701122


HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=1.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=2.0, style=ProgressStyle(descripti…


--------------------------
to_zh_bleu:  2.3040887376159365


In [13]:
to_ja_preds.extend(to_zh_preds)
to_ja_truth_ = to_ja_truth[0]
to_ja_truth_.extend(to_zh_truth[0])

r_df = pd.DataFrame([to_ja_preds,to_ja_truth_],index=[f'{ver_name}_preds', 'truth'])
r_df.T.to_csv(f'outputs/{ver_name}/preds_truth.csv',encoding='utf_8_sig')

In [14]:
blue_df= pd.DataFrame([to_ja_bleu.score,to_zh_bleu.score], index=['to_ja_bleu.score','to_zh_bleu.score'])
blue_df.to_csv(f'outputs/{ver_name}/bluescore.csv',encoding='utf_8_sig')