# Data Preparing

In [1]:
import os
import csv
import random
import numpy as np
import pandas as pd

In [2]:
situation_list = ['apology','request','thanksgiving']
sen_type_list = ['query','res']
src_type = 'translated' #'translated'
ver_name = '500_culturize_all_both_lenpenalty20_prefixsituaiton'
save_dir = f'data/{ver_name}/'
data_dir = f'data/{ver_name}/'

In [3]:
def get_data_as_list(path):
    data = []
    with open(path, 'r', encoding='utf-8-sig')as f:
        reader = csv.reader(f)
        for row in reader:
            data.append(row[0])
    return data


In [4]:
def get_datadf(situation_list,sen_type_list,src_type):
    df = pd.DataFrame(columns=["input_text", "target_text"])
    for situation in situation_list:
        for sen_type in sen_type_list:
            for corpus in ['mpdd','cejc']:
                if src_type == 'original': 
                    src_path = f'/nfs/nas-7.1/yamashita/LAB/giza-pp/data/{corpus}/{situation}/{src_type}_{sen_type}.csv'
                elif src_type == 'translated':     
                    src_path = f'/nfs/nas-7.1/yamashita/LAB/giza-pp/data/{corpus}/{situation}/{src_type}_{sen_type}.csv'     
                tgt_path = f'/nfs/nas-7.1/yamashita/LAB/giza-pp/data/{corpus}/{situation}/rewrited_{sen_type}.csv'
                
                src_data = get_data_as_list(src_path)
                tgt_data = get_data_as_list(tgt_path)
                
                tmp_df = pd.DataFrame([src_data,tgt_data],index=['input_text','target_text'],columns=[src_path[40:]]*len(src_data))
                tmp_df = tmp_df.T
                
                tmp_df['prefix'] = f'{situation} {sen_type}'
                
                df = pd.concat([df,tmp_df])
    df = df.reset_index().set_axis(['fname','input_text','target_text','prefix'],axis=1)
    return df


In [5]:
# tgt_list = ['ja','zh']
os.makedirs(save_dir, exist_ok=True)

data_df = get_datadf(situation_list,sen_type_list,src_type)

pureidx = np.arange(len(data_df))
val_idx = pureidx[5::10]
test_idx = pureidx[::10]

ind = np.ones(len(data_df), dtype=bool)
ind[val_idx] = False
ind[test_idx] = False
train_idx = pureidx[ind]
# print(len(data_df))
# print(train_idx.shape)
# print(test_idx.shape)
# print(val_idx.shape)

train_df = data_df.iloc[train_idx]
val_df = data_df.iloc[val_idx]
test_df = data_df.iloc[test_idx]

train_df.to_csv(save_dir+'train.csv', index=None, encoding='utf_8_sig')
val_df.to_csv(save_dir+'val.csv', index=None, encoding='utf_8_sig')
test_df.to_csv(save_dir+'test.csv', index=None, encoding='utf_8_sig')


# Finetune

In [6]:
import logging
import pandas as pd
from simpletransformers.t5 import T5Model, T5Args

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

data_dir = f'data/{ver_name}/'
train_df = pd.read_csv(f"{data_dir}train.csv").astype(str)
eval_df = pd.read_csv(f"{data_dir}val.csv").astype(str)
# train_df["prefix"] = ""
# eval_df["prefix"] = ""
train_df

Unnamed: 0,fname,input_text,target_text,prefix
0,mpdd/apology/translated_query.csv,クラスの皆さん、ごめんなさい！ 今日は家族の用事で少し遅れてしまいました。 次は教科書の45...,みなさんごめんなさい！ 今日は家で少し用事があって、ちょっと遅れてしまいました。じゃあ教科書...,apology query
1,mpdd/apology/translated_query.csv,孫校長さん、こんにちは、本当に申し訳ありません、先に帰ります、私が直接お願いしたわけではあり...,もしもし、校長先生ですか？　本当にすみません。お先に失礼いたしました。事前にちゃんと申請せず...,apology query
2,mpdd/apology/translated_query.csv,すみません、少し早く聞いてしまいましたが気にしないでくださいね!,ごめんね。勘違いしてた。気にしないで。,apology query
3,mpdd/apology/translated_query.csv,鄭鵬、ごめんね、目をそらしてしまった。 今は見れば見るほど奥さんに似ている。 息子さんもかな...,ごめん、勘違いしてた。確かにお前の奥さんだよね。子供もおりこうそうだし、奥さんも本当に目がき...,apology query
4,mpdd/apology/translated_query.csv,ママ、あなたたちって意地悪だよね! 病気なのにまだ隠してたのかよ、誰に言われたんだよ、絶対に...,あいつが病気だってなんで黙ってたんだよ。もっと早く気づいてたら、大きい病院に入れられただろ。...,apology query
...,...,...,...,...
2635,cejc/thanksgiving/translated_res.csv,那麼，Shuppa。,那...,thanksgiving res
2636,cejc/thanksgiving/translated_res.csv,是的，我知道,好啦，你就快吃吧。,thanksgiving res
2637,cejc/thanksgiving/translated_res.csv,謝謝你。,不會啦，謝謝你。,thanksgiving res
2638,cejc/thanksgiving/translated_res.csv,謝謝你。,謝謝你。,thanksgiving res


In [7]:
model_args = T5Args()
model_args.length_penalty = 20
model_args.max_seq_length = 256
model_args.train_batch_size = 4
model_args.eval_batch_size = 4
model_args.num_train_epochs = 20
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 500
model_args.use_multiprocessing = False
model_args.fp16 = False
model_args.early_stopping_metric = 'eval_loss'
model_args.early_stopping_metric_minimize = True
model_args.early_stopping_patience = 3
model_args.use_early_stopping = True
model_args.save_eval_checkpoints = True
model_args.save_eval_checkpoints = False
model_args.learning_rate = 3e-5
model_args.best_model_dir = f'outputs/{ver_name}/best_model/'
model_args.output_dir = f'outputs/{ver_name}/ckpt/'
model_args.save_model_every_epoch = True
model_args.save_steps = -1
model_args.no_cache = True
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.preprocess_inputs = False
model_args.num_return_sequences = 1
model_args.wandb_project = ver_name

model = T5Model("mt5", "google/mt5-base", args=model_args, cuda_device=1)
# Train the model
os.environ['WANDB_CONSOLE'] = 'off'
model.train_model(train_df[['prefix','input_text','target_text']], eval_data=eval_df[['prefix','input_text','target_text']])

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=2640.0), HTML(value='')))

INFO:simpletransformers.t5.t5_model: Training started



Using Adafactor for T5


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=20.0, style=ProgressStyle(description_width='…

[34m[1mwandb[0m: Currently logged in as: [33mnatsukinateyamashita[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.33 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 20', max=660.0, style=ProgressStyle(de…

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)
  exp_avg_sq_row.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-1))
INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 1
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3





INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 20', max=660.0, style=ProgressStyle(de…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 20', max=660.0, style=ProgressStyle(de…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 20', max=660.0, style=ProgressStyle(de…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 20', max=660.0, style=ProgressStyle(de…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 20', max=660.0, style=ProgressStyle(de…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 20', max=660.0, style=ProgressStyle(de…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 1
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3
INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 20', max=660.0, style=ProgressStyle(de…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 20', max=660.0, style=ProgressStyle(de…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 20', max=660.0, style=ProgressStyle(de…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 1
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3
INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 10 of 20', max=660.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 11 of 20', max=660.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 12 of 20', max=660.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 1
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3
INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 13 of 20', max=660.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 14 of 20', max=660.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 1
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3





INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 15 of 20', max=660.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 2
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3
INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 16 of 20', max=660.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 1
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3





INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 17 of 20', max=660.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 2
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3





INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 18 of 20', max=660.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 1
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3
INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 19 of 20', max=660.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 1
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3





INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_model: Training of google/mt5-base model complete. Saved to outputs/500_culturize_all_both_lenpenalty20_prefixsituaiton/ckpt/.


(13200,
 {'global_step': [500,
   660,
   1000,
   1320,
   1500,
   1980,
   2000,
   2500,
   2640,
   3000,
   3300,
   3500,
   3960,
   4000,
   4500,
   4620,
   5000,
   5280,
   5500,
   5940,
   6000,
   6500,
   6600,
   7000,
   7260,
   7500,
   7920,
   8000,
   8500,
   8580,
   9000,
   9240,
   9500,
   9900,
   10000,
   10500,
   10560,
   11000,
   11220,
   11500,
   11880,
   12000,
   12500,
   12540,
   13000,
   13200],
  'eval_loss': [8.717052528657108,
   7.349445343017578,
   5.42400217630777,
   4.806024603096835,
   4.617515983351742,
   4.144890146083142,
   4.135163541299751,
   3.8356831892427192,
   3.7898039889622885,
   3.6441776407770363,
   3.5867157341486,
   3.5688836430928794,
   3.5017839483467927,
   3.5232744877596938,
   3.4633953614407273,
   3.4480710101414878,
   3.397702711174287,
   3.376139291797776,
   3.351023044930883,
   3.2869708006640517,
   3.2884069925331207,
   3.190956069762448,
   3.1710900053920517,
   3.1575078303555406,
  

# Test

In [8]:

import logging
import sacrebleu
import pandas as pd
from simpletransformers.t5 import T5Model, T5Args


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)


model_args = T5Args()
model_args.max_length = 256
model_args.length_penalty = 1
model_args.num_beams = 10

model = T5Model("mt5", f"outputs/{ver_name}/best_model/", args=model_args, cuda_device=1)

In [9]:
eval_df = pd.read_csv(f"{data_dir}test.csv").astype(str)

to_ja_truth = [eval_df.loc[eval_df["fname"].str.contains("mpdd")]["target_text"].tolist()]
to_ja_input = eval_df.loc[eval_df["fname"].str.contains("mpdd")]["input_text"].tolist()

to_zh_truth = [eval_df.loc[eval_df["fname"].str.contains("cejc")]["target_text"].tolist()]
to_zh_input = eval_df.loc[eval_df["fname"].str.contains("cejc")]["input_text"].tolist()

In [10]:
# Predict
to_ja_preds = model.predict(to_ja_input)
to_ja_bleu = sacrebleu.corpus_bleu(to_ja_preds, to_ja_truth)
print("--------------------------")
print("to_ja_bleu: ", to_ja_bleu.score)

to_zh_preds = model.predict(to_zh_input)

to_zh_bleu = sacrebleu.corpus_bleu(to_zh_preds, to_zh_truth)
print("--------------------------")
print("to_zh_bleu: ", to_zh_bleu.score)

HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=20.0, style=ProgressStyle(descri…






HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=159.0, style=ProgressStyle(descrip…


--------------------------
to_ja_bleu:  0.17748261991524125


HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=22.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=171.0, style=ProgressStyle(descrip…


--------------------------
to_zh_bleu:  0.5069245434129408


In [11]:
to_ja_preds.extend(to_zh_preds)
to_ja_truth_ = to_ja_truth[0]
to_ja_truth_.extend(to_zh_truth[0])

r_df = pd.DataFrame([to_ja_preds,to_ja_truth_],index=[f'{ver_name}_preds', 'truth'])
r_df.T.to_csv(f'outputs/{ver_name}/preds_truth.csv',encoding='utf_8_sig')

In [12]:
blue_df= pd.DataFrame([to_ja_bleu.score,to_zh_bleu.score], index=['to_ja_bleu.score','to_zh_bleu.score'])
blue_df.to_csv(f'outputs/{ver_name}/bluescore.csv',encoding='utf_8_sig')