# Data Preparation

In [1]:
import os
import csv
import random
import numpy as np
import pandas as pd

In [2]:
# Run configuration shared by every later cell.
# The three lists below are iterated by get_df() to build the per-file data paths
# and the "<corpus> <situation> <sen_type>" prefix strings.
corpus_list = ['cejc','mpdd']
situation_list = ['apology','request','thanksgiving']
sen_type_list = ['query','res']
# Leading part of the input-side file name read by get_df().
src_type = 'original' # alternative value: 'translated'
# Name of this run: used in the output directories below and as the wandb project.
ver_name = '300_culturizefromT5train_all_both'
# Path component selecting the data / output sub-directory.
context_len = 0

# Name of the earlier run whose prediction CSVs are read from data_dir in the test
# section; it is also the column name read from those CSVs.
data_ver_name = '000_translate_all_both'

data_dir = f'outputs/context/{data_ver_name}/{context_len}/'
# NOTE: bestmodel_dir is reassigned in the "Make Dataset" cell to a different
# directory, so this value is not the one in effect once that cell has run.
bestmodel_dir = f'outputs/context/{ver_name}/{context_len}/best_model/'
save_dir = f'outputs/context/{ver_name}/{context_len}/'


In [3]:
def get_data_as_list(path):
    """Read the first column of a CSV file into a list of strings.

    Args:
        path: Location of the CSV file. It is decoded as ``utf-8-sig``, so a
            leading byte-order mark is dropped.

    Returns:
        list[str]: The first field of every non-empty row, in file order.
    """
    # newline='' hands line-ending handling to the csv module, which keeps
    # line breaks embedded in quoted fields intact.
    with open(path, 'r', encoding='utf-8-sig', newline='') as f:
        # Blank lines are parsed as empty rows; indexing them would raise
        # IndexError, so they are skipped.
        return [row[0] for row in csv.reader(f) if row]

def get_df(corpus_list, situation_list, sen_type_list, src_type, context_len, train_type,
           data_root='/nfs/nas-7.1/yamashita/LAB/dialogue_data/data'):
    """Collect parallel input/target sentences into one DataFrame.

    For every (corpus, situation, sen_type) combination two files are read
    from ``{data_root}/{corpus}/{situation}/{context_len}/``:

    * ``rewrited_{sen_type}_{train_type}``   -> ``target_text``
    * ``{src_type}_{sen_type}_{train_type}`` -> ``input_text``

    Args:
        corpus_list: Corpus directory names to iterate over.
        situation_list: Situation directory names to iterate over.
        sen_type_list: Sentence-type tags used in the file names.
        src_type: Leading part of the input-side file name.
        context_len: Path component below the situation directory.
        train_type: Trailing part of both file names (e.g. 'train', 'val').
        data_root: Directory containing the per-corpus folders. Defaults to
            the location that was previously hard-coded.

    Returns:
        pandas.DataFrame: String columns ``prefix``, ``input_text`` and
        ``target_text``; ``prefix`` is ``"{corpus} {situation} {sen_type}"``.
        Rows follow the nesting order of the three list arguments.

    Raises:
        ValueError: If an input file and its target file yield a different
            number of rows (the pairs could not be aligned).
    """
    prefix = []
    input_text = []
    target_text = []
    for corpus in corpus_list:
        for situation in situation_list:
            for sen_type in sen_type_list:
                split_dir = f'{data_root}/{corpus}/{situation}/{context_len}'
                target_path = f'{split_dir}/rewrited_{sen_type}_{train_type}'
                input_path = f'{split_dir}/{src_type}_{sen_type}_{train_type}'
                targets = get_data_as_list(target_path)
                # Read once and reuse for both the rows and the prefix count.
                inputs = get_data_as_list(input_path)
                if len(inputs) != len(targets):
                    # Fail loudly instead of building a frame with padded,
                    # misaligned sentence pairs.
                    raise ValueError(
                        f'row count mismatch: {input_path} has {len(inputs)} rows, '
                        f'{target_path} has {len(targets)} rows'
                    )
                target_text += targets
                input_text += inputs
                prefix += [f'{corpus} {situation} {sen_type}'] * len(inputs)
    df = pd.DataFrame(
        {'prefix': prefix, 'input_text': input_text, 'target_text': target_text}
    ).astype(str)
    return df

# Make Dataset

In [4]:

import logging
import sacrebleu
import pandas as pd
from simpletransformers.t5 import T5Model, T5Args


# Root logger at INFO; the "transformers" logger is raised to WARNING to cut noise.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)


# Generation settings used by model.predict() in the following cells.
model_args = T5Args()
model_args.max_length = 128
model_args.length_penalty = 20
model_args.num_beams = 10

# Load the best checkpoint of an earlier run (model_ver_name), not of this run.
# NOTE: this reassigns bestmodel_dir, replacing the value set in the configuration
# cell; anything executed afterwards sees this directory.
model_ver_name = '000_translate_all_both'
bestmodel_dir = f'outputs/context/{model_ver_name}/{context_len}/best_model/'

# cuda_device=1 is passed explicitly; the commented line below is the same call
# without a device argument.
model = T5Model("mt5", bestmodel_dir, args=model_args,cuda_device=1)
# model = T5Model("mt5", bestmodel_dir, args=model_args)

In [5]:
# Training split: load the sentence pairs, then replace every input sentence with
# the output of the currently loaded `model` for it.
train_type = 'train'
train_df = get_df(corpus_list, situation_list, sen_type_list, src_type, context_len, train_type)

# Select the rows of each corpus once, via the corpus name inside the prefix.
mpdd_rows = train_df.loc[train_df["prefix"].str.contains("mpdd")]
cejc_rows = train_df.loc[train_df["prefix"].str.contains("cejc")]

to_ja_truth = mpdd_rows["target_text"].tolist()
to_ja_input = mpdd_rows["input_text"].tolist()
to_ja_prefix = mpdd_rows["prefix"].tolist()

to_zh_truth = cejc_rows["target_text"].tolist()
to_zh_input = cejc_rows["input_text"].tolist()
to_zh_prefix = cejc_rows["prefix"].tolist()

# NOTE(review): the sentences are passed to predict() without any task prefix;
# confirm this matches how the loaded model was trained.
to_ja_preds = model.predict(to_ja_input)
to_zh_preds = model.predict(to_zh_input)

# All three columns are rebuilt in the same order ('mpdd' rows first, then 'cejc'
# rows) so each row stays a consistent (prefix, input, target) triple. This order
# may differ from the one get_df() produced.
input_text = to_ja_preds + to_zh_preds
target_text = to_ja_truth + to_zh_truth
prefix = to_ja_prefix + to_zh_prefix

train_df["input_text"] = input_text
train_df["target_text"] = target_text
train_df["prefix"] = prefix



HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=158.0, style=ProgressStyle(descr…






HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=1262.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=172.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=1374.0, style=ProgressStyle(descri…




In [6]:
# Validation split: load the sentence pairs, then replace every input sentence
# with the output of the currently loaded `model` for it.
train_type = 'val'
eval_df = get_df(corpus_list, situation_list, sen_type_list, src_type, context_len, train_type)

# Select the rows of each corpus once, via the corpus name inside the prefix.
mpdd_rows = eval_df.loc[eval_df["prefix"].str.contains("mpdd")]
cejc_rows = eval_df.loc[eval_df["prefix"].str.contains("cejc")]

to_ja_truth = mpdd_rows["target_text"].tolist()
to_ja_input = mpdd_rows["input_text"].tolist()
to_ja_prefix = mpdd_rows["prefix"].tolist()

to_zh_truth = cejc_rows["target_text"].tolist()
to_zh_input = cejc_rows["input_text"].tolist()
to_zh_prefix = cejc_rows["prefix"].tolist()

# NOTE(review): the sentences are passed to predict() without any task prefix;
# confirm this matches how the loaded model was trained.
to_ja_preds = model.predict(to_ja_input)
to_zh_preds = model.predict(to_zh_input)

# All three columns are rebuilt in the same order ('mpdd' rows first, then 'cejc'
# rows) so each row stays a consistent (prefix, input, target) triple. This order
# may differ from the one get_df() produced.
input_text = to_ja_preds + to_zh_preds
target_text = to_ja_truth + to_zh_truth
prefix = to_ja_prefix + to_zh_prefix

eval_df["input_text"] = input_text
eval_df["target_text"] = target_text
eval_df["prefix"] = prefix

HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=20.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=158.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=22.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=172.0, style=ProgressStyle(descrip…




In [7]:
# Blank the prefix column on both splits, then preview the first five rows of each.
for split_df in (train_df, eval_df):
    split_df["prefix"] = ""

display(train_df.head(5))
display(eval_df.head(5))

Unnamed: 0,prefix,input_text,target_text
0,,すみません、本当に飲めないんです。飲めるなら飲めないんじゃないの?,すみません。本当に飲めないんです。飲めるなら絶対に飲んでますよ。
1,,すみません、本当にごめんなさい。,ごめんなさい。本当にお兄さんに申し訳ない。
2,,すみません、他のことは承知しています。でも、この問題は受け入れられないんです。,ごめん、他のことはともかく、これだけはだめなんだ。
3,,ごめん、兄を放してあげてくれない?,ごめん。兄を放してあげてくれない？
4,,そんなこと言うなよ。申し訳ない。,やめてよ。悪いのは助けられない私だから。


Unnamed: 0,prefix,input_text,target_text
0,,ごめんなさい、わざとじゃないんだ。,ごめんね、そんなつもりじゃなかったんだ。
1,,昨日出かけました。ちょっと遅れてしまいました。,今日出てきたばかりなのに、また戻るなんて、本当に申し訳ないです。
2,,すみません、四萬元をくれたんです。,すみません、四万元持ってきましたん。
3,,ごめんね。でも、私には関係ないけど、どっちでもいいじゃん。,謝るのは私の方だよ。そういう風に思わてるって知ってたんだから。でも、そういうのは二人のことだ...
4,,ごめんなさい、助けてもらえません。,それは力になれない。ごめんね。


# Finetune

In [8]:
# Fine-tune google/mt5-base on train_df, evaluating on eval_df during training.
model_args = T5Args()

# Sequence / generation settings
model_args.max_seq_length = 128
model_args.length_penalty = 20
model_args.num_return_sequences = 1
model_args.preprocess_inputs = False

# Optimisation
model_args.train_batch_size = 2
model_args.eval_batch_size = 2
model_args.num_train_epochs = 20
model_args.learning_rate = 3e-5
model_args.fp16 = False
model_args.use_multiprocessing = False

# Periodic evaluation and early stopping on eval_loss (lower is better)
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 500
model_args.use_early_stopping = True
model_args.early_stopping_metric = 'eval_loss'
model_args.early_stopping_metric_minimize = True
model_args.early_stopping_patience = 3

# Checkpointing. save_eval_checkpoints is set exactly once: a contradictory
# `= True` assignment directly before this one was dead code and is removed,
# keeping the value that was actually in effect (False).
model_args.save_eval_checkpoints = False
model_args.save_model_every_epoch = True
model_args.save_steps = -1
model_args.best_model_dir = save_dir+'best_model/'
model_args.output_dir = save_dir+'ckpt/'
model_args.overwrite_output_dir = True

# Feature caching
model_args.no_cache = True
model_args.reprocess_input_data = True

# Experiment tracking
model_args.wandb_project = ver_name

# cuda_device=1 is passed explicitly; the commented line below is the same call
# without a device argument.
model = T5Model("mt5", "google/mt5-base", args=model_args, cuda_device=1)
# model = T5Model("mt5", "google/mt5-base", args=model_args)
# Train the model
os.environ['WANDB_CONSOLE'] = 'off'
model.train_model(train_df, eval_data=eval_df)

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=2636.0), HTML(value='')))

INFO:simpletransformers.t5.t5_model: Training started



Using Adafactor for T5


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=20.0, style=ProgressStyle(description_width='…

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnatsukinateyamashita[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.33 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 20', max=1318.0, style=ProgressStyle(d…

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)
  exp_avg_sq_row.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-1))
INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 1
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3
INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 20', max=1318.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 20', max=1318.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 20', max=1318.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 20', max=1318.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 20', max=1318.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 1
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3
INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))







INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 20', max=1318.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 1
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3
INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 2
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3
INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: No improvement in eval_loss
INFO:simpletransformers.t5.t5_model: Current step: 3
INFO:simpletransformers.t5.t5_model: Early stopping patience: 3





INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 20', max=1318.0, style=ProgressStyle(d…

INFO:simpletransformers.t5.t5_utils: Creating features from dataset file at cache_dir/


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))






INFO:simpletransformers.t5.t5_model: Patience of 3 steps reached
INFO:simpletransformers.t5.t5_model: Training terminated.






INFO:simpletransformers.t5.t5_model: Training of google/mt5-base model complete. Saved to outputs/context/300_culturizefromT5train_all_both/0/ckpt/.


(9500,
 {'global_step': [500,
   1000,
   1318,
   1500,
   2000,
   2500,
   2636,
   3000,
   3500,
   3954,
   4000,
   4500,
   5000,
   5272,
   5500,
   6000,
   6500,
   6590,
   7000,
   7500,
   7908,
   8000,
   8500,
   9000,
   9226,
   9500],
  'eval_loss': [10.239910767295143,
   5.71810031226187,
   4.85611592206088,
   4.4295251940235945,
   3.966494723883542,
   3.621848180438533,
   3.5723213779203817,
   3.4983110899275,
   3.380092240824844,
   3.292085610510725,
   3.290008735340653,
   3.2256622273362043,
   3.1887129399361034,
   3.192090814221989,
   3.1859081103946223,
   3.16193455642823,
   3.1497862697099195,
   3.1507292314460784,
   3.1548830702449338,
   3.1104407728621455,
   3.124794711985371,
   3.1201769626953384,
   3.117850523780693,
   3.1287690742900875,
   3.1238782767093545,
   3.1130690496979336],
  'train_loss': [10.37309455871582,
   13.879570960998535,
   6.392226696014404,
   7.174092769622803,
   7.25306510925293,
   3.1848948001861572,
  

# Test

In [9]:

import logging
import sacrebleu
import pandas as pd
from simpletransformers.t5 import T5Model, T5Args


# Same logging setup as in the dataset-building section: root logger at INFO,
# "transformers" logger at WARNING.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)


# Generation settings used by model.predict() in the test cells below.
model_args = T5Args()
model_args.max_length = 128
model_args.length_penalty = 20
model_args.num_beams = 10

# Rebinds `model` to the checkpoint under this run's best_model directory
# (the directory configured as best_model_dir in the fine-tuning cell).
model = T5Model("mt5", save_dir+"best_model/", args=model_args, cuda_device=1)
# model = T5Model("mt5", save_dir+"best_model/", args=model_args)

In [10]:

# Load the prediction/reference tables stored under data_dir, everything as str.
ja_eval_df = pd.read_csv(data_dir + "ja_preds_truth.csv").astype(str)
zh_eval_df = pd.read_csv(data_dir + "zh_preds_truth.csv").astype(str)

# Reference sentences.
to_ja_truth = ja_eval_df["truth"].tolist()
to_zh_truth = zh_eval_df["truth"].tolist()

# Model inputs: the text in the column named after data_ver_name, each with ": "
# put in front. Presumably this mirrors the empty-prefix format of the
# fine-tuning data; confirm against the library's input formatting.
to_ja_input = [": " + text for text in ja_eval_df[data_ver_name].tolist()]
to_zh_input = [": " + text for text in zh_eval_df[data_ver_name].tolist()]

to_ja_input[:5]

[': 今日の影響はありませんでした。すみません。',
 ': 今更傷ついたよ。ただ、私を照顧してくれない?',
 ': ごめんね、本は読まないんです。',
 ': ごめん、先払いなさい。心が疲れてるから。',
 ': ごめん、遅れてごめん。']

In [11]:
# Generate outputs for both test input lists with the currently loaded `model`.
to_ja_preds = model.predict(to_ja_input)
to_zh_preds = model.predict(to_zh_input)

HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=20.0, style=ProgressStyle(descri…






HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=160.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=22.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=174.0, style=ProgressStyle(descrip…




In [12]:
# Store predictions next to their references, one CSV per language, under save_dir.
# Each frame is built with two rows (run name, 'truth') and transposed on write,
# so the CSV has one line per sentence.
result_labels = [ver_name, 'truth']

r_ja_df = pd.DataFrame([to_ja_preds, to_ja_truth], index=result_labels)
ja_table = r_ja_df.T
ja_table.to_csv(save_dir + 'ja_preds_truth.csv', encoding='utf_8_sig')

r_zh_df = pd.DataFrame([to_zh_preds, to_zh_truth], index=result_labels)
zh_table = r_zh_df.T
zh_table.to_csv(save_dir + 'zh_preds_truth.csv', encoding='utf_8_sig')