In [None]:
# install the package
!pip install OpenNMT-py

In [None]:
import random
import pandas as pd

In [None]:
def DataFrame_merged(merged_file):
  """
  Load the tab-separated merged verb file into a dataframe and print a short summary.

  The columns are named pre_tense, past_tense, IPA_pre, IPA_past and label.
  The first five rows and the number of rows labelled "reg" and "irreg" are
  printed before the dataframe is returned.
  """
  column_names = ["pre_tense","past_tense","IPA_pre","IPA_past","label"]
  frame = pd.read_csv(merged_file, sep='\t', names=column_names)
  print(frame.head(5))

  # count rows per label with boolean masks instead of filtered sub-frames
  labels = frame["label"]
  regular_count = int((labels == "reg").sum())
  irregular_count = int((labels == "irreg").sum())

  print("number of regular verbs:", regular_count)
  print("number of irregular verbs:", irregular_count)
  return frame
  
def DataFrame_file(filename):
  """
  Read a text file line by line into a single-column dataframe.

  The trailing newline is stripped from every line and all spaces are removed,
  so a line such as "w a l k" becomes "walk".

  filename: path of the text file to read (decoded as UTF-8).
  returns: a dataframe with one row per line of the file.
  """
  # Pin the encoding: without it open() falls back to the platform locale,
  # which can mis-decode non-ASCII letters and make these strings disagree
  # with the table read by pd.read_csv (UTF-8 by default) in DataFrame_merged.
  with open(filename,'r',encoding='utf-8') as f:
    list_file = [line.strip('\n').replace(" ","") for line in f]
  return pd.DataFrame(list_file)

In [None]:
def evaluation(file_src,file_pre,file_tgt,merged_data=None):
  '''
  Print and return the accuracy on the whole set, on its regular verbs and on
  its irregular verbs.

  file_src: path of the source (present tense) file.
  file_pre: path of the prediction file.
  file_tgt: path of the target (past tense) file.
  merged_data: dataframe with at least the columns pre_tense, past_tense and
    label; when omitted, the module-level dataframe `data` is used.

  returns: a tuple (set_accuracy, regular_accuracy, irregular_accuracy), each
    rounded to 4 decimals. A group without any rows is reported as NaN.
  '''
  if merged_data is None:
    # fall back to the table loaded with DataFrame_merged at module level
    merged_data = data

  def _accuracy(wrong, total):
    # an empty group has no defined accuracy; report NaN instead of raising
    # ZeroDivisionError
    return round(1-wrong/total,4) if total else float("nan")

  df_pre = DataFrame_file(file_pre)
  df_src = DataFrame_file(file_src)
  df_tgt = DataFrame_file(file_tgt)
  df_set = pd.concat([df_pre,df_src,df_tgt],axis=1)
  df_set.columns = ["prediction","pre_tense","past_tense"]

  # Whole-set accuracy is computed on df_set itself, so the wrong count and the
  # total cover exactly the same rows even when the merge below drops or
  # duplicates some of them.
  wrong_set = int((df_set["prediction"]!=df_set["past_tense"]).sum())

  # attach the reg/irreg label to every (pre_tense, past_tense) pair
  df_set_merged = pd.merge(merged_data, df_set, on=["pre_tense","past_tense"])
  df_set_wrong = df_set_merged.loc[df_set_merged["prediction"]!=df_set_merged["past_tense"]]

  total_reg = len(df_set_merged.loc[df_set_merged["label"]=="reg"])
  total_ir = len(df_set_merged.loc[df_set_merged["label"]=="irreg"])
  wrong_reg = len(df_set_wrong.loc[df_set_wrong["label"]=="reg"])
  wrong_ir = len(df_set_wrong.loc[df_set_wrong["label"]=="irreg"])

  acc_set = _accuracy(wrong_set, len(df_set))
  acc_reg = _accuracy(wrong_reg, total_reg)
  acc_ir = _accuracy(wrong_ir, total_ir)

  print("Set\tRegular\tIrregular\n"+"-"*40)
  print(f"{acc_set}\t{acc_reg}\t{acc_ir}")
  return acc_set, acc_reg, acc_ir

### English

In [None]:
# Build the vocabulary for the English grapheme model from its YAML config.
# NOTE(review): -n_sample 24400 presumably matches the size of the English
# training data — confirm against the corpus.
!onmt_build_vocab -config grapheme/config_english_g.yaml -n_sample 24400

In [None]:
# Train the English grapheme model with the settings in the YAML config.
!onmt_train -config grapheme/config_english_g.yaml

In [None]:
# Run the model_step_12200.pt checkpoint over the English train, valid and
# test sources; the predictions are written to the matching pre_*.txt files.
!onmt_translate -model grapheme/english/run/model_step_12200.pt -src grapheme/english/src_train.txt -output grapheme/english/pre_train.txt
!onmt_translate -model grapheme/english/run/model_step_12200.pt -src grapheme/english/src_valid.txt -output grapheme/english/pre_valid.txt
!onmt_translate -model grapheme/english/run/model_step_12200.pt -src grapheme/english/src_test.txt -output grapheme/english/pre_test.txt

### Dutch

In [None]:
# Build the vocabulary for the Dutch grapheme model from its YAML config.
# NOTE(review): -n_sample 24480 presumably matches the size of the Dutch
# training data — confirm against the corpus.
!onmt_build_vocab -config grapheme/config_dutch_g.yaml -n_sample 24480

In [None]:
# Train the Dutch grapheme model with the settings in the YAML config.
!onmt_train -config grapheme/config_dutch_g.yaml

In [None]:
# Run the model_step_12240.pt checkpoint over the Dutch train, valid and
# test sources; the predictions are written to the matching pre_*.txt files.
!onmt_translate -model grapheme/dutch/run/model_step_12240.pt -src grapheme/dutch/src_train.txt -output grapheme/dutch/pre_train.txt
!onmt_translate -model grapheme/dutch/run/model_step_12240.pt -src grapheme/dutch/src_valid.txt -output grapheme/dutch/pre_valid.txt
!onmt_translate -model grapheme/dutch/run/model_step_12240.pt -src grapheme/dutch/src_test.txt -output grapheme/dutch/pre_test.txt

### German

In [None]:
# Build the vocabulary for the German grapheme model from its YAML config.
# NOTE(review): -n_sample 24400 presumably matches the size of the German
# training data — confirm against the corpus.
!onmt_build_vocab -config grapheme/config_german_g.yaml -n_sample 24400

In [None]:
# Train the German grapheme model with the settings in the YAML config.
!onmt_train -config grapheme/config_german_g.yaml

In [None]:
# Run the model_step_12200.pt checkpoint over the German train, valid and
# test sources; the predictions are written to the matching pre_*.txt files.
!onmt_translate -model grapheme/german/run/model_step_12200.pt -src grapheme/german/src_train.txt -output grapheme/german/pre_train.txt
!onmt_translate -model grapheme/german/run/model_step_12200.pt -src grapheme/german/src_valid.txt -output grapheme/german/pre_valid.txt
!onmt_translate -model grapheme/german/run/model_step_12200.pt -src grapheme/german/src_test.txt -output grapheme/german/pre_test.txt

## Evaluation

In [None]:
# Labelled verb table; evaluation() looks it up under the module-level name `data`.
# NOTE(review): the "_" in the paths below looks like a placeholder for the
# language (english / dutch / german) — confirm and substitute before running.
merged_file = "grapheme/_merged.txt"
data = DataFrame_merged(merged_file)

src_train = "grapheme/_/src_train.txt"
pre_train = "grapheme/_/pre_train.txt"
tgt_train = "grapheme/_/tgt_train.txt"

src_valid = "grapheme/_/src_valid.txt"
pre_valid = "grapheme/_/pre_valid.txt"
tgt_valid = "grapheme/_/tgt_valid.txt"

src_test = "grapheme/_/src_test.txt"
pre_test = "grapheme/_/pre_test.txt"
tgt_test = "grapheme/_/tgt_test.txt"

# evaluation() takes the source, prediction and target paths, in that order;
# passing only the source path raises TypeError.
splits = (
  ("train set:", (src_train, pre_train, tgt_train)),
  ("dev set:", (src_valid, pre_valid, tgt_valid)),
  ("test set:", (src_test, pre_test, tgt_test)),
)
for title, (file_src, file_pre, file_tgt) in splits:
  print(title)
  evaluation(file_src, file_pre, file_tgt)