In [None]:
# install the package
!pip install OpenNMT-py

In [None]:
import codecs
import os
import pickle as cPickle
import random
from collections import defaultdict

import pandas as pd

In [None]:
def createDatasets(corpus_file,sequence="grapheme",output_dir="/data",seed=None):
  """
  Generate train, valid and test sets (8-1-1 split) from a corpus file.

  Every non-blank line of ``corpus_file`` is split on whitespace. The source
  and target strings are taken from columns 0 and 1 (``sequence="grapheme"``)
  or columns 2 and 3 (``sequence="phoneme"``). Each string is rewritten with
  a single space between its characters (``"walk"`` -> ``"w a l k"``).
  The pairs are shuffled and written, one per line, to
  ``src_{train,valid,test}.txt`` and ``tgt_{train,valid,test}.txt`` inside
  ``output_dir``; line i of a ``src_*`` file pairs with line i of the matching
  ``tgt_*`` file.

  Args:
    corpus_file: path of the UTF-8 corpus file.
    sequence: "grapheme" or "phoneme"; selects which columns are used.
    output_dir: directory that receives the six output files. It is created
      when missing. Defaults to "/data", the location used originally.
    seed: optional seed for a reproducible shuffle. ``None`` (default) shuffles
      with the global ``random`` state.

  Raises:
    ValueError: if ``sequence`` is neither "grapheme" nor "phoneme". This is
      checked before any file is read or written.
    IndexError: if a non-blank line has fewer columns than ``sequence`` needs.
  """
  # (source column, target column) for each supported sequence type
  columns = {"grapheme": (0, 1), "phoneme": (2, 3)}
  if sequence not in columns:
    raise ValueError(
      "Input the correct sequence type. "
      f"Got {sequence!r}; expected 'grapheme' or 'phoneme'."
    )
  src_col, tgt_col = columns[sequence]

  #read in data
  pairs = []
  with open(corpus_file,'r',encoding='utf-8') as fin:
    for line in fin:
      parts = line.strip().split()
      if not parts:
        # blank line (e.g. a trailing newline at the end of the corpus)
        continue
      lemma = parts[src_col]
      form = parts[tgt_col]
      # ' '.join on a string puts a space between its characters
      pairs.append((' '.join(lemma), ' '.join(form)))

  # A private generator keeps a seeded shuffle from touching the global state.
  rng = random if seed is None else random.Random(seed)
  rng.shuffle(pairs)

  #split into train, valid and test (8-1-1)
  train_end = int(.8*len(pairs))
  valid_end = int(.9*len(pairs))
  splits = {
    'train': pairs[:train_end],
    'valid': pairs[train_end:valid_end],
    'test': pairs[valid_end:],
  }

  #write the outputs
  os.makedirs(output_dir, exist_ok=True)
  for split_name, split_pairs in splits.items():
    src_path = os.path.join(output_dir, f'src_{split_name}.txt')
    tgt_path = os.path.join(output_dir, f'tgt_{split_name}.txt')
    # newline='\n' writes '\n' untranslated on every platform; the context
    # manager closes both files even if a write fails.
    with open(src_path,'w',encoding='utf-8',newline='\n') as fout_src, \
         open(tgt_path,'w',encoding='utf-8',newline='\n') as fout_tgt:
      for s,t in split_pairs:
        fout_src.write(s + '\n')
        fout_tgt.write(t + '\n')

In [None]:
def DataFrame_merged(merged_file):
  """
  Load the tab-separated merged file into a DataFrame and print a short summary.

  The file is read without a header row; its five columns are named
  pre_tense, past_tense, IPA_pre, IPA_past and label. The first five rows are
  printed, followed by the number of rows labelled "reg" and "irreg".
  Returns the DataFrame.
  """
  column_names = ["pre_tense","past_tense","IPA_pre","IPA_past","label"]
  verbs = pd.read_csv(merged_file, sep='\t', names=column_names)
  print(verbs.head(5))

  # count rows per label with a boolean mask instead of filtering the frame
  n_regular = int((verbs["label"] == "reg").sum())
  print("number of regular verbs:", n_regular)
  n_irregular = int((verbs["label"] == "irreg").sum())
  print("number of irregular verbs:", n_irregular)
  return verbs
  
def DataFrame_file(filename):
  """
  Read a text file into a one-column DataFrame, one row per line.

  Each line has its newline removed and every space character deleted, so a
  space-separated line such as "w a l k" becomes the single string "walk".

  The file is decoded as UTF-8 explicitly. The src/tgt files in this notebook
  are written as UTF-8 by createDatasets, and relying on the platform default
  encoding would mis-decode non-ASCII characters on systems whose default is
  not UTF-8.
  NOTE(review): prediction files are assumed to be UTF-8 as well - confirm.

  Args:
    filename: path of the text file to read.

  Returns:
    pandas.DataFrame with a single column (named 0) holding one string per line.
  """
  with open(filename,'r',encoding='utf-8') as f:
      list_file = [line.strip('\n').replace(" ","") for line in f]
  return pd.DataFrame(list_file)

In [None]:
def evaluation(file_tgt,file_pre,file_src,sequence="grapheme",merged=None):
  '''
  Calculate the accuracy of the whole set and of its regular and irregular
  verbs, print them as a small table, and return them.

  The three files are read with DataFrame_file (one item per line, spaces
  removed) and aligned line by line. A prediction is correct when it equals
  the target on the same line. Regular/irregular labels come from the merged
  verb table, matched on the (source, target) pair.

  Args:
    file_tgt: path of the target (gold) file.
    file_pre: path of the prediction file.
    file_src: path of the source file.
    sequence: "grapheme" matches on the pre_tense/past_tense columns; any
      other value matches on IPA_pre/IPA_past.
    merged: DataFrame with the columns named above plus "label" ("reg" or
      "irreg"). When omitted, the notebook-level variable `data` is used.

  Returns:
    (set_accuracy, regular_accuracy, irregular_accuracy), each rounded to
    4 decimals. An accuracy is NaN when its group contains no rows.

  Raises:
    NameError: if `merged` is not given and no global `data` exists.
  '''
  if merged is None:
    try:
      merged = data
    except NameError:
      raise NameError(
        "evaluation() needs the merged verb table: pass merged=... or define "
        "the global `data` first (e.g. data = DataFrame_merged(merged_file))."
      ) from None

  df_pre = DataFrame_file(file_pre)
  df_tgt = DataFrame_file(file_tgt)
  df_src = DataFrame_file(file_src)
  df_set = pd.concat([df_pre,df_tgt,df_src],axis=1)

  if sequence == "grapheme":
    src_col, tgt_col = "pre_tense", "past_tense"
  else:
    src_col, tgt_col = "IPA_pre", "IPA_past"
  df_set.columns = ["prediction",tgt_col,src_col]

  def accuracy(wrong,total):
    # an empty group has no defined accuracy; avoid ZeroDivisionError
    return round(1-wrong/total,4) if total else float("nan")

  # Overall accuracy is computed on the evaluated rows themselves, so rows
  # dropped or duplicated by the label lookup below cannot distort it.
  set_wrong = df_set["prediction"]!=df_set[tgt_col]
  set_acc = accuracy(int(set_wrong.sum()),len(df_set))

  # Keep one label row per (source, target) pair so that repeated entries in
  # the lookup table do not multiply rows in the join.
  labels = merged[[src_col,tgt_col,"label"]].drop_duplicates()
  df_set_merged = pd.merge(labels,df_set,on=[src_col,tgt_col])
  merged_wrong = df_set_merged["prediction"]!=df_set_merged[tgt_col]

  is_reg = df_set_merged["label"]=="reg"
  is_ir = df_set_merged["label"]=="irreg"
  reg_acc = accuracy(int((is_reg & merged_wrong).sum()),int(is_reg.sum()))
  ir_acc = accuracy(int((is_ir & merged_wrong).sum()),int(is_ir.sum()))

  print("Set\tRegular\tIrregular\n"+"-"*40)
  print(f"{set_acc}\t"
    f"{reg_acc}\t"
    f"{ir_acc}"
    )
  return set_acc, reg_acc, ir_acc

In [None]:
# Run the OpenNMT-py command-line tools: build the vocabulary, then train,
# both driven by the same config file.
# NOTE(review): the config path is absolute (/experiment_1/...), while
# createDatasets writes to /data/ and the evaluation cell reads the relative
# experiment_1/ directory - confirm that these locations agree.
!onmt_build_vocab -config /experiment_1/config.yaml -n_sample 16200
!onmt_train -config /experiment_1/config.yaml

In [None]:
# Translate the train, valid and test source files with the checkpoint
# model_step_16200.pt, writing one prediction file per split.
# NOTE(review): -model and -src are absolute paths (/experiment_1/...), but
# -output is relative (experiment_1/...), so the prediction files land under
# the current working directory - confirm this is intended.
!onmt_translate -model /experiment_1/run/model_step_16200.pt -src /experiment_1/src_train.txt -output experiment_1/pre_train.txt
!onmt_translate -model /experiment_1/run/model_step_16200.pt -src /experiment_1/src_valid.txt -output experiment_1/pre_valid.txt
!onmt_translate -model /experiment_1/run/model_step_16200.pt -src /experiment_1/src_test.txt -output experiment_1/pre_test.txt

In [None]:
merged_file = "experiment_1/english_merged.txt"
# evaluation() reads the regular/irregular labels from the notebook-level
# `data` frame, so the merged file has to be loaded before the first call.
# Without this line a fresh kernel fails with NameError.
data = DataFrame_merged(merged_file)

# Accuracy on the training split.
tgt_train = "experiment_1/tgt_train.txt"
pre_train = "experiment_1/pre_train.txt"
src_train = "experiment_1/src_train.txt"
print("train set:")
evaluation(tgt_train,pre_train,src_train)

# Accuracy on the validation split.
tgt_valid = "experiment_1/tgt_valid.txt"
pre_valid = "experiment_1/pre_valid.txt"
src_valid = "experiment_1/src_valid.txt"
print("valid set:")
evaluation(tgt_valid,pre_valid,src_valid)

# Accuracy on the test split.
tgt_test = "experiment_1/tgt_test.txt"
pre_test = "experiment_1/pre_test.txt"
src_test = "experiment_1/src_test.txt"
print("test set:")
evaluation(tgt_test,pre_test,src_test)