In [None]:
# install the package
!pip install OpenNMT-py

In [None]:
import codecs
import pickle as cPickle
import random
from collections import defaultdict
import pandas as pd
import re

In [None]:
def createDatasets(corpus_file,sequence="grapheme"):
  """
  Generate the tagged train, valid and test files for experiment 2.

  The validation and test sources of experiment 1 are copied line by line
  with '<V;PST> ' prepended, so the split matches experiment 1 by lemma.
  The training pairs are taken from the merged corpus: every non-blank line
  whose tag is not 'V;NFIN' and whose space-separated lemma occurs in the
  experiment 1 training sources is written as '<tag> l e m m a' (source)
  and 'f o r m' (target). The pairs are shuffled with a fixed seed before
  they are written.

  Args:
    corpus_file: currently unused; the corpus is always read from
      '/experiment_2/english_merged.txt'.
      NOTE(review): confirm whether the corpus path should come from here.
    sequence: "grapheme" takes lemma/form from whitespace-separated fields
      3 and 4 of each corpus line; any other value takes them from fields
      0 and 1. The tag is field 2 in both cases.
      NOTE(review): the corpus column layout is not visible from this
      function - verify these indices against the actual file.

  Raises:
    IndexError: if a non-blank corpus line has fewer fields than required.

  Side effects:
    Reseeds the global `random` generator with 222 and (over)writes the four
    '/experiment_2/*_tagged.txt' files.
  """

  #prepend the past-tense tag to every line of the current valid and test data
  tagged_copies = (
    (r'/experiment_1/src_valid.txt', r'/experiment_2/src_valid_tagged.txt'),
    (r'/experiment_1/src_test.txt', r'/experiment_2/src_test_tagged.txt'),
  )
  for src_path, dst_path in tagged_copies:
    with codecs.open(src_path,'rb','utf-8') as fin, \
         codecs.open(dst_path,'wb','utf-8') as fout:
      for line in fin:
        fout.write('<V;PST> ' + line)

  #read in the set of valid lemmas from the current train data
  with codecs.open(r'/experiment_1/src_train.txt','rb','utf-8') as fin:
    ok_lemmas = {line.strip() for line in fin}

  #pick the corpus columns that hold lemma and inflected form
  if sequence == "grapheme":
    lemma_col, form_col = 3, 4
  else:
    lemma_col, form_col = 0, 1
  tag_col = 2

  #read in data
  pairs = []
  with codecs.open(r'/experiment_2/english_merged.txt','rb','utf-8') as fin:
    for line in fin:
      parts = line.strip().split()
      if not parts:
        #blank line (e.g. trailing newline at end of file)
        continue
      lemma = ' '.join(parts[lemma_col])
      form = ' '.join(parts[form_col])
      tag = parts[tag_col]
      #compare the raw tag: the decorated '<tag> ' string never equals
      #'V;NFIN', which made the original filter a no-op
      if tag != 'V;NFIN' and lemma in ok_lemmas:
        pairs.append(('<' + tag + '> ' + lemma, form))

  #fixed seed so the shuffled order is reproducible
  random.seed(222)
  random.shuffle(pairs)

  #every remaining pair goes to train; valid/test come from experiment 1
  with codecs.open(r'/experiment_2/src_train_tagged.txt','wb','utf-8') as fout_src, \
       codecs.open(r'/experiment_2/tgt_train_tagged.txt','wb','utf-8') as fout_tgt:
    for s,t in pairs:
      fout_src.write(s + '\n')
      fout_tgt.write(t + '\n')

In [None]:
def DataFrame_merged(merged_file):
  """
  Load the tab-separated merged file into a DataFrame and print a summary.

  The five columns are named pre_tense, past_tense, IPA_pre, IPA_past and
  label. The first five rows and the number of rows labelled "reg" and
  "irreg" are printed; the full frame is returned.
  """
  column_names = ["pre_tense","past_tense","IPA_pre","IPA_past","label"]
  frame = pd.read_csv(merged_file,sep='\t',names=column_names)
  print(frame.head(5))

  is_regular = frame["label"] == "reg"
  is_irregular = frame["label"] == "irreg"
  print("number of regular verbs:", int(is_regular.sum()))
  print("number of irregular verbs:", int(is_irregular.sum()))
  return frame
  
def DataFrame_file(filename,src=True):
  '''
  Read a space-separated sequence file (UTF-8) and convert it to a dataframe.

  Args:
    filename: path of the text file, one sequence per line.
    src: if True, every line must contain a "<...>" tag. The tag without its
      angle brackets goes to column "task"; the rest of the line with all
      spaces removed goes to column "IPA_pre". If False, each line with its
      spaces removed becomes one row of a single unnamed column.

  Returns:
    The resulting DataFrame. An empty file with src=True yields an empty
    frame that still has the "task" and "IPA_pre" columns.

  Raises:
    IndexError: if src is True and a line contains no "<...>" tag.
  '''
  # the files are written as UTF-8, so do not rely on the platform default
  with open(filename,'r',encoding='utf-8') as f:
    lines = [line.strip('\n') for line in f]

  if src:
    task, pre = [], []
    for line in lines:
      tag = re.findall("<.+>",line)[0]
      task.append(tag.replace("<","").replace(">",""))
      pre.append(re.sub("<.+>","",line).replace(" ",""))
    # build the frame once, after the loop, instead of once per line
    return pd.DataFrame({"task":task, "IPA_pre":pre})

  return pd.DataFrame([line.replace(" ","") for line in lines])

In [None]:
def results_set(file_pre,file_src,file_tgt,sequence=None,data=None):
  '''
  Calculate the accuracy of the whole set and of its regular and irregular
  verbs, print them as a small table and return them.

  Args:
    file_pre: file with the model predictions, one per line.
    file_src: tagged source file ("<tag> s e q u e n c e" per line).
    file_tgt: file with the gold targets, one per line.
    sequence: "grapheme" compares against and merges on the
      pre_tense/past_tense columns; any other value uses IPA_pre/IPA_past.
      When omitted, a module-level `sequence` is used if one exists,
      otherwise "grapheme".
    data: DataFrame with at least the lemma/target columns selected by
      `sequence` plus a "label" column holding "reg"/"irreg". When omitted,
      the module-level `data` is used.

  Returns:
    (set_accuracy, regular_accuracy, irregular_accuracy) as floats. A group
    without any rows yields float("nan") instead of raising.

  Raises:
    NameError: if `data` is neither passed nor defined at module level.
  '''
  # fall back to the notebook-level names the original implementation read
  if sequence is None:
    sequence = globals().get("sequence", "grapheme")
  if data is None:
    if "data" not in globals():
      raise NameError("name 'data' is not defined; pass data= or run DataFrame_merged first")
    data = globals()["data"]

  df_pre = DataFrame_file(file_pre,src=False)
  df_src = DataFrame_file(file_src,src=True)
  df_tgt = DataFrame_file(file_tgt,src=False)
  df_set = pd.concat([df_pre,df_src,df_tgt],axis=1)

  if sequence == "grapheme":
    lemma_col, target_col = "pre_tense", "past_tense"
  else:
    lemma_col, target_col = "IPA_pre", "IPA_past"
  df_set.columns = ["prediction","task",lemma_col,target_col]

  def _accuracy(n_wrong, n_total):
    # an empty group has no defined accuracy
    return 1 - n_wrong/n_total if n_total else float("nan")

  # Overall accuracy is taken from the evaluated set itself, so rows that the
  # merge below drops (or duplicates) cannot distort it.
  wrong_total = len(df_set.loc[df_set["prediction"]!=df_set[target_col]])
  acc_set = _accuracy(wrong_total, len(df_set))

  # The inner merge on the shared lemma/target columns attaches the
  # reg/irreg label to every evaluated row that is present in `data`.
  df_set_merged = pd.merge(data, df_set)
  df_set_wrong = df_set_merged.loc[df_set_merged["prediction"]!=df_set_merged[target_col]]

  total_reg = len(df_set_merged.loc[df_set_merged["label"]=="reg"])
  total_ir = len(df_set_merged.loc[df_set_merged["label"]=="irreg"])
  wrong_reg = len(df_set_wrong.loc[df_set_wrong["label"]=="reg"])
  wrong_ir = len(df_set_wrong.loc[df_set_wrong["label"]=="irreg"])

  acc_reg = _accuracy(wrong_reg, total_reg)
  acc_ir = _accuracy(wrong_ir, total_ir)

  print("Set\tRegular\tIrregular\n"+"-"*40)
  print(f"{round(acc_set,3)}\t"
    f"{round(acc_reg,3)}\t"
    f"{round(acc_ir,3)}"
    )
  return acc_set, acc_reg, acc_ir

In [None]:
# Build the vocabulary and then train the model; both commands take their
# settings from /experiment_2/config.yaml.
# NOTE(review): -n_sample 75200 presumably caps the number of lines sampled
# for the vocabulary - confirm against the OpenNMT-py documentation.
!onmt_build_vocab -config /experiment_2/config.yaml -n_sample 75200
!onmt_train -config /experiment_2/config.yaml

In [None]:
# Translate the tagged train, valid and test sources with the checkpoint
# saved at step 75200 and write the predictions to experiment_2/pre_*.txt.
# NOTE(review): -model and -src use absolute paths (/experiment_2/...) while
# -output is relative (experiment_2/...); these only refer to the same
# directory when the working directory is "/". Confirm the intended location.
!onmt_translate -model /experiment_2/run/model_step_75200.pt -src /experiment_2/src_train_tagged.txt -output experiment_2/pre_train.txt
!onmt_translate -model /experiment_2/run/model_step_75200.pt -src /experiment_2/src_valid_tagged.txt -output experiment_2/pre_valid.txt
!onmt_translate -model /experiment_2/run/model_step_75200.pt -src /experiment_2/src_test_tagged.txt -output experiment_2/pre_test.txt

In [None]:
# results_set looks up the module-level names `sequence` and `data`, so both
# have to exist before it is called.
# NOTE(review): "grapheme" mirrors the default of createDatasets; use another
# value if the model was trained on IPA sequences.
sequence = "grapheme"
merged_file = "experiment_2/english_merged.txt"
data = DataFrame_merged(merged_file)

tgt_train = "experiment_2/tgt_train_tagged.txt"
pre_train = "experiment_2/pre_train.txt"
src_train = "experiment_2/src_train_tagged.txt"

tgt_valid = "experiment_1/tgt_valid.txt"
pre_valid = "experiment_2/pre_valid.txt"
src_valid = "experiment_2/src_valid_tagged.txt"

tgt_test = "experiment_1/tgt_test.txt"
pre_test = "experiment_2/pre_test.txt"
src_test = "experiment_2/src_test_tagged.txt"

# The notebook defines results_set (there is no `evaluation` function), and it
# takes its files in the order (predictions, sources, targets).
for split_name, file_pre, file_src, file_tgt in (
    ("train", pre_train, src_train, tgt_train),
    ("valid", pre_valid, src_valid, tgt_valid),
    ("test", pre_test, src_test, tgt_test),
):
  print(split_name + " set:")
  results_set(file_pre, file_src, file_tgt)