## createDatasets

In [None]:
# -*- coding: utf-8 -*-
"""Match train/test split from grapheme 1, by lemma."""
import codecs
import pickle as cPickle
import random
from collections import defaultdict
import pandas as pd

# Build the tagged grapheme_2 datasets from the grapheme_1 split.
#
# Outputs (all UTF-8, written under /grapheme_2/):
#   src_valid_tagged.txt / src_test_tagged.txt : the grapheme_1 valid/test
#       sources with '<V;PST> ' prepended to every line.
#   src_train_tagged.txt / tgt_train_tagged.txt : one '<tag> l e m m a' source
#       line and one 'f o r m' target line per row of english_merged.txt whose
#       lemma occurs in the grapheme_1 training sources.

# Tag prepended to every validation/test source line.
PST_TAG = '<V;PST> '
# Rows of english_merged.txt carrying this tag are excluded from training.
NFIN_TAG = 'V;NFIN'
# Fixed seed so the shuffled training order is the same on every run.
SHUFFLE_SEED = 0


def _copy_with_prefix(src_path, dst_path, prefix):
    """Copy src_path to dst_path line by line, prepending prefix to each line."""
    with codecs.open(src_path, 'rb', 'utf-8') as fin:
        with codecs.open(dst_path, 'wb', 'utf-8') as fout:
            for line in fin:
                fout.write(prefix + line)


#modify every line in the current valid data
_copy_with_prefix(r'/grapheme_1/src_valid.txt',
                  r'/grapheme_2/src_valid_tagged.txt', PST_TAG)
#modify every line in the current test data
_copy_with_prefix(r'/grapheme_1/src_test.txt',
                  r'/grapheme_2/src_test_tagged.txt', PST_TAG)

#read in a set of valid lemmas from the current train data
with codecs.open(r'/grapheme_1/src_train.txt', 'rb', 'utf-8') as fin:
    ok_lemmas = {line.strip() for line in fin}

#read in data
sources = []
targets = []

with codecs.open(r'/grapheme_2/english_merged.txt', 'rb', 'utf-8') as fin:
    for line in fin:
        parts = line.strip().split()
        if not parts:
            # Skip blank lines (e.g. a trailing newline at end of file).
            continue
        lemma = parts[0]
        form = parts[1]
        vec = '<' + parts[2] + '> '
        # ok_lemmas holds stripped lines of src_train.txt; the lemma is
        # space-joined character by character to compare against them.
        spaced_lemma = ' '.join(lemma)
        # NOTE: the original compared the bracketed `vec` ('<V;NFIN> ') with
        # 'V;NFIN', which is always unequal, so NFIN rows were never filtered.
        # Compare the raw tag instead. This removes NFIN rows from the
        # training files, so results produced with the old files will differ.
        if parts[2] != NFIN_TAG and spaced_lemma in ok_lemmas:
            sources.append(vec + spaced_lemma)
            targets.append(' '.join(form))

pairs = list(zip(sources, targets))
# A dedicated, seeded generator keeps the order reproducible without touching
# the global `random` state.
random.Random(SHUFFLE_SEED).shuffle(pairs)

#split into train and test
# (valid/test come from grapheme_1, so every shuffled pair goes to train)
train = pairs

#write the outputs
with codecs.open(r'/grapheme_2/src_train_tagged.txt', 'wb', 'utf-8') as fout_src_train:
    with codecs.open(r'/grapheme_2/tgt_train_tagged.txt', 'wb', 'utf-8') as fout_tgt_train:
        for s, t in train:
            fout_src_train.write(s + '\n')
            fout_tgt_train.write(t + '\n')

## Training

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# TODO: pin the OpenNMT-py version used for the recorded runs.
%pip install OpenNMT-py

In [None]:
# Run onmt_build_vocab with the settings in /grapheme_2/config.yaml.
# NOTE(review): -n_sample 74730 presumably equals the number of training
# examples (it matches the checkpoint step used below) — confirm.
!onmt_build_vocab -config /grapheme_2/config.yaml -n_sample 74730

In [None]:
# Run onmt_train with the same config file that was passed to onmt_build_vocab.
!onmt_train -config /grapheme_2/config.yaml

In [None]:
#train
# Translate the tagged training sources with the checkpoint file
# model_step_74730.pt and write the output to /grapheme_2/pre_train.txt.
!onmt_translate -model /grapheme_2/run/model_step_74730.pt -src /grapheme_2/src_train_tagged.txt -output /grapheme_2/pre_train.txt 

[2022-03-22 15:08:28,670 INFO] Translating shard 0.
  self._batch_index = self.topk_ids // vocab_size
[2022-03-22 15:08:54,885 INFO] PRED AVG SCORE: -0.0000, PRED PPL: 1.0000
[2022-03-22 15:08:54,898 INFO] Translating shard 1.
[2022-03-22 15:09:06,549 INFO] PRED AVG SCORE: -0.0000, PRED PPL: 1.0000


In [None]:
#dev
# Translate the tagged validation sources with the checkpoint file
# model_step_74730.pt and write the output to /grapheme_2/pre_valid.txt.
!onmt_translate -model /grapheme_2/run/model_step_74730.pt -src /grapheme_2/src_valid_tagged.txt -output /grapheme_2/pre_valid.txt 

[2022-03-22 15:09:44,013 INFO] Translating shard 0.
  self._batch_index = self.topk_ids // vocab_size
[2022-03-22 15:09:44,950 INFO] PRED AVG SCORE: -0.0012, PRED PPL: 1.0012


In [None]:
#test
# Translate the tagged test sources with the checkpoint file
# model_step_74730.pt and write the output to /grapheme_2/pre_test.txt.
!onmt_translate -model /grapheme_2/run/model_step_74730.pt -src /grapheme_2/src_test_tagged.txt -output /grapheme_2/pre_test.txt 

[2022-03-22 15:09:49,685 INFO] Translating shard 0.
  self._batch_index = self.topk_ids // vocab_size
[2022-03-22 15:09:50,639 INFO] PRED AVG SCORE: -0.0002, PRED PPL: 1.0002


## Analysis

In [None]:
import pandas as pd
# Load the tab-separated verb table (the file has no header row) and name its
# six columns.
data = pd.read_csv("/grapheme_2/english_merged.txt", header=None, sep='\t')
data.columns = "lemma form task IPA_l IPA_f label".split()
print(data.head(5))

# Count the rows carrying each regularity label.
is_regular = data["label"] == "reg"
is_irregular = data["label"] == "irreg"
print("number of regular verbs:", int(is_regular.sum()))
print("number of irregular verbs:", int(is_irregular.sum()))

  lemma     form          task IPA_l   IPA_f label
0  fawn   fawned         V;PST  fO:n   fO:nd   reg
1  fawn   fawned  V;V.PTCP;PST  fO:n   fO:nd   reg
2  fawn  fawning  V;V.PTCP;PRS  fO:n  fO:nIN   reg
3  fawn    fawns    V;3;SG;PRS  fO:n   fO:nz   reg
4  fawn     fawn        V;NFIN  fO:n    fO:n   reg
number of regular verbs: 18140
number of irregular verbs: 530


In [None]:
def DataFrame_file(filename):
  '''
  Read a text file into a single-column DataFrame, one row per line.

  Each line has its newline removed and every space character deleted, so a
  space-separated character sequence such as "f a w n" becomes "fawn".

  Parameters
  ----------
  filename : str
      Path of the text file to read. The file is decoded as UTF-8, the
      encoding used when the dataset files are written, rather than the
      platform's default encoding.

  Returns
  -------
  pandas.DataFrame
      A frame with one column (labelled 0) holding the processed lines in
      file order.
  '''
  with open(filename, 'r', encoding='utf-8') as f:
      list_file = [line.strip('\n').replace(" ","") for line in f]
  return pd.DataFrame(list_file)

def _accuracy(n_wrong, n_total):
  '''Return 1 - n_wrong/n_total rounded to 4 places, or NaN when n_total is 0.'''
  if n_total == 0:
    return float("nan")
  return round(1 - n_wrong / n_total, 4)

def results_set(file_tgt,file_pre):
  '''
  Print and return the accuracy of a prediction file against its targets.

  The two files are read line by line (spaces removed) and paired by line
  position. A line counts as wrong when the prediction differs from the
  target form. Each target form is labelled "reg" or "irreg" by looking it up
  in the module-level `data` frame (columns "form" and "label").

  `data` holds one row per (lemma, task), so the same form can occur several
  times there. The lookup table is therefore de-duplicated on (form, label)
  before joining, so that every prediction line is counted once per label
  instead of once per matching row of `data`. The overall accuracy is computed
  on the paired lines themselves, so numerator and denominator refer to the
  same rows.

  Parameters
  ----------
  file_tgt : str
      Path of the file with the gold target forms.
  file_pre : str
      Path of the file with the predicted forms.

  Returns
  -------
  tuple of float
      (accuracy on "reg" forms, accuracy on "irreg" forms, accuracy on the
      whole set), each rounded to 4 decimal places. An accuracy is NaN when
      its subset is empty.
  '''
  df_pre = DataFrame_file(file_pre)
  df_tgt = DataFrame_file(file_tgt)
  df_set = pd.concat([df_pre,df_tgt],axis=1)
  df_set.columns = ["prediction","form"]

  # Decide right/wrong once per prediction line, before any join.
  is_wrong = df_set["prediction"] != df_set["form"]

  # One row per distinct (form, label) so the join cannot multiply lines.
  form_labels = data[["form","label"]].drop_duplicates()
  df_set_merged = pd.merge(df_set.assign(wrong=is_wrong), form_labels,
                           on="form", how="inner")

  reg_rows = df_set_merged.loc[df_set_merged["label"]=="reg"]
  ir_rows = df_set_merged.loc[df_set_merged["label"]=="irreg"]

  acc_reg = _accuracy(int(reg_rows["wrong"].sum()), len(reg_rows))
  acc_ir = _accuracy(int(ir_rows["wrong"].sum()), len(ir_rows))
  acc_set = _accuracy(int(is_wrong.sum()), len(df_set))

  print(f"accuracy of reg verbs: {acc_reg}")
  print(f"accuracy of irreg verbs: {acc_ir}")
  print(f"accuracy of the set: {acc_set}")

  return acc_reg, acc_ir, acc_set

# Target / prediction file for each split. The train targets are the tagged
# grapheme_2 file; the dev and test targets are read from grapheme_1.
tgt_train = "/grapheme_2/tgt_train_tagged.txt"
pre_train = "/grapheme_2/pre_train.txt"
tgt_valid = "/grapheme_1/tgt_valid.txt"
pre_valid = "/grapheme_2/pre_valid.txt"
tgt_test = "/grapheme_1/tgt_test.txt"
pre_test = "/grapheme_2/pre_test.txt"

# Report the accuracies split by split, in the order train, dev, test.
for split_name, tgt_path, pre_path in (
    ("train", tgt_train, pre_train),
    ("dev", tgt_valid, pre_valid),
    ("test", tgt_test, pre_test),
):
  print(split_name + " set:")
  results_set(tgt_path, pre_path)