## createDatasets

In [None]:
# -*- coding: utf-8 -*-

import codecs
import pickle as cPickle
import random
from collections import defaultdict
import pandas as pd

#fixed seed so that re-running this cell reproduces exactly the same split
SEED = 42

#read in data
#each line is split on whitespace and fields 3 and 4 are used; the Analysis
#section of this notebook names those columns IPA_l and IPA_f
sources = []
targets = []

with codecs.open('/experiment_1/english_merged.txt','rb','utf-8') as fin:
	for line in fin:
		parts = line.strip().split()
		if not parts:
			#blank line (e.g. an empty last line): nothing to add
			continue
		lemma = parts[2]
		form = parts[3]
		#insert a space between every character of the transcription
		sources.append(' '.join(lemma))
		targets.append(' '.join(form))

pairs = list(zip(sources,targets))
#private generator: the split is reproducible and the global random state is untouched
random.Random(SEED).shuffle(pairs)

#split into train, valid and test (8-1-1)
train_end = int(.8*len(pairs))
valid_end = int(.9*len(pairs))
train = pairs[:train_end]
valid = pairs[train_end:valid_end]
test = pairs[valid_end:]

#write the outputs
#every split goes to a line-aligned src_<split>.txt / tgt_<split>.txt pair;
#the files are only created once the input was read successfully, and the
#with-block closes both of them even if a write fails
for split_name, split_pairs in (('train',train),('valid',valid),('test',test)):
	src_path = '/experiment_1/src_' + split_name + '.txt'
	tgt_path = '/experiment_1/tgt_' + split_name + '.txt'
	with codecs.open(src_path,'wb','utf-8') as fout_src, \
			codecs.open(tgt_path,'wb','utf-8') as fout_tgt:
		for s,t in split_pairs:
			fout_src.write(s + '\n')
			fout_tgt.write(t + '\n')

## Training

In [None]:
!pip install OpenNMT-py

In [None]:
# Build the vocabulary for the setup described in /experiment_1/config.yaml.
# NOTE(review): -n_sample 16200 presumably bounds how many examples are sampled
# while building the vocabulary — confirm against the OpenNMT-py documentation.
!onmt_build_vocab -config /experiment_1/config.yaml -n_sample 16200

In [None]:
# Train the model with the settings in /experiment_1/config.yaml.
!onmt_train -config /experiment_1/config.yaml 

train set

In [None]:
# Translate src_train.txt with the step-16155 checkpoint; predictions go to pre_train.txt.
!onmt_translate -model /experiment_1/run/model_step_16155.pt -src /experiment_1/src_train.txt -output /experiment_1/pre_train.txt

[2022-03-23 09:45:09,328 INFO] Translating shard 0.
  self._batch_index = self.topk_ids // vocab_size
[2022-03-23 09:45:15,740 INFO] PRED AVG SCORE: -0.0002, PRED PPL: 1.0002


validation set

In [None]:
# Translate src_valid.txt with the step-16155 checkpoint; predictions go to pre_valid.txt.
!onmt_translate -model /experiment_1/run/model_step_16155.pt -src /experiment_1/src_valid.txt -output /experiment_1/pre_valid.txt

[2022-03-23 09:45:19,371 INFO] Translating shard 0.
  self._batch_index = self.topk_ids // vocab_size
[2022-03-23 09:45:20,239 INFO] PRED AVG SCORE: -0.0006, PRED PPL: 1.0006


test set

In [None]:
# Translate src_test.txt with the step-16155 checkpoint; predictions go to pre_test.txt.
!onmt_translate -model /experiment_1/run/model_step_16155.pt -src /experiment_1/src_test.txt -output /experiment_1/pre_test.txt

[2022-03-23 09:45:24,263 INFO] Translating shard 0.
  self._batch_index = self.topk_ids // vocab_size
[2022-03-23 09:45:25,113 INFO] PRED AVG SCORE: -0.0010, PRED PPL: 1.0010


## Analysis

In [None]:
import pandas as pd
# Load the merged verb list: tab-separated, no header row, five columns.
data = pd.read_csv("/experiment_1/english_merged.txt", header=None, sep='\t')
data.columns = ["lemma", "form", "IPA_l", "IPA_f", "label"]
print(data.head())

# Class sizes, counted from the "label" column.
print("number of regular verbs:", int(data["label"].eq("reg").sum()))
print("number of irregular verbs:", int(data["label"].eq("irreg").sum()))

        lemma          form      IPA_l       IPA_f label
0        fawn        fawned       fO:n       fO:nd   reg
1  understudy  understudied  Vnd@stVdI  Vnd@stVdId   reg
2      pardon      pardoned     pA:dn,     pA:dn,d   reg
3       retch       retched      ri:tS      ri:tSt   reg
4  predestine   predestined  prIdEstIn  prIdEstInd   reg
number of regular verbs: 3871
number of irregular verbs: 168


In [None]:
def DataFrame_file(filename):
  '''
  Read a text file into a one-column DataFrame, one row per line.

  Each line has its newline removed and every space deleted, so a
  space-separated sequence such as "f O : n d" becomes "fO:nd".
  The file is decoded as UTF-8 (the split files of this notebook are
  written as UTF-8) instead of the platform's default encoding.

  filename: path of the text file to read
  returns:  pandas DataFrame with a single column (named 0) of strings
  '''
  with open(filename, 'r', encoding='utf-8') as f:
    list_file = [line.strip('\n').replace(" ", "") for line in f]
  return pd.DataFrame(list_file)

def results_set(file_tgt, file_pre, verbs=None):
  '''
  Print the accuracy of one data split: regular verbs, irregular verbs,
  and the split as a whole.

  file_tgt: path of the gold target file, one form per line
  file_pre: path of the prediction file, line-aligned with file_tgt
  verbs:    DataFrame with at least the columns "IPA_f" and "label"
            ("reg" / "irreg"); defaults to the notebook-level `data` table

  Nothing is returned; the three accuracies are printed, rounded to 4
  decimals. A class with no verbs in the split is reported as nan.
  Raises ValueError when the two files differ in their number of lines.
  '''
  if verbs is None:
    verbs = data  # verb table loaded earlier in the notebook

  df_pre = DataFrame_file(file_pre)
  df_tgt = DataFrame_file(file_tgt)
  if len(df_pre) != len(df_tgt):
    # concat would silently pad the shorter file with NaN rows
    raise ValueError(
      f"{file_pre} has {len(df_pre)} lines but {file_tgt} has {len(df_tgt)}")
  df_set = pd.concat([df_pre,df_tgt],axis=1)
  df_set.columns = ["prediction","IPA_f"]

  # Attach the class label through the target form. Only the (form, label)
  # pairs are used, de-duplicated, so that several lemmas sharing one form do
  # not multiply the rows of the split. A form that carries both labels still
  # appears once per label.
  labels = verbs[["IPA_f","label"]].drop_duplicates()
  df_set_merged = pd.merge(df_set, labels, on="IPA_f")

  is_wrong = df_set_merged["prediction"] != df_set_merged["IPA_f"]
  is_reg = df_set_merged["label"] == "reg"
  is_ir = df_set_merged["label"] == "irreg"

  def _accuracy(wrong, total):
    # nan instead of ZeroDivisionError when the split has no such verbs
    return round(1 - wrong / total, 4) if total else float("nan")

  total_reg = int(is_reg.sum())
  total_ir = int(is_ir.sum())
  wrong_reg = int((is_wrong & is_reg).sum())
  wrong_ir = int((is_wrong & is_ir).sum())

  # The overall figure needs no labels, so it is computed on the line-aligned
  # pairs themselves: numerator and denominator cover the same rows.
  wrong_all = int((df_set["prediction"] != df_set["IPA_f"]).sum())

  print(f"accuracy of reg verbs: {_accuracy(wrong_reg, total_reg)}")
  print(f"accuracy of irreg verbs: {_accuracy(wrong_ir, total_ir)}")
  print(f"accuracy of the set: {_accuracy(wrong_all, len(df_set))}")

# Gold targets and model predictions for each split.
tgt_train = "/experiment_1/tgt_train.txt"
pre_train = "/experiment_1/pre_train.txt"
tgt_valid = "/experiment_1/tgt_valid.txt"
pre_valid = "/experiment_1/pre_valid.txt"
tgt_test = "/experiment_1/tgt_test.txt"
pre_test = "/experiment_1/pre_test.txt"

# Report the splits in the order train, dev, test: heading first, then accuracies.
for heading, gold_path, predicted_path in (
  ("train set:", tgt_train, pre_train),
  ("dev set:", tgt_valid, pre_valid),
  ("test set:", tgt_test, pre_test),
):
  print(heading)
  results_set(gold_path, predicted_path)

train set:
accuracy of reg verbs: 0.9987
accuracy of irreg verbs: 0.9855
accuracy of the set: 0.9981
dev set:
accuracy of reg verbs: 0.9876
accuracy of irreg verbs: 0.1
accuracy of the set: 0.9653
test set:
accuracy of reg verbs: 0.9923
accuracy of irreg verbs: 0.2273
accuracy of the set: 0.9505
