## createDatasets

In [None]:
# -*- coding: utf-8 -*-
"""Match train/test split from experiment 1, by lemma."""
import codecs
import pickle as cPickle
import random
from collections import defaultdict
import pandas as pd

#fixed seed so the shuffled order of the training pairs is the same on every run
SHUFFLE_SEED = 0
#tag prepended to every experiment-1 valid/test source line
PST_PREFIX = '<V;PST> '
#morphological tag that is left out of the training pairs
SKIPPED_TAG = 'V;NFIN'

#valid and test sources: the experiment-1 lines, each prefixed with the tag.
#`with` closes both handles even if a read or write fails part-way through.
for src_path, dst_path in (
    (r'/experiment_1/src_valid.txt', r'/experiment_2/src_valid_tagged.txt'),
    (r'/experiment_1/src_test.txt', r'/experiment_2/src_test_tagged.txt'),
):
    with codecs.open(src_path, 'rb', 'utf-8') as fin, \
            codecs.open(dst_path, 'wb', 'utf-8') as fout:
        for line in fin:
            fout.write(PST_PREFIX + line)

#lemmas present in the experiment-1 train split; only rows whose lemma is in
#this set may become training pairs, which keeps the split matched by lemma
with codecs.open(r'/experiment_1/src_train.txt', 'rb', 'utf-8') as fin:
    ok_lemmas = {line.strip() for line in fin}

#read in data: fields 2, 3 and 4 of each row are the tag, the lemma and the
#inflected form; lemma and form are written with a space between characters
sources = []
targets = []
with codecs.open(r'/experiment_2/english_merged.txt', 'rb', 'utf-8') as fin:
    for line in fin:
        parts = line.strip().split()
        if len(parts) < 5:
            #blank or truncated row: no lemma/form to use
            continue
        tag = parts[2]
        lemma = ' '.join(parts[3])
        form = ' '.join(parts[4])
        #compare the bare tag: the bracketed '<tag> ' string can never equal
        #'V;NFIN', so testing that string would let every row through
        if tag != SKIPPED_TAG and lemma in ok_lemmas:
            sources.append('<' + tag + '> ' + lemma)
            targets.append(form)

pairs = list(zip(sources, targets))
#private Random instance: reproducible order without touching the global RNG
random.Random(SHUFFLE_SEED).shuffle(pairs)

#every remaining pair is training data; valid and test were written above
train = pairs

#write the outputs, source and target line-aligned
with codecs.open(r'/experiment_2/src_train_tagged.txt', 'wb', 'utf-8') as fout_src_train, \
        codecs.open(r'/experiment_2/tgt_train_tagged.txt', 'wb', 'utf-8') as fout_tgt_train:
    for s, t in train:
        fout_src_train.write(s + '\n')
        fout_tgt_train.write(t + '\n')

## Training

In [None]:
# Install OpenNMT-py before the onmt_* commands in the following cells are run.
# NOTE(review): the version is not pinned, so a later release may behave
# differently from the logged run - confirm the version used and pin it.
!pip install OpenNMT-py

In [None]:
# Build the src/tgt vocabularies for the corpora named in config.yaml.
# -n_sample 74700: the log below reports the counters being built from 74700 samples.
!onmt_build_vocab -config /experiment_2/config.yaml -n_sample 74700

Corpus corpus_1's weight should be given. We default it to 1 for you.
[2022-03-22 14:10:16,634 INFO] Counter vocab from 74700 samples.
[2022-03-22 14:10:16,634 INFO] Build vocab on 74700 transformed examples/corpus.
[2022-03-22 14:10:16,649 INFO] corpus_1's transforms: TransformPipe()
[2022-03-22 14:10:17,053 INFO] Counters src:43
[2022-03-22 14:10:17,054 INFO] Counters tgt:38


In [None]:
# Train the model with the settings in config.yaml.
# NOTE(review): the translate cells load run/model_step_74700.pt, presumably a
# checkpoint written by this run - confirm the step settings in config.yaml.
!onmt_train -config /experiment_2/config.yaml

[2022-03-22 14:10:23,315 INFO] Missing transforms field for corpus_1 data, set to default: [].
[2022-03-22 14:10:23,316 INFO] Missing transforms field for valid data, set to default: [].
[2022-03-22 14:10:23,317 INFO] Parsed 2 corpora from -data.
[2022-03-22 14:10:23,318 INFO] Get special vocabs from Transforms: {'src': set(), 'tgt': set()}.
[2022-03-22 14:10:23,319 INFO] Loading vocab from text file...
[2022-03-22 14:10:23,319 INFO] Loading src vocabulary from /content/drive/MyDrive/NLP/Project4/experiment_2/run/example.vocab.src
[2022-03-22 14:10:23,321 INFO] Loaded src vocab has 43 tokens.
[2022-03-22 14:10:23,321 INFO] Loading tgt vocabulary from /content/drive/MyDrive/NLP/Project4/experiment_2/run/example.vocab.tgt
[2022-03-22 14:10:23,322 INFO] Loaded tgt vocab has 38 tokens.
[2022-03-22 14:10:23,323 INFO] Building fields with vocab in counters...
[2022-03-22 14:10:23,323 INFO]  * tgt vocab size: 42.
[2022-03-22 14:10:23,323 INFO]  * src vocab size: 45.
[2022-03-22 14:10:23,323 I

In [None]:
# Translate the train-split sources with the step-74700 checkpoint; the
# predictions are written to pre_train.txt.
!onmt_translate -model /experiment_2/run/model_step_74700.pt -src /experiment_2/src_train_tagged.txt -output /experiment_2/pre_train.txt 

[2022-03-22 15:23:27,984 INFO] Translating shard 0.
  self._batch_index = self.topk_ids // vocab_size
[2022-03-22 15:23:50,623 INFO] PRED AVG SCORE: -0.0000, PRED PPL: 1.0000
[2022-03-22 15:23:50,635 INFO] Translating shard 1.
[2022-03-22 15:24:02,016 INFO] PRED AVG SCORE: -0.0000, PRED PPL: 1.0000


In [None]:
# Translate the dev-split (valid) sources with the step-74700 checkpoint; the
# predictions are written to pre_valid.txt.
!onmt_translate -model /experiment_2/run/model_step_74700.pt -src /experiment_2/src_valid_tagged.txt -output /experiment_2/pre_valid.txt 

[2022-03-22 15:24:06,458 INFO] Translating shard 0.
  self._batch_index = self.topk_ids // vocab_size
[2022-03-22 15:24:07,433 INFO] PRED AVG SCORE: -0.0003, PRED PPL: 1.0003


In [None]:
# Translate the test-split sources with the step-74700 checkpoint.
# The output is named pre_test.txt (underscore, like pre_train.txt and
# pre_valid.txt) because that is the path the analysis cell reads.
!onmt_translate -model /experiment_2/run/model_step_74700.pt -src /experiment_2/src_test_tagged.txt -output /experiment_2/pre_test.txt

[2022-03-22 15:24:11,299 INFO] Translating shard 0.
  self._batch_index = self.topk_ids // vocab_size
[2022-03-22 15:24:12,272 INFO] PRED AVG SCORE: -0.0007, PRED PPL: 1.0007


## Analysis

In [None]:
import pandas as pd
# Load the merged verb table (tab-separated, no header row).
# keep_default_na=False keeps every field as the literal text in the file:
# with the default, tokens such as "null", "nan" or "NA" are converted to NaN
# and those rows can then no longer be matched on their IPA_f string.
data = pd.read_csv(
    "/experiment_2/english_merged.txt",
    sep='\t',
    header=None,
    keep_default_na=False,
)
data.columns=["lemma","form","task","IPA_l","IPA_f","label"]
print(data[:5])

# These are row counts: one row per (lemma, task) entry, as the preview shows.
print("number of regular verbs:", len(data.loc[data["label"]=="reg"]))
print("number of irregular verbs:", len(data.loc[data["label"]=="irreg"]))

  lemma     form          task IPA_l   IPA_f label
0  fawn   fawned         V;PST  fO:n   fO:nd   reg
1  fawn   fawned  V;V.PTCP;PST  fO:n   fO:nd   reg
2  fawn  fawning  V;V.PTCP;PRS  fO:n  fO:nIN   reg
3  fawn    fawns    V;3;SG;PRS  fO:n   fO:nz   reg
4  fawn     fawn        V;NFIN  fO:n    fO:n   reg
number of regular verbs: 18140
number of irregular verbs: 530


In [None]:
def DataFrame_file(filename):
  '''
  Read a text file into a one-column DataFrame with one row per line.

  Each line has its newline stripped and every space removed, so a
  space-separated character sequence becomes one contiguous string.

  filename: path of the text file to read.
  Returns a DataFrame whose single column holds the cleaned lines.
  '''
  # Decode explicitly as UTF-8 (the dataset cell writes its files as UTF-8)
  # instead of relying on the platform's default encoding.
  with open(filename,'r',encoding='utf-8') as f:
      list_file = [line.strip('\n').replace(" ","") for line in f]
  return pd.DataFrame(list_file)

def results_set(file_tgt,file_pre):
  '''
  Score one data split: print and return the accuracy on regular verbs, on
  irregular verbs, and on the whole set.

  file_tgt: path of the gold target forms, one per line.
  file_pre: path of the predicted forms, line-aligned with file_tgt.

  Returns (acc_reg, acc_irreg, acc_set), each rounded to 4 decimals. A class
  with no examples in the set gives float("nan") instead of raising.

  Uses the module-level `data` frame to look up the reg/irreg label of each
  gold form through its IPA_f string.
  '''
  df_pre = DataFrame_file(file_pre)
  df_tgt = DataFrame_file(file_tgt)
  df_set = pd.concat([df_pre,df_tgt],axis=1)
  df_set.columns = ["prediction","IPA_f"]

  def _accuracy(n_wrong, n_total):
    # share of correct items; NaN when there is nothing to score
    if not n_total:
      return float("nan")
    return round(1-n_wrong/n_total,4)

  # Keep one row per distinct (form, label) before merging. Joining against
  # the full table would repeat a scored line once for every table row that
  # shares its form (e.g. the same form under two tasks) and skew the counts.
  # A form carrying both labels is still counted once under each label.
  form_labels = data[["IPA_f","label"]].drop_duplicates()
  df_set_merged = pd.merge(df_set, form_labels, on="IPA_f")
  df_set_wrong = df_set_merged.loc[df_set_merged["prediction"]!=df_set_merged["IPA_f"]]

  total_reg = len(df_set_merged.loc[df_set_merged["label"]=="reg"])
  total_ir = len(df_set_merged.loc[df_set_merged["label"]=="irreg"])
  wrong_reg = len(df_set_wrong.loc[df_set_wrong["label"]=="reg"])
  wrong_ir = len(df_set_wrong.loc[df_set_wrong["label"]=="irreg"])

  # Whole-set accuracy is taken from the unmerged lines so that numerator and
  # denominator both count exactly one item per prediction.
  wrong_set = int((df_set["prediction"]!=df_set["IPA_f"]).sum())

  acc_reg = _accuracy(wrong_reg, total_reg)
  acc_ir = _accuracy(wrong_ir, total_ir)
  acc_set = _accuracy(wrong_set, len(df_set))

  print(f"accuracy of reg verbs: {acc_reg}")
  print(f"accuracy of irreg verbs: {acc_ir}")
  print(f"accuracy of the set: {acc_set}")
  return acc_reg, acc_ir, acc_set

# Gold-target and prediction files for each split. The train targets are the
# experiment_2 tagged file; the valid and test targets come from experiment_1.
tgt_train = "/experiment_2/tgt_train_tagged.txt"
pre_train = "/experiment_2/pre_train.txt"
tgt_valid = "/experiment_1/tgt_valid.txt"
pre_valid = "/experiment_2/pre_valid.txt"
tgt_test = "/experiment_1/tgt_test.txt"
pre_test = "/experiment_2/pre_test.txt"

# Score the splits in order: heading first, then the accuracies for that split.
for heading, gold_path, pred_path in (
    ("train set:", tgt_train, pre_train),
    ("dev set:", tgt_valid, pre_valid),
    ("test set:", tgt_test, pre_test),
):
  print(heading)
  results_set(gold_path, pred_path)