<a href="https://colab.research.google.com/github/Hbasgol/ibm_models/blob/master/IBM_Model2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:

###################### modules/libraries #######################################
#
# import os -> to see the path of folders and libraries
#
# import time -> to learn the timing of each step of expectation-maximization
#
# from google.colab import drive -> to connect google drive
#
# import codecs -> to save Turkish characters without problem
#
# import json -> to save dictionaries structured for representing tables as
#   .json documents, json.dump is used for writing and json.loads is used for
#   reading .json strings as dictionaries
#
# from itertools import product
# from itertools import permutations -> to find possible alignments between
#   English and Turkish phrases that structure corresponding sentences
#
# import numpy as np -> for simple mathematical operations such as taking absolute
#   of a number with np.abs or summing values in a list with np.sum
#
# import unicodedata -> for removing Turkish characters in the corpus
#
################################################################################

import os
import time
from google.colab import drive
import codecs
import json
from itertools import permutations
from itertools import product
import numpy as np
import unicodedata


################################################################################


###################### connectcolab ############################################
# the function connectcolab is used to receive Google Drive documents and
#   determine the path the files are written to.
################################################################################
def connectcolab():
  """Mount Google Drive at /content/drive, forcing a remount on every call."""
  mount_point = '/content/drive'
  drive.mount(mount_point, force_remount=True)
# Drive folder used by save_t_table, save_alignments, read_alignments and
# save_viterbi_alignments for the IBM Model 2 .json files.
path = "/content/drive/My Drive/Colab Notebooks/Machine Translation/IBM2-tables/"
# Drive folder from which get_sample reads english.txt and turkish.txt.
corpus_path = "/content/drive/My Drive/Colab Notebooks/Machine Translation/corpus/"
# Mount Drive as soon as the cell runs so both folders are reachable.
connectcolab()
################################################################################


###################### tokenization ############################################
# takes sentencelist such as [["first sentence"], ["second sentence"], ["third sentence"] ...] 
#   two operators named op1 and op2
#   op1 is used to determine the translation direction whether English to Turkish
#    or Turkish to English, because IBM 1 and IBM 2 should be run bi-directional
#    to get word-alignments for phrase extraction in phrase-based translation
#   op2 is used to determine the language of sentences because Turkish characters
#    are removed due to the inconsistency in the corpus. For removing Turkish
#    characters, a helper function named rm_turkish is used
# The function
#  returns [["first", "sentence"], ["second", "sentence"], ["third", "sentence"] ...]
#  or
#  returns  [["NULL", "first", "sentence"], ["NULL", "second", "sentence"] ...]
################################################################################

def tokenization(sentencelist, op1, op2):
  """Split every sentence on single spaces and prepend "NULL" where required.

  sentencelist: list of sentence strings.
  op1: translation direction, "t2e" or "e2t".
  op2: language of the sentences, "t" (Turkish) or "e" (English).

  Turkish words are passed through rm_turkish. "NULL" is put in front of
  the Turkish sentences for "t2e" and in front of the English sentences
  for "e2t". Any other value of op1 or op2 gives None.
  """
  if op1 not in ("t2e", "e2t") or op2 not in ("t", "e"):
    return None
  needs_null = (op1, op2) in (("t2e", "t"), ("e2t", "e"))
  prefix = ["NULL"] if needs_null else []
  tokenized = []
  for sentence in sentencelist:
    words = sentence.split(" ")
    if op2 == "t":
      words = [rm_turkish(w) for w in words]
    tokenized.append(prefix + words)
  return tokenized
    
################################################################################


#################### rm_turkish ################################################
# the function takes a word that is a string data type and change Turkish
# characters into English counterparts.
#
# takes uçuyorum -> returns ucuyorum
# şenlik -> senlik
#
# to remove the ambiguity of the corpus in terms of Turkish sentences
#
# the function is used in another function named tokenization
################################################################################

def rm_turkish(word):
  """Return word with combining marks stripped after NFD decomposition.

  Letters that decompose into a base letter plus a mark lose the mark.
  Characters without a canonical decomposition, such as the dotless
  'ı', pass through unchanged.
  """
  decomposed = unicodedata.normalize('NFD', word)
  return "".join(ch for ch in decomposed if unicodedata.combining(ch) == 0)

################################################################################



###################### get_words ###############################################
# get_words takes one argument: BU_sentences, a list of tokenized sentences
#    (the English or the Turkish side, which does not affect the code)
#    such as [["first", "sentence"], ["second", "sentence"], ["third", "sentence"], ...]
#
# get_words returns the unique words as one flat list, in no particular order,
#    such as ["first", "sentence", "second", "third"]
################################################################################

def get_words(BU_sentences):
  """Return the distinct words of all sentences as a list (set order)."""
  return list({word for sentence in BU_sentences for word in sentence})

################################################################################
 

########################## save/read_t_table functions #########################
# save_t_table takes a specific dictionary, involving t_tables and writes it to
#    the disk as a .json file with the name of "IBM2-t_table.json"
# read_t_table takes the operator op and reads "IBM1-t_table"+op+".json" from
#    the IBM1-tables folder as a dictionary for further use
# save_alignments takes a dictionary, involving alignment probabilities writes
#    it to the disk as a .json file with the name of "IBM2-alignments"
# read_alignments takes the operator op and reads "IBM2-alignments"+op+".json"
#    as a dictionary (with integer keys) for further use
# save_viterbi_alignments takes a dictionary, max alignments and writes it to
#    the disk with the name of "IBM2-viterbi.json" to use viterbi alignments
#    in phrase-based translation model
# 
# Each function has an operator named op to determine the direction of translation
#   op can be "t2e" for Turkish to English and "e2t" for English to Turkish
#   translation
################################################################################

def save_t_table(t_tables, op):
  """Write t_tables as UTF-8 JSON to path + "IBM2-t_table" + op + ".json"."""
  target = path + "IBM2-t_table" + op + ".json"
  with codecs.open(target, mode='w', encoding='utf-8') as out:
    json.dump(t_tables, out, indent=2, ensure_ascii=False)
    
def read_t_table(op):
  """Load the IBM Model 1 translation table for direction op.

  Reads "IBM1-t_table" + op + ".json" from the IBM1-tables Drive folder
  and returns the parsed dictionary. The file is decoded explicitly as
  UTF-8 so non-ASCII words are read the same way on every platform
  instead of depending on the locale's default encoding.

  NOTE(review): this reads the Model 1 table, not the file written by
  save_t_table; presumably Model 2 is meant to start from Model 1's
  translation probabilities. Confirm before using it to resume training.
  """
  # Separate name so the module-level `path` (IBM2 folder) is not shadowed.
  ibm1_dir = "/content/drive/My Drive/Colab Notebooks/Machine Translation/IBM1-tables/"
  with codecs.open(ibm1_dir+"IBM1-t_table"+op+".json", 'r', 'utf-8') as f:
    return json.load(f)

def save_alignments(alignments, op):
  """Write alignments as UTF-8 JSON to path + "IBM2-alignments" + op + ".json"."""
  target = path + "IBM2-alignments" + op + ".json"
  with codecs.open(target, mode='w', encoding='utf-8') as out:
    json.dump(alignments, out, indent=2, ensure_ascii=False)
    
def read_alignments(op):
  """Load path + "IBM2-alignments" + op + ".json" and restore integer keys.

  JSON object keys are strings, so all four nesting levels
  (i, j, le, lf) are converted back to int. The innermost values are
  returned as parsed.
  """
  with codecs.open(path+"IBM2-alignments"+op+".json") as f:
    raw = json.loads(f.read())
  converted = {}
  for i, by_j in raw.items():
    i_key = int(i)
    j_level = {}
    for j, by_le in by_j.items():
      j_key = int(j)
      le_level = {}
      for le, by_lf in by_le.items():
        le_key = int(le)
        le_level[le_key] = {int(lf): prob for lf, prob in by_lf.items()}
      j_level[j_key] = le_level
    converted[i_key] = j_level
  return converted
  
def save_viterbi_alignments(max_alignments, op):
  """Write max_alignments as UTF-8 JSON to path + "IBM2-viterbi" + op + ".json"."""
  target = path + "IBM2-viterbi" + op + ".json"
  with codecs.open(target, mode="w", encoding='utf-8') as out:
    json.dump(max_alignments, out, indent=2, ensure_ascii=False)
    
################################################################################


############################## get_t_table #####################################
# the function takes two arguments
#    (english_list_train, turkish_list_train)
#    english_list_train: english sentences such as
#       [["first", "sentence"], ["second", "sentence"], ["third", "sentence"], ...]
#    turkish_list_train: turkish sentences, respectively
#
# the function, throughout the execution, creates six tables as dictionaries
#    count_tables, total_tables, s_totals, alignments, count_alignments, 
#    total_alignments, normally, algorithm loops on all english and turkish 
#    words, however, for memory issues, sentence matches are considered here
#    all respective values are added respective dictionaries
#    -alignments table is initialized uniformly with 1/lf+1, where lf is 
#        the length of foreign sentence f, turkish in this case.
#    -count_tables, total_tables, s_totals, alignments, count_alignments, 
#        total_alignments are set to 0.
#
# the function returns one arguments,
#    tablelist: the list of tables mentioned above
#
#    t_table is a dictionary like
#    t_table = {"Hello":
#                {Book: 0.32}
#                {Furniture: 0.12} ...}
#    0.32 by which is received t_table["Hello"]["Book"] count_tables is same
#
#    s_totals is like
#    s_totals = {"Hello": 0.218,
#                "Furniture": 0.45 ...} total tables is same
#
#    alignments is a dictionary like
#    (i, j, le, lf) --> set((1, 2, 5, 7), (1, 2, 8, 5).....)
#    alignments = {i: {j: {le: {lf: value} ...} ...} ...}
#               --> {1: {2: {5: {7: value} ...} ...} ...}
#    which values can be received by alignments[1][2][5][7]
# other alignment tables are structured like the alignments table
# such as total alignments, one can receive 
# total alignments = {j: {le: {lf: value} ...} ...}
#                totalalignments[j][le][lf]
################################################################################

def get_t_table(english_list_train, turkish_list_train):
  """Create the count/total/alignment tables for the sentence pairs.

  english_list_train: tokenized sentences of the "e" side.
  turkish_list_train: tokenized sentences of the "f" side; the sentence
    at the same index is paired with each "e" sentence.

  Returns [count_tables, total_tables, s_totals, alignments,
  count_alignments, total_alignments]. Only word pairs and positions
  that co-occur in a sentence pair get an entry. Every entry is 0,
  except alignments[i][j][le][lf], which starts at 1/(lf+1).
  """
  count_tables, total_tables, s_totals = {}, {}, {}
  alignments, count_alignments, total_alignments = {}, {}, {}
  for index_e, e_sentence in enumerate(english_list_train): # for each english sentence
    f_sentence = turkish_list_train[index_e] # foreign sentence with the same index
    le = len(e_sentence)
    lf = len(f_sentence)
    # Parenthesised on purpose: 1/lf+1 evaluates to (1/lf)+1, which is above 1.
    uniform = 1/(lf+1)
    for j, e_word in enumerate(e_sentence):
      total_alignments.setdefault(j, {}).setdefault(le, {})[lf] = 0
      e_counts = count_tables.setdefault(e_word, {})
      s_totals[e_word] = 0
      for i, f_word in enumerate(f_sentence):
        alignments.setdefault(i, {}).setdefault(j, {}).setdefault(le, {})[lf] = uniform
        count_alignments.setdefault(i, {}).setdefault(j, {}).setdefault(le, {})[lf] = 0
        total_tables[f_word] = 0
        e_counts[f_word] = 0

  tablelist = [count_tables, total_tables, s_totals, alignments, count_alignments, total_alignments]
  print("tables have been created and saved")
  return tablelist

################################################################################


############################## to_zero #########################################
# the function takes four arguments, which are tables structured as nested
#   dictionaries. The tables are 
#   count_tables, total_tables, count_alignments, total_alignments
#
# it changes the values of count_tables, total_tables, count_alignments, 
#   total_alignments to be 0, which is a step of expectation_maximization.
#
# Then, it returns these tables as a list:
#   [count_tables, total_tables, count_alignments, total_alignments]
################################################################################

def to_zero(count_tables, total_tables, count_alignments, total_alignments):
  """Reset the four count tables to 0 in place and return them as a list.

  total_tables is reset for every f_word that appears in count_tables.
  total_alignments is reset for every (j, le, lf) found in
  count_alignments. Other entries are left as they are.
  """
  for f_counts in count_tables.values():
    for f_word in f_counts:
      total_tables[f_word] = 0
      f_counts[f_word] = 0
  for by_j in count_alignments.values():
    for j, by_le in by_j.items():
      for le, by_lf in by_le.items():
        for lf in by_lf:
          by_lf[lf] = 0
          total_alignments[j][le][lf] = 0
  return [count_tables, total_tables, count_alignments, total_alignments]

################################################################################


############################### get_sample #####################################
# The function takes two arguments: sample, which determines the number of
#   sentences which will be used, and op1, the translation direction that is
#   passed on to tokenization(). The first `sample` sentences
#   are then separated into train and test sets with a ratio of 9/1
#
# The function returns english_train, turkish_train, english_test, turkish_test, 
#   english_word, turkish_word
#   english_word, turkish_word: unique words in the corpus
#   english_train: english sentences the model to be trained as a list of list
#   turkish_train: turkish sentences the model to be trained as a list of list
#   english_test: english sentences the model to be tested as a list of list
#   turkish_test: turkish sentences the model to be tested as a list of list
#
# The function named connectcolab() used in this function to connect Google Drive
# The function named tokenization() tokenizes sentences
# The function named get_words() is used to get unique words
################################################################################

def get_sample(sample, op1):
  """Read the corpus, tokenize the first `sample` pairs and split train/test.

  sample: number of leading corpus lines to keep from each file.
  op1: translation direction handed to tokenization().

  Returns english_train, turkish_train, english_test, turkish_test,
  english_word, turkish_word. The first int(sample*0.9) sentences form
  the training part and the rest the test part. The word lists hold the
  unique words of the whole sample (train and test).
  """
  connectcolab()

  with open(corpus_path+"english.txt", "r") as english:
    english_list = [line.rstrip() for line in english]
  with open(corpus_path+"turkish.txt", "r") as turkish:
    turkish_list = [line.rstrip() for line in turkish]

  english_list = english_list[:sample]
  turkish_list = turkish_list[:sample]
  rtrain = int(sample*0.9)

  english_list = tokenization(english_list, op1, "e")
  turkish_list = tokenization(turkish_list, op1, "t")
  english_word, turkish_word = get_words(english_list), get_words(turkish_list)

  english_train, english_test = english_list[:rtrain], english_list[rtrain:]
  turkish_train, turkish_test = turkish_list[:rtrain], turkish_list[rtrain:]
  del english_list, turkish_list
  print("sentences for training have been processed")
  return english_train, turkish_train, english_test, turkish_test, english_word, turkish_word

################################################################################


############################ viterbi_alignment #################################
# a part of the application project is implementing a phrase-based model
#    to find the phrases, corpus should be word-aligned; to do this,
#    viterbi algorithm should be run in expectation maximization algorithm
#
# viterbi_alignment takes english_train, turkish_train, t_tables, alignments, op
#    english_train: list of English sentences structured as list of list
#    turkish_train: list of Turkish sentences structured as list of list
#    t_tables: a dictionary involving translation probabilities
#    alignments: a dictionary involving alignment probabilities
#    op: an operator for determining the translation direction and saving
#      it can be "t2e" or "e2t"
#
# the function saves viterbi alignments as a .json file, with the help of
#    save_viterbi_alignments() function, and also returns them as a dictionary
################################################################################

def viterbi_alignment(english_train, turkish_train, t_tables, alignments, op):
  """Pick, for each "e" position, the "f" position with the highest score.

  The score of position i for position j is
  t_tables[e_word][f_word] * alignments[i][j][le][lf]. The first i with
  the highest score wins.

  Returns {sentence index: {j: best i}} and also stores it through
  save_viterbi_alignments(total_a, op).
  """
  total_a = {}
  for e_index, e_sentence in enumerate(english_train):
    f_sentence = turkish_train[e_index]
    le, lf = len(e_sentence), len(f_sentence)
    max_a = {}
    for j, e_word in enumerate(e_sentence):
      best_i, best_val = 0, -1
      for i, f_word in enumerate(f_sentence):
        score = t_tables[e_word][f_word]*alignments[i][j][le][lf]
        if best_val < score:
          best_i, best_val = i, score
        # written on every pass, so j gets no entry when f_sentence is empty
        max_a[j] = best_i
    total_a[e_index] = max_a
  save_viterbi_alignments(total_a, op)
  return total_a
  
################################################################################


############################ perplexity ########################################
# The function takes four arguments:
#   english_train: English sentences
#   turkish_train: Turkish sentences
#   t_tables: translation probability table, word-based
#   alignments: alignments
#
# It returns perplexity, which is a float number representing how well the
#   probabilities that have been found are suited for the data
#   In each iteration of the expectation maximization algorithm, perplexity
#   reduces. If it does not change through iterations, this means that 
#   the algorithm converged.
#
# This function does not compute the whole perplexity formula, because it is
#   computationally inefficient, rather than computing the translation probability
#   of a sentence, it looks argmax for each word. Although the computation
#   is not same, it gives an idea regarding the convergence of the algorithm
################################################################################

def perplexity(english_train, turkish_train, t_tables, alignments):
  """Return a cheap convergence score for the current tables.

  For each sentence pair the largest product
  t_tables[e_word][f_word] * alignments[i][j][le][lf] over all (j, i)
  is taken, and the negated sum of the logs of these maxima is returned.
  This is not the full perplexity formula.

  A sentence pair is skipped when a word or position is missing from the
  tables (KeyError) or when no (j, i) pair has an alignment entry for
  (le, lf), which makes max() raise ValueError. Any other exception,
  including KeyboardInterrupt, now propagates instead of being swallowed.
  """
  s = 0
  for index_e, e_sentence in enumerate(english_train):
    f_sentence = turkish_train[index_e]
    le = len(e_sentence)
    lf = len(f_sentence)
    try:
      best = max(t_tables[e_word][f_word] * alignments[i][j][le][lf]
                 for j, e_word in enumerate(e_sentence)
                 for i, f_word in enumerate(f_sentence)
                 if le in alignments[i][j]
                 if lf in alignments[i][j][le])
    except (KeyError, ValueError):
      continue  # best effort: this pair cannot be scored with the tables
    s += np.log(best)
  return -s

################################################################################


###################### expectation_maximization ################################
# The function takes english_word, turkish_word, english_train, turkish_train, tablelist, 
#   t_tables and op
#   english_word: a list of english words in the corpus
#   turkish_word: a list of turkish words in the corpus
#   english_train: english sentences the model to be trained as a list of list
#   turkish_train: turkish sentences the model to be trained as a list of list
#   tablelist: is a list of dictionaries created by get_t_table
#      count_tables, total_tables, s_totals, alignments, count_alignments, 
#      total_alignments
#   t_tables: is a dictionary, which involves translation probabilities
#   alignment_set: is a dictionary involving all alignments in the data set
#
# The function returns translation probabilities named t_table, 
#   alignment probabilities named alignments, which will be used to test the
#   model
#
# Throughout the execution, the function looks at perplexity and saves
#   t_tables, alignments and max_alignments (viterbi alignments) as .json
#   file format.
################################################################################

def expectation_maximization(english_word, turkish_word, english_train, turkish_train, tablelist, t_tables, op):
  """Run EM for IBM Model 2 until the perplexity score stops decreasing.

  english_word, turkish_word: word lists, used only for the summary print.
  english_train, turkish_train: tokenized sentences of the "e" and "f"
    side; sentences with the same index form a pair.
  tablelist: [count_tables, total_tables, s_totals, alignments,
    count_alignments, total_alignments] as built by get_t_table.
  t_tables: translation probabilities t_tables[e_word][f_word].
  op: suffix used in the names of the saved .json files.

  Returns (t_tables, alignments, max_alignment, rv_table, rv_alignments)
  from the last step. Tables and Viterbi alignments are saved every third
  step and once more when training stops.
  """
  [count_tables, total_tables, s_totals, alignments, count_alignments, total_alignments] = tablelist
  count = 1                                      # to count how many steps taken
  # Start from +inf so the first step can never count as "no improvement".
  lastval = float("inf")
  print("number of en sentences {}, tr sentences {}, en words {}, tr words {}".format(
      len(english_train), len(turkish_train), len(english_word), len(turkish_word)))
  while True:
    print("✖ step {}".format(count))
    start = time.time()                          # for timing each step, start
    if count > 1:
      [count_tables, total_tables, count_alignments, total_alignments] \
      = to_zero(count_tables, total_tables, count_alignments, total_alignments)

    # E-step: collect fractional counts sentence pair by sentence pair.
    for index_e, e_sentence in enumerate(english_train):
      f_sentence = turkish_train[index_e]
      le = len(e_sentence)
      lf = len(f_sentence)
      # NOTE(review): s_totals is keyed by word, so a word that occurs twice
      # in one sentence uses the normaliser of its last position for both.
      for j, e_word in enumerate(e_sentence):
        s_totals[e_word] = 0
        for i, f_word in enumerate(f_sentence):
          s_totals[e_word] += t_tables[e_word][f_word]*alignments[i][j][le][lf]
      for j, e_word in enumerate(e_sentence):
        for i, f_word in enumerate(f_sentence):
          c = t_tables[e_word][f_word]*alignments[i][j][le][lf]/s_totals[e_word]
          count_tables[e_word][f_word] += c
          count_alignments[i][j][le][lf] += c
          total_tables[f_word] += c
          total_alignments[j][le][lf] += c

    # M-step: re-estimate both tables from the collected counts.
    t_tables.update({e_word: {f_word:
                         count_tables[e_word][f_word]/total_tables[f_word]
                         for f_word in t_tables[e_word]}
                     for e_word in t_tables})
    alignments.update({i: {j: {le: {lf:
                              count_alignments[i][j][le][lf]
                              /total_alignments[j][le][lf]
                               for lf in alignments[i][j][le]}
                           for le in alignments[i][j]}
                       for j in alignments[i]}
                   for i in alignments})

    # The reversed tables are part of the return value, so they must stay
    # bound; deleting them here made the final return fail.
    rv_table, t_tables = normalize(t_tables)
    rv_alignments, alignments = normalize_alignments(alignments)
    s = perplexity(english_train, turkish_train, t_tables, alignments)
    print("perplexity: {}, difference in perplexity: {}".format(s, s-lastval))

    # Stop unless the score strictly improved; this also ends on inf/nan.
    converged = not s < lastval
    if converged or count % 3 == 0:
      max_alignment = viterbi_alignment(english_train, turkish_train, t_tables, alignments, op)
      save_t_table(t_tables, op)
      save_alignments(alignments, op)
    if converged:
      break

    lastval = s
    count += 1
    end = time.time()
    print("execution time for one step: {}".format(end-start))
  return t_tables, alignments, max_alignment, rv_table, rv_alignments

################################################################################


################################ prob_al #######################################
# The function takes five arguments:
#   t_tables, alignments, t_sentence as a Turkish sentence
#   e_sentence as an English sentence, op an operator to determine the direction
#   of translation.
#
# The function finds all probable alignments between source and target sentences
#   and returns a normalized probability for each of them with a nested dictionary
# Additionally, it sums up probabilities of all probable alignments to find
#   the translation probability of source and target sentence
################################################################################

def prob_al(t_tables, alignments, t_sentence, e_sentence, op):
  """Score candidate word alignments between two sentences.

  t_tables: translation probabilities t_tables[target_word][source_word].
  alignments: alignments[i][j][le][lf] with i a source position, j a
    target position, le the target length and lf the source length.
  t_sentence, e_sentence: Turkish and English sentence strings.
  op: "t2e" makes Turkish the source; anything else makes English the source.

  Returns (sortedali, p_sum). p_sum is the sum of the unnormalised
  probabilities of all candidates with non-zero probability. sortedali
  maps the candidate number to [alignment, normalised probability] for
  the candidates holding the two highest distinct probabilities.

  Candidates are only scored when the target words are all different
  (and none is " "), because the distinct-word check below fails otherwise.
  """
  t_sentence = tokenization([t_sentence], op, "t")[0]
  e_sentence = tokenization([e_sentence], op, "e")[0]
  if op=="t2e":         # table is t[e_word][f_word] or t[target][source]
    source, target = t_sentence, e_sentence
  else:
    source, target = e_sentence, t_sentence

  print("source sentence:", source)
  print("target sentence:", target)

  # Same key order as the training tables: target length, then source length.
  le, lf = len(target), len(source)

  # One slot per target word plus a " " slot; each slot may take any source word.
  al = [[[s, t] for s in source] for t in [" "]+target]

  ali = {}
  for ind, a in enumerate(product(*al)): # find sentence alignments
    if len({l[1] for l in a}) != len(target)+1:
      continue
    p_a = 1
    for s_word, t_word in a:
      if t_word == " ":
        continue
      if t_word in t_tables and s_word in t_tables[t_word]:
        try:
          alp = alignments[source.index(s_word)][target.index(t_word)][le][lf]
        except LookupError:
          alp = 1e-10 # alignment not seen in the corpus
        else:
          if alp == 0:
            alp = 1e-50 # keep a zero-probability alignment visible
        p_a *= (t_tables[t_word][s_word]*alp)
      else:
        p_a *= 0
    if p_a != 0:
      ali[ind] = [a, p_a]

  p_sum = sum(entry[1] for entry in ali.values()) # sum over probabilities

  if p_sum != 0:
    for entry in ali.values():
      entry[1] /= p_sum # normalization

  values = sorted({entry[1] for entry in ali.values()}, reverse=True)

  sortedali = {}
  for val in values[:2]:
    for key in ali:
      if ali[key][1] == val:
        sortedali[key] = ali[key]

  return sortedali, p_sum
  
################################################################################


################################ prob_word #####################################
# The function takes three arguments
#   t_table as a t_table
#   word as a word in the corpus
#   n is the maximum number of most probable words to return for word
#
# The function returns a list of tuples
#   such that 
#   maxtuplist=[(ev, house, 0.5), 
#               (ev, home, 0.4), 
#               (ev, book, 0.001) ...]
################################################################################

def prob_word(t_table, word, n):
  """Return the n most probable table entries for word.

  t_table: nested dictionary t_table[t_word][s_word] -> probability.
  word: the s_word to look up.
  n: maximum number of entries to return.

  Returns a list of (word, t_word, probability) tuples sorted from the
  highest to the lowest probability. Entries with equal probability keep
  the order in which they occur in t_table, and each entry appears once.
  An n of 0 or less gives an empty list.
  """
  if n <= 0:
    return []
  matches = [(word, t_word, s_probs[word])
             for t_word, s_probs in t_table.items()
             if word in s_probs]
  # list.sort is stable, also with reverse=True, so ties keep table order.
  matches.sort(key=lambda tup: tup[2], reverse=True)
  return matches[:n]
################################################################################


############################### examples #######################################
# The function involves examples for testing
################################################################################

def examples():
  """Return sample data: en_full, tr_full, en_word, tr_word.

  Each value is a list of one-element lists holding a sentence or a word.
  """
  en_sentences = ("I am going to go to the school",
                  "I will be there, just a minute",
                  "Because, today is a good day",
                  "This book is red, that table is blue")
  tr_sentences = ("Okula gidecegim",
                  "Birkac dakika icinde orada olacagım",
                  "Cunku, bugun guzel bir gun",
                  "Bu kitap kırmızı su masa ise mavi")
  en_words = ("table", "black", "computer", "So",
              "slim", "paper", "team", "accomplishment")
  tr_words = ("masa", "siyah", "bilgisayar", "Yani",
              "ince", "kagıt", "takım", "basarı")

  en_full = [[sentence] for sentence in en_sentences]
  tr_full = [[sentence] for sentence in tr_sentences]
  en_word = [[word] for word in en_words]
  tr_word = [[word] for word in tr_words]
  return en_full, tr_full, en_word, tr_word

################################################################################


############################### return_model ###################################
# The function returns 
#   t_tables and alignments
#
# The tables are received from respective paths in the drive
################################################################################

def return_model(op):
  """Return (t_tables, alignments) as read by read_t_table and read_alignments."""
  return read_t_table(op), read_alignments(op)

################################################################################

def table_rv(t_tables):
  """Return a new table with the two key levels swapped.

  t_tables[a][b] == v becomes result[b][a] == v. The input is not changed.
  """
  flipped = {}
  for outer_key, inner in t_tables.items():
    for inner_key, prob in inner.items():
      flipped.setdefault(inner_key, {})[outer_key] = prob
  return flipped

def normalize(t_table):
  """Rescale t_table so the values of every inner key sum to 1.

  The table is reversed with table_rv, each reversed row is divided by
  its sum, and the results are written back into t_table in place.

  Returns (rv_table, t_table): the normalised reversed table and the
  updated input table.
  """
  rv_table = table_rv(t_table)
  for e_probs in rv_table.values():
    row_sum = sum(e_probs.values())
    for e_word in e_probs:
      e_probs[e_word] /= row_sum
  for e_word, f_probs in t_table.items():
    for f_word in f_probs:
      f_probs[f_word] = rv_table[f_word][e_word]
  return rv_table, t_table

def normalize_alignments(alignments):
  """Rescale alignments so that, for each (j, le, lf), the values over i sum to 1.

  alignments[i][j][le][lf] is regrouped as rv_alignments[j][le][lf][i],
  each group is divided by its sum (groups summing to 0 are left
  untouched), and the results are written back into alignments in place.

  Returns (rv_alignments, alignments).
  """
  rv_alignments = {}
  for i, by_j in alignments.items():
    for j, by_le in by_j.items():
      j_level = rv_alignments.setdefault(j, {})
      for le, by_lf in by_le.items():
        le_level = j_level.setdefault(le, {})
        for lf, val in by_lf.items():
          le_level.setdefault(lf, {})[i] = val

  for by_le in rv_alignments.values():
    for by_lf in by_le.values():
      for by_i in by_lf.values():
        group_sum = sum(by_i.values())
        if group_sum != 0:
          for i in by_i:
            by_i[i] /= group_sum

  for i, by_j in alignments.items():
    for j, by_le in by_j.items():
      for le, by_lf in by_le.items():
        for lf in by_lf:
          by_lf[lf] = rv_alignments[j][le][lf][i]
  return rv_alignments, alignments


def continue_training(op):
  """Resume EM training for direction op from the tables saved on Drive.

  op: "e2t" or "t2e". Any other value raises ValueError before the
    corpus is loaded.

  Loads 180000 sentence pairs with get_sample, rebuilds the count tables
  with get_t_table, replaces the alignment table with the one from
  read_alignments(op), takes the translation table from read_t_table(op)
  and runs expectation_maximization.

  Returns (t_tables, alignments, max_alignment, rv_table, rv_alignments).
  """
  if op not in ("e2t", "t2e"):
    raise ValueError('op must be "e2t" or "t2e", got {!r}'.format(op))

  n_sample=180000 # the sample was 180000, run until convergence
  english_train, turkish_train, english_test, \
                 turkish_test, english_word, turkish_word = get_sample(n_sample, op)

  # The first slot of get_t_table/expectation_maximization is the target
  # side: Turkish for "e2t", English for "t2e".
  if op == "e2t":
    target_train, source_train = turkish_train, english_train
    target_word, source_word = turkish_word, english_word
  else:
    target_train, source_train = english_train, turkish_train
    target_word, source_word = english_word, turkish_word

  tablelist = get_t_table(target_train, source_train) #form tables
  tablelist[3] = read_alignments(op) #read alignments having been saved
  t_tables = read_t_table(op) # read t table having been saved

  return expectation_maximization(target_word, source_word,
                                  target_train, source_train,
                                  tablelist, t_tables, op)
  
def train(n_sample, d, t, t_tables):
  """Train IBM Model 2 from freshly initialised alignment tables.

  n_sample: number of corpus sentence pairs handed to get_sample.
  d: suffix passed to expectation_maximization for the saved file names.
  t: translation direction, "t2e" or "e2t". Any other value raises
    ValueError before the corpus is loaded.
  t_tables: initial translation probabilities t_tables[target][source].

  For "e2t" the Turkish side is passed as the target (first) slot and
  the English side as the source, the same way continue_training does.
  Previously both directions passed English first.

  Returns (t_tables, alignments, max_alignment, rv_table, rv_alignments).
  """
  if t not in ("t2e", "e2t"):
    raise ValueError('t must be "t2e" or "e2t", got {!r}'.format(t))

  english_train, turkish_train, english_test, turkish_test, english_word, turkish_word = get_sample(n_sample, t)
  print("Number of English Sentence {}, Turkish Sentence {}".format(len(english_train), len(turkish_train)))
  print("Number of English Word {}, Turkish Word {}".format(len(english_word), len(turkish_word)))

  if t == "e2t":
    target_train, source_train = turkish_train, english_train
    target_word, source_word = turkish_word, english_word
  else:
    target_train, source_train = english_train, turkish_train
    target_word, source_word = english_word, turkish_word

  tablelist = get_t_table(target_train, source_train)
  return expectation_maximization(target_word, source_word,
                                  target_train, source_train,
                                  tablelist, t_tables, d)

In [0]:
# Translation direction for this run; also used when reading the saved tables.
op = "e2t"
# Number of leading corpus sentence pairs handed to get_sample.
n_sample=180000
# Mount Drive, read and tokenize the corpus, and split it into train and test.
english_train, turkish_train, english_test, \
                 turkish_test, english_word, turkish_word = get_sample(n_sample, op)

In [0]:
# For "e2t" the Turkish data is passed in the first (english_*) slot and the
# English data in the second (turkish_*) slot of the functions below.
tablelist = get_t_table(turkish_train, english_train) #form tables
tablelist[3] = read_alignments(op) #read alignments having been saved
t_tables = read_t_table(op) # read t table having been saved

# The last argument is the suffix for the saved .json file names, so this run
# writes to files ending in "e2t-with-old".
t_tables, alignments, max_alignment, rv_table, rv_alignments = expectation_maximization(turkish_word, english_word, \
                                                turkish_train, english_train, \
                                                tablelist, t_tables, op+"-with-old")