<a href="https://colab.research.google.com/github/Hbasgol/ibm_models/blob/master/phrase_based_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:

###################### modules/libraries #######################################
# import os -> to see the path of folders and libraries
# from google.colab import drive -> to connect google drive
# import codecs -> to save Turkish characters without problem
# import json -> to save dictionaries structured for representing tables as
#   .json documents, json.dump is used for writing and json.loads is used for
#   reading .json strings as dictionaries
# from itertools import product
# from itertools import permutations -> to find possible alignments between
#   English and Turkish phrases that structure corresponding sentences
# import numpy as np -> for simple mathematical operations such as taking absolute
#   of a number with np.abs or summing values in a list with np.sum
# import unicodedata -> for removing Turkish characters in the corpus
# from collections import Counter -> to count elements in a list, it used for
#   normalization
################################################################################
import os
from google.colab import drive
import codecs
import json
from itertools import permutations
from itertools import product
import numpy as np
import unicodedata
from collections import Counter
################################################################################


###################### connectcolab ############################################
# the function connectcolab is used to receive Google Drive documents and
#   determine the path the files are written to or read from.
################################################################################
def connectcolab():
  drive.mount('/content/drive', force_remount=True)
path="/content/drive/My Drive/Colab Notebooks/Machine Translation/IBM2-tables/"
################################################################################


###################### tokenization ############################################
# takes sentencelist such as [["first sentence"], ["second sentence"], ["third sentence"], ...]
# and op2, which is an operator to deteck whether the given sentence list
# is composed of Turkish sentences or English sentences
#
# -> returns [["first", "sentence"], ["second", "sentence"], ["third", "sentence"], ...]
#
# for the inconsistency of the corpus of Turkish sentences, Turkish characters 
# such as "ş, ğ, ç" etc. have been turned into s, g and c.
#
# this function is different than those of others implemented for IBM 1 and IBM 2
# because this does not involve NULL token, which is not important for
# phrase-based translation
################################################################################

def tokenization(sentencelist, op2):
  if op2 == "t":
    return [[*map(rm_turkish, i.split(" "))] for i in sentencelist]
  if op2 == "e":
    return [i.split(" ") for i in sentencelist]
    
################################################################################


#################### rm_turkish ################################################
# the function takes a word that is a string data type and change Turkish
# characters into English counterparts.
#
# takes uçuyorum -> returns ucuyorum
# şenlik -> senlik
#
# to remove the ambiguity of the corpus in terms of Turkish sentences
#
# the function is used in another function named tokenization
################################################################################

def rm_turkish(word):
  normalized = unicodedata.normalize('NFD', word)
  word = "".join([c for c in normalized if not unicodedata.combining(c)])
  return word

################################################################################


############################### get_sample #####################################
# the functiont takes one argument, n_sample, which determines the number of 
#   sentences which will be using in the function. The n_sample of sentences
#   are then separated into test and train sets with a ratio of 1/9
#
# the function returns english_train, turkish_train, english_test, turkish_test, 
#   english_train: english sentences the model to be trained as a list of list
#   turkish_train: turkish sentences the model to be trained as a list of list
#   english_test: english sentences the model to be tested as a list of list
#   english_test: english sentences the model to be tested as a list of list
#
# the difference between this function and those of others implemented for
#   IBM 1 and IBM 2 is that it does not find the unique words in the corpus.
#   Because, phrase-based translation does not need them.
#
# the function named connectcolab() used in this function to connect Google Drive
################################################################################

def get_sample(sample):
  connectcolab()
    
  english_list = []
  with open("/content/drive/My Drive/Colab Notebooks/Machine Translation/corpus/english.txt", "r") as english:
    for cnt, line in enumerate(english):
      english_list.append(line.rstrip())

  turkish_list = []
  with open("/content/drive/My Drive/Colab Notebooks/Machine Translation/corpus/turkish.txt", "r") as turkish:
    for cnt, line in enumerate(turkish):
      turkish_list.append(line.rstrip())
  
  english_list=english_list[:sample]
  turkish_list=turkish_list[:sample]
  
  rtrain = int(sample*0.9)

  english_list = tokenization(english_list, "e")
  turkish_list = tokenization(turkish_list, "t")
  english_train, turkish_train = english_list[:rtrain], turkish_list[:rtrain]
  english_test, turkish_test = english_list[rtrain:], turkish_list[rtrain:]
  del english_list
  del turkish_list
  print("sentences for training have been processed")
  return english_train, turkish_train, english_test, turkish_test

################################################################################


########################## save/read_t_table functions #########################
# save_phrases takes a number of phrase kept in a nested dictionary and
#   saves the dictionary as a .json format with the help of op argument
#   op argument determines the direction of phrases (English to Turkish)
#   or Turkish to English
#
# read_phrases takes an argument named op to receive the .json file involving
#   phrases as a nested dictionary. The function returns phrases as a nested
#   dictionary format
#
# save_language_model takes language_model and two operators named op and op1
#   language_model is a dictionary involving n-grams and their probabilities
#   op is given to determine the language, Turkish or English
#   op1 is given to determine the n-gram, whether bi-gram or three-gram
#   the function saves language model with a .json file format
#
# read_language_model takes two operators: op and op1. The first one is given
#   to determine the language and the second one is for determining the
#   type of n-gram, whether it is three-gram or bi-gram, or other types.
#   the function returns the corresponding language model as nested dictionary
#   format
#
# read_viterbi_alignments takes one argument: op, which determines the direction
#   of alignment, whether English to Turkish or Turkish to English and returns
#   viterbi alignments as a nested dictionary
#   Since .json data format does not support integers to be dictionary key,
#   after reading the nested dictionary, all keys that have been saved as a
#   string by the json module are turned into corresponding integers
#     example: {1: {0: 1, 1: 2, 2, 4}, 2: {0:1, 2:2, 4:6 ...} ...}
#     in which the first key is sentence index and inner key represents
#     e ---> f or f ---> e, {0: 1, 1: 2}
################################################################################

def save_phrases(phrases, op):
  with codecs.open(path+"phrases-"+op+".json", 'w', 'utf-8') as f:
    json.dump(obj=phrases, fp=f, indent=2, ensure_ascii=False)
    
def read_phrases(op):
  with codecs.open(path+"phrases-"+op+".json") as f:
    return json.loads(f.read())
    
def save_language_model(language_model, op, op1):
  with codecs.open(path+"language_model-"+op+"-"+op1+".json", 'w', 'utf-8') as f:
    json.dump(obj=language_model, fp=f, indent=2, ensure_ascii=False)
    
def read_language_model(op, op1):
  with codecs.open(path+"language_model-"+op+"-"+op1+".json") as f:
    return json.loads(f.read())
  
def read_viterbi_alignments(op):
  with codecs.open(path+"IBM2-viterbi"+op+".json") as f:
    alignments = json.loads(f.read())
    alignments_int = {int(i): {int(j): alignments[i][j] \
                               for j in alignments[i]}  \
                               for i in alignments}
  del alignments
  return alignments_int

################################################################################


############################## v_to_tuple ######################################
# The function takes a viterbi alignment (alignments_int) that can be in both
#   direction and returns a dictionary involving alignments as tuples such that
#   {1: {(0, 1), (1, 2), (2, 4), ...}, 2: {...}...}
#   {j: {(e1, f1), (e2, f2), (e3, f3), (e4, f5)}}, or vice versa
#   Since intersection and union operations are required to find possible phrases
#   the data structure of the viterbi alignments have been changed.
################################################################################

def v_to_tuple(alignments_int):
  vt2e_ps={}
  for key, val in alignments_int.items():
    vt2e_ps[key]=[]
    for key_a, val_a in val.items():
      vt2e_ps[key].append((key_a, val_a))
  return vt2e_ps

################################################################################


############################ alignment functions ###############################
# The functions are written according to pseudocodes that were given in
#   Statistical Machine Translation, written by Philipp Koehn, which is used
#   textbook of the course
#
# grow_diag_final function consists of two functions that are grow_diag and final.
#   The function takes viterbi alignments as vt2e and ve2t and aligned sentences
#   in the corpus as e_sentence as English sentence and f_sentence as Turkish
#   sentence, or vice versa, for bi-direction translation
#   The function returns alignment that will be used in phrase extraction and
#   intersect. 
#   alignment involves a set of tuples referring to alignments between sentences
#   such that alignment = set((1, 2), (2, 2), (3, 1), (1, 4))
#
# grow diag function takes neighboring points as tuples,
#   alignment which is the intersect of viterbi alignments,
#   viterbi alignments as vt2e and ve2t,
#   and two aligned sentences as e_sentence for English and f_sentence for Turkish
#   The function searches for neighboring alignments and add the alignment if it
#   is eligible
#
# final function takes viterbi alignments as vt2e and ve2t
#   alignment points taken from grow_diag as alignment
#   two aligned sentences as e_sentence and f_sentence
#   and returns possible alignments by adding new points
################################################################################

def grow_diag_final(vt2e, ve2t, e_sentence, f_sentence):
  neighboring = {(-1, 0), (0, -1), (1, 0), (0, 1),
                   (-1, -1), (-1, 1), (1, -1), (1, 1)} # neighbors, move or swap
  vt2e, ve2t = set([(x+1, y) for x, y in ve2t if y!=0]), set([(y, x+1) for x, y in vt2e if y != 0]) # removing impact of null tokens
  alignment = ve2t.intersection(vt2e)
  intersect = ve2t.intersection(vt2e)
  alignment = grow_diag(neighboring, alignment, vt2e, ve2t, e_sentence, f_sentence)
  alignment = final(vt2e, ve2t, alignment, e_sentence, f_sentence) #--> there might be a problem here, because gets all sentence
  return alignment, intersect
  
def grow_diag(neighboring, alignment, vt2e, ve2t, e_sentence, f_sentence):
  len_a=len(alignment)
  while True:
    for e_word in range(1, len(e_sentence)+1):
      for f_word in range(1, len(f_sentence)+1):
        if (e_word, f_word) in alignment:
          for e_new, f_new in [(e_word+x, f_word+y) for x, y in neighboring]:
            # conditions for adding alignment point
            if (not e_new in [i[0] for i in alignment] or
              not f_new in [i[1] for i in alignment]) and \
              ((e_new, f_new) in ve2t.union(vt2e)):
                alignment.add((e_new, f_new))
    if len_a==len(alignment): 
      return alignment # stop condition
    else:
      len_a=len(alignment)
      
def final(vt2e, ve2t, alignment, e_sentence, f_sentence):
  for e_word in range(len(e_sentence)):
    for f_word in range(len(f_sentence)):
      if (not e_word in [i[0] for i in alignment] or \
          not f_word in [i[1] for i in alignment]) and \
          (e_word, f_word) in ve2t.union(vt2e):
          alignment.add((e_word, f_word))
  return alignment

################################################################################


####################### phrase extraction functions ############################
# The functions are written for extracting phrases with the help of alignments
#   found. 
# phrase_extraction takes f_sentence, e_sentence and alignment and returns
#   possible phrases in a list named BP.
# extract is an helper function in phrase_extraction that takes specific starting
#   and ending points in f_sentence and e_sentence. If the phrase to be extracted
#   is eligible, then the function returns phrases
################################################################################

def phrase_extraction(e_sentence, f_sentence, alignment):
  BP = []
  for e_start in range(1, len(e_sentence)+1): 
    for e_end in range(e_start, len(e_sentence)+1):
      f_start, f_end = (len(f_sentence), 0)
      for (e, f) in alignment:
        if e_start <= e <= e_end:
          f_start = min(f, f_start)
          f_end = max(f, f_end)
      extracted_phrases = extract(f_start, f_end, e_start, e_end, \
                                  f_sentence, e_sentence, alignment)
      for phrase in extracted_phrases:
        BP.append(phrase)
  return BP

def extract(f_start, f_end, e_start, e_end, f_sentence, e_sentence, alignment):
  if f_end == 0:
      return []
  for (e, f) in alignment:
      if (e < e_start or e > e_end) and (f_start <= f <= f_end):
          return []
  E = []
  f_s = f_start
  while True:
      f_e = f_end
      while True:
          e_phrase = (e_start, e_end)
          f_phrase = (f_s, f_e)
          E.append((e_phrase, f_phrase))
          f_e += 1
          if f_e in list(zip(*alignment))[1] or f_e > len(f_sentence):
              break
      f_s -= 1
      if f_s in list(zip(*alignment))[1] or f_s < 1:
          break
  return E
  
################################################################################


####################### phrase_alignment #######################################
# The function takes five arguments:
#   english_train involves English sentences as a list
#   turkish_train involves Turkish sentences as a list
#   vt2e and ve2t are viterbi alignments with two directions
#   limit is an integer that determines the length of phrases to be extracted
#   op is for determining the direction of phrase alignments for saving
# The function returns a nested dictionary named ph_dict involving phrase translation
#   probabilities that have been normalized.
################################################################################

def phrase_alignment(english_train, turkish_train, vt2e, ve2t, limit, op):
  ph_dict={}
  for index_e, e_sentence in enumerate(english_train):
    f_sentence=turkish_train[index_e]
    t2e = vt2e[index_e]
    e2t = ve2t[index_e]
    alignment, intersect = grow_diag_final(t2e, e2t, e_sentence, f_sentence)
    for f_phrase, e_phrase in phrase_extraction(e_sentence, f_sentence, alignment):
      e_start, e_end = f_phrase
      f_start, f_end = e_phrase
      if (e_end-e_start)<limit and (f_end-f_start)<limit:
        if e_start == e_end:
          ep = str(e_sentence[e_start-1])
        else:
          ep = " ".join(e_sentence[e_start-1:e_end])
          ep2 = " ".join(e_sentence[e_start-1:e_end+1])
        if f_start == f_end:
          fp = str(f_sentence[f_start-1])
        else:
          fp = " ".join(f_sentence[f_start-1:f_end])
          fp2 = " ".join(f_sentence[f_start-1:f_end+1])
          
        if fp not in ph_dict:
          ph_dict[fp]={}
        if ep not in ph_dict[fp]:
          ph_dict[fp][ep]=0
        ph_dict[fp][ep]+=1 #to easily count how many phrases that english phrase belongs to
        
        if fp2 not in ph_dict:
          ph_dict[fp2]={}
        if ep2 not in ph_dict[fp2]:
          ph_dict[fp2][ep2]=0
        ph_dict[fp2][ep2]+=1 #to easily count how many phrases that english phrase belongs to
        
  for key, val in ph_dict.items():
    n=sum(list(val.values()))
    for ep in val:
      val[ep]/=n
  save_phrases(ph_dict, op)
  print("number of phrases: {}".format(len(ph_dict.keys())))
  print("phrase probabilities are found and saved!")
  return ph_dict

################################################################################


############################## db_reordering ###################################
# The function takes two arguments: x and a
#   x is for distortion
#   a is for ratio that the distortion to be punished
# The function returns a float number that the probability of translation
#   will be multiplied with
################################################################################

def db_reordering(x, a): ## d function for re-ordering model
  return a**np.abs(x)

################################################################################

def distance(f_sentence, e_sentence, fp, ep, corresp):
  pi = corresp.index((fp, ep))
  if pi == 0:
    return 0
  else:
    epi = corresp[pi]
    eindex = e_sentence.index(epi[1])
    ephrase = e_sentence[eindex-1]
    t_index=0
    for index, (fp, ep) in enumerate(corresp):
      if ephrase == ep:
        t_index=index
    if pi < t_index+1:
        corresps = corresp[pi+1:t_index]
    else:
        corresps = corresp[t_index+1:pi]
    ys = [x for x, y in corresps]
    return len(ys)
  
############################## language_model ##################################
# The function takes four arguments: n, english, op and op1
#   n is for determining the number of word in n-gram
#   english is a list of sentence
#   op and op1 is for writing en_lm which is a nested dictionary that involves
#   n-gram probabilities
#     op -> the language, English or Turkish
#     op1 -> name of n-gram such as two-gram or three-gram
# The function returns a nested dictionary that involves n-gram probabilities
#   such that
#   {...'"So': {'how': {'have': {'you': 0.12312}}, 
#   'will': {'you': {'not': 0.435345}}},...} for 4-gram language model
#   -> So how have you
#   -> So will you not
#
# The function uses an helper function named normalization, which normalizes
#   counts in the corpus
#
# Another helper function used in this function named nested_dict
#   which gets an n_gram as a list such that ["So", "how", "have", "you"]
#   and add elements of the list to the dictionary as keys
################################################################################

def language_model(n, english, op, op1): # n-gram model for language model in STM
  st_symbol="<s>" # starting symbol
  en_lm={} # dictionary for counting
  for index_e, e_sentence in enumerate(english):
    e_sentence_n=e_sentence
    for s in range(n-1):
        e_sentence_n = [st_symbol]+e_sentence_n
    for index_w, e_word in enumerate(e_sentence_n):
      if e_word != st_symbol:
        n_gram=e_sentence_n[index_w-n+1:index_w+1]
        en_lm = nested_dict(n_gram, en_lm, en_lm) # nested dictionary creating with a recursive function
        # such that it will be en_lm[w1][w2][w3] = p(w3|w1, w2) for 3-gram 
  en_lm=normalization(en_lm, n) # normalization
  save_language_model(en_lm, op, op1)
  return en_lm

################################################################################


############################### nested_dict ####################################
# The function is a recursive function that takes three arguments: 
#   key, dic and dic_all 
#   key is an n-gram as ["So", "how", "have", "you"], which is 4-gram
#   dic is the dictionary that n-grams will be added
#   dic_all is the same dictionary as dic to return the dic as a whole.
# The function returns a changed dictionary
################################################################################

def nested_dict(key, dic, dic_all):
  if len(key)==1:
    if key[0] in dic:
      dic[key[0]]+=1
    else:
      dic[key[0]]=1
  if len(key)==0:
    return dic_all
  else:
    if key[0] not in dic:
      dic[key[0]]={}
    return nested_dict(key[1:], dic[key[0]], dic_all)
  
################################################################################


############################### normalization ##################################
# The function is a recursive function that takes four arguments
#   dic is the dictionary involving n-gram counts
#   n is the number of words in n-gram
#   dic_all is a default argument that is used to copy dic
#   c is a default argument that is used to count how many key is passed
#
# The function returns n-gram probabilities as a nested dictionary, which means
#   that counts are turned into probabilities
################################################################################

def normalization(dic, n, dic_all=[], c=0):
  if c == 0:
    dic_all=dic
  if (n-1) == c:
    nm=np.sum(list(dic.values()))
    for k, v in dic.items():
      dic[k]/=nm
  for key, val in dic.items():
    if (n-1) != c:
      normalization(dic[key], n, c=c+1)
  return dic_all

################################################################################


############################### reach_n_gram ###################################
# The function takes two arguments: key and dic
#   key is a list involving n-grams such that [<s>, <s>, "Who"] as 3-gram
#   dic is the dictionary involving n-grams
# The fuction returns the probability of n-gram
################################################################################

def reach_n_gram(key, dic):
  if type(dic)!=dict:    # if the function reaches the probability (float)
    return dic
  if not key[0] in dic:   # if the word could not found in N-gram, return 0 
    return 0
  return 0+reach_n_gram(key[1:], dic[key[0]])

################################################################################
  
  
############################### language_model_prob ############################
# The function takes three arguments: e_sentence, n and lm_en
#   e_sentence is an English sentence
#   n is the number of word in n-gram
#   lm_en is a nested dictionary involving n-grams and correspoding probabilities
#
# The function returns probability of language model for e_sentence
#
# The function firstly splits e_sentence into words and finds possible n-grams
#   and probabilities of these n-grams are multiplied each other to find
#   the probability of language model for e_sentence
#
# An helper function named reach_n_gram is used in this function, which returns
#   probability of an n-gram by searching a nested dictionary, which is lm_en
#   in this case
################################################################################

def language_model_prob(e_sentence, n, lm_en, info=False):
  
  # since the english sentence will be given manually,
  # this does not affect argmax
  # however, it is a valuable component of the standard model
  # it can be used to assess the fluency of english sentences
  
  if type(e_sentence) == str:
    e_sentence = [rm_turkish(e_sentence)]
 
  st_symbol="<s>"
  ew_sentence=sum([w.split(" ") for w in e_sentence], [])
  e_sentence_n=ew_sentence
  for s in range(n-1):
    e_sentence_n = [st_symbol]+e_sentence_n
  lm_p=1
  for index_w, e_word in enumerate(e_sentence_n):
    if e_word != st_symbol:
      n_gram=e_sentence_n[index_w-n+1:index_w+1]
      if info:
        print(n_gram, reach_n_gram(n_gram, lm_en))
      lm_p*=reach_n_gram(n_gram, lm_en) 
  return lm_p

################################################################################


############################### translation ####################################
# The function implements the standard model of phrase-based translation
# The function takes eight arguments:
#   f_sentence and e_sentence are aligned sentences in the corpus
#   they are structed as a combination of phrases in a list of list
#   ph_dict is a nested dictionary involving phrase-translation probabilities
#   n is the number of word in the n-gram, language model
#   gram_en is a nested dictionary involving n-gram probabilities
#   a is for distortion-reordering
#
# The function firsty finds language model probability for e_sentence
#   if the language model probability is 0, then it returns a blank dictionary
#   and the probability of f sentence to be translated as e_sentence as 0
# If the function finds a value for probability of language model, it generates
#   possible alignments between phrases.
# It runs over possible alignments and for each alignment it creates a probability.
#   by multiplying it with probability of language model.
# After then, it returns all alignments (correspondances) with respective values
#   referring to their probabilities and probability of f_sentence to be
#   translated into e_sentence
################################################################################

def translation(f_sentence, e_sentence, ph_dict, n, gram_en, a):
  # f_sentence and e_sentence are structured as phrases in list of list
  # all possible correspondences are considered
  corresp_dic={}
  f_sentence = [rm_turkish(i) for i in f_sentence]
 
  lm_p = language_model_prob(e_sentence, n, gram_en, info=True) # language model
  if lm_p <= 0: # if probability of language model is 0, then all is 0
    print("language model probability for {} is 0".format(e_sentence))
    return {}, 0
  for perm in permutations(e_sentence):
    corresp = list(zip(f_sentence, perm))
    p=1
    for fp, ep in corresp:
      x = distance(f_sentence, e_sentence, fp, ep, corresp)
      if not fp in ph_dict: # to check whether the phrase is in the phrase translation table
        pfe=0 # if not, makes it 0
      else:
        if not ep in ph_dict[fp]:
          pfe=0 
        else:
          pfe=ph_dict[fp][ep]
      p*=(pfe*db_reordering(int(x), a)) # probability of correspondence without language model
    p*=lm_p # add language model
    corresp_dic[str(corresp)]=p # probability of correspondance
  sump=np.sum(list(corresp_dic.values())) # find sum of all
  for key in corresp_dic:
    if sump!= 0:
      corresp_dic[key]/=sump # normalize
      
  non_zero = {}
  for key, val in corresp_dic.items():
    if val != 0:
      non_zero[key]=val
  
  if len(non_zero) == 0:
    print("! all alignments are zero")
    
  return non_zero, sump

################################################################################


############################### examples #######################################
# The function involves examples for testing
################################################################################

def examples():
  en_full = [["I am going to go to the school"],
             ["I will be there, just a minute"], 
             ["Because, today is a good day"],   
            ["This book is red, that table is blue"]]
  tr_full = [["Okula gideceğim"], 
            ["Birkaç dakika içinde orada olacağım"], 
            ["Çünkü, bugün güzel bir gün"], 
            ["Bu kitap kırmızı şu masa ise mavi"]]
  
  tr_full = tokenization(sum(tr_full, []), "t")
  
  en_phrase = [["I", "am going to", "go to the", "school"],
              ["I am", "going to go to", "the school"],
              ["I", "will be there,", "just a minute"],
              ["I will be there", "just a minute"],
              ["Because,", "today is", "a good day"],
              ["Because, today is", "a good day"],
              ["This book", "is red,", "that table", "is blue"],
              ["This", "book", "is red", "that", "table", "is blue"]]
  
  tr_phrase = [["Okula", "gidecegim"], ["Okula gidecegim"],
              ["Birkac dakika icinde", "orada olacagım"], 
              ["Birkac dakika", "icinde", "orada olacagım"],
              ["Cunku,", "bugun", "guzel bir gun"],
              ["Cunku, bugun", "guzel bir gun"],
              ["Bu kitap", "kırmızı", "su masa ise", "mavi"],
              ["Bu kitap kırmızı", "su masa ise mavi"]]
  
  en_phrases_exp = ["I am going to", "of course", "because of", "This book", "it have been otherwise"]
  tr_phrases_exp = ["Cunku", "Elli yaslarında", "bilinmiyor", "sahtekarlık etmem.", "bunu size emrediyorum."]
  
  en_gram_test=[["<s>", "<s>", "Because"],
              ["<s>", "A", "great"],
              ["as", "it", "is"],
              ["going", "to", "go"],
              ["Who", "are", "you"]]
  
  tr_gram_test=[["<s>", "<s>", "Ben"],
               ["<s>", "Bu", "kitap"],
               ["Bu", "bir", "masa"],
               ["hayli", "zaman", "oldu"],
                ["o", "kim", "idi"]]
  
  return en_full, tr_full, en_phrase, tr_phrase, en_phrases_exp, tr_phrases_exp, en_gram_test, tr_gram_test

################################################################################


############################### return_model ###################################
# The function returns 
#   viterbi alignments as vt2e and ve2t
#   language model for English n-grams as lm_en
#                  for Turkish n-grams as lm_tr
#   and phrase translation probabilities as ph_dict
#
# The tables are received from respective paths in the drive
################################################################################

def return_model():
  # data type of viterbi alignments are changed with v_to_tuple
  # normally, they are structured as nested dictionaries
  # however, to find intersection and union, they should be changed to be
  # (e1, f1), (e2, f2) etc.
  vt2e = v_to_tuple(read_viterbi_alignments("t2e"))
  ve2t = v_to_tuple(read_viterbi_alignments("e2t")) 
  lm_en = read_language_model("english", "two-gram")
  lm_tr = read_language_model("turkish", "two-gram")
  ph_dict = read_phrases("t2e")
  return vt2e, ve2t, lm_en, lm_tr, ph_dict

################################################################################

def train(n_sample, n_gram, phrase_limit, vt2e, ve2t, key):
  n=n_gram #n-gram
  english_train, turkish_train, english_test, turkish_test = get_sample(n_sample) # get sample for phrase probabilities
 
  vt2e = v_to_tuple(vt2e) # get viterbi alignments from drive
  ve2t = v_to_tuple(ve2t) # get viterbi alignments from drive
  
  # finding phrase alignments and saving them as a .json document 
  # limit: the limit length of phrases to be extracted
  ph_dict = phrase_alignment(english_train, turkish_train, vt2e, ve2t, limit=phrase_limit, op="t2e"+str(key))
  
  # finding n-gram probabilities
  en_lm = language_model(n, english_train, "english", str(n_gram)+"-gram"+str(key))
  tr_lm = language_model(n, turkish_train, "turkish", str(n_gram)+"-gram"+str(key))
  
  return vt2e, ve2t, en_lm, tr_lm, ph_dict

In [0]:
vt2e = read_viterbi_alignments("t2e")

In [0]:
ve2t = read_viterbi_alignments("e2t")

In [0]:
vt2e, ve2t, en_lm, tr_lm, ph_dict = train(180000, 2, 20, vt2e, ve2t, "")

Mounted at /content/drive
sentences for training have been processed
number of phrases: 610256
phrase probabilities are found and saved!
