In [126]:
from io import open
import unicodedata
import string
import re
import random
import numpy as np
from tqdm import tqdm_notebook as tqdm
from distance import levenshtein
import os
import math
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils import data
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import import_ipynb
import G2P_LSTM as g2p

data_file = 'TTScorpora.tsv'

In [127]:
cfg = g2p.cfg
def dict_sorting (dict_file_name):
    dict_file = open(dict_file_name, 'r')
    lines_dict = dict_file.readlines()
    dict_file.close()

    graphemes = []
    phonemes = []

    for i in range(0, len(lines_dict)):
        lines_dict[i] = lines_dict[i].split()
        graphemes.append([*lines_dict[i][0]])
        phonemes.append(lines_dict[i][1:])
    phonemes = [a for b in phonemes for a in b]
    graphemes = [a for b in graphemes for a in b]
    graphemes = sorted(set(graphemes))
    phonemes = sorted(set(phonemes))
    return graphemes, phonemes

dict_file_name = cfg.dict_name
g_seq, p_seq = dict_sorting(dict_file_name)
cfg.graphemes = ["<pad>", "<unk>", "</s>"] + g_seq
cfg.phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + p_seq

#Index to grapheme and phones for vectors
cfg.graph2index = {g: idx for idx, g in enumerate(cfg.graphemes)}
cfg.index2graph = {idx: g for idx, g in enumerate(cfg.graphemes)}

cfg.phone2index = {p: idx for idx, p in enumerate(cfg.phonemes)}
cfg.index2phone = {idx: p for idx, p in enumerate(cfg.phonemes)}

# print(cfg.graphemes)
# print(cfg.phonemes)
cfg.g_vocab_size = len(cfg.graphemes)
cfg.p_vocab_size = len(cfg.phonemes)
# print(cfg.g_vocab_size, cfg.p_vocab_size)
# print(cfg.phone2index)
# print(cfg.index2phone)

In [128]:

encoder = g2p.Encoder(cfg.embed_dim, cfg.hidden_dim, cfg.g_vocab_size, cfg.n_layers, cfg.dropout)
decoder = g2p.Decoder(cfg.embed_dim, cfg.hidden_dim, cfg.g_vocab_size, cfg.n_layers, cfg.dropout)

model = g2p.G2PModel(encoder, decoder, cfg.device)
print(torch.cuda.is_available())
#model.to(device=cfg.device)

True


In [129]:
def split_nums_letters(seq):

    def convert_num_word(num):
        nums = ['0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','30','40','50','60','70','80','90','100', '1000']
        num_words = ['nul', 'een','twee', 'drie','vier','vyf', 'ses','sewe', 'agt', 'nege','tien','elf','twaalf','dertien','veertien','vyftien','sestien','sewentien', 'agtien', 'negentien', 
            'twintig','dertig', 'veertig', 'vyftig', 'sestig', 'sewentig', 'tagtig', 'negentig', 'honderd', 'duisend']
        num_tens = ['10','11','12','13','14','15','16','17','18','19','20','30','40','50','60','70','80','90']
        num_words_tens = ['tien','elf','twaalf','dertien','veertien','vyftien','sestien','sewentien', 'agtien', 'negentien', 
            'twintig','dertig', 'veertig', 'vyftig', 'sestig', 'sewentig', 'tagtig', 'negentig']
        a = int(num)
        ans = []
        i=0
        #print(a)
        num_word = ""
        tu_both = 0
        t_added = 0
        h_added = 0
        tt_th_both = 0
        tt_added = 0
        
        u =  str(int(a%10))
        t =  str(int((a%100)/10))
        h =  str(int((a%1000)/100))
        tu = str(a%100)
        th = str(int(a/1000)-int(a/10000)*10)
        tt = str(int(a/10000))
        tt_th = str(int(a/1000))

        if tt_th:
            for k in num_tens:
                if tt_th == k and k!="0":
                    index = num_tens.index(k)
                    num_word = num_word + num_words_tens[index]
                    t_added = 1
                    tt_th_both = 1
                    break
        if tt:
            for k in nums:
                if tt == k and k!="0" and tt_th_both == 0:
                    index = nums.index(k)
                    num_word = num_word + num_words[index]
                    tt_added = 1
                    break
        if th:
            for k in nums:
                if th == k and k!="0" and tt_th_both == 0:
                    index = nums.index(k)
                    if tt_added: num_word = num_word +" en "+  num_words[index]
                    else: num_word = num_word +  num_words[index]
                    t_added =1
                    break
        
        if t_added: num_word += " duisend"

        if h:
            for k in nums:
                if h == k and k!="0":
                    index = nums.index(k)
                    if t_added: num_word = num_word + " en "+ num_words[index] + ' honderd'
                    else: num_word = num_word + num_words[index] + ' honderd'
                    
                    h_added = 1
                    break
        if tu:
            for k in nums:
                if tu == k and k!="0":
                    index = nums.index(k)
                    if h_added: num_word = num_word +" en "+ num_words[index]
                    else: num_word = num_word + num_words[index]
                    tu_both = 1
                    break
        if t:
            for k in nums:
                if t == k and tu_both==0 and k!="0":
                    index = nums.index(k)
                    if h_added: num_word = num_word +" en "+ num_words[index]
                    else: num_word = num_word + num_words[index]
                    break
        if u:
            for k in nums:
                if u == k and tu_both==0 and k!="0":
                    index = nums.index(k)
                    if h_added: num_word = num_word +" en "+ num_words[index]
                    else: num_word = num_word + num_words[index]
                    break
            
        return num_word


    c = re.findall(r'[A-Za-z]+|\d+', seq) #Source: https://stackoverflow.com/questions/28290492/python-splitting-numbers-and-letters-into-sub-strings-with-regular-expression
    ans = []
    for j,i in enumerate(c):
        if i.isdigit():
            ans.append(convert_num_word(i))
        else: ans.append(i)

    return ans

stra="co.32despee208dy"
print(split_nums_letters(stra))
print(split_nums_letters("80"))
print(split_nums_letters("80000"))
print(split_nums_letters('99999'))

a = 28128
print(int((a/1000)))
print(int(a/10000))

['co', 'drietwee', 'despee', 'twee honderd en agt', 'dy']
['tagtig']
['tagtig duisend']
['nege en nege duisend en nege honderd en nege en nege']
28
2


In [133]:
def preprocess_data(data_file_name, nonums_data_file_name):

    nums = ['0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','30','40','50','60','70','80','90','100', '1000']
    num_words = ['nul', 'een','twee', 'drie','vier','vyf', 'ses','sewe', 'agt', 'nege','tien','elf','twaalf','dertien','veertien','vyftien','sestien','sewentien', 'agtien', 'negentien', 
            'twintig','dertig', 'veertig', 'vyftig', 'sestig', 'sewentig', 'tagtig', 'negentig', 'honderd', 'duisend']

    data_file = open(data_file_name, 'r')
    lines_data = data_file.readlines()
    data_file.close()


    #Converting to lowercase
    for i in range(0, len(lines_data)):
        lines_data[i] = lines_data[i].split()
        #l
        
        for j in range(0, len(lines_data[i])):
            lines_data[i][j] = lines_data[i][j].lower()

                 

    with open(nonums_data_file_name, 'w') as file:
        for i in range(0, len(lines_data)):
            new_line = " ".join([word for word in lines_data[i][1:]])
            file.write('%s \n' %new_line)
    file.close()

def see_all_unique_words(data_file_name, unique_words_file):

    nums = ['0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','30','40','50','60','70','80','90','100']
    num_words = ['nul', 'een','twee', 'drie','vier','vyf', 'ses','sewe', 'agt', 'nege','tien','elf','twaalf','dertien','veertien','vyftien','sestien','sewentien', 'agtien', 'negentien', 
            'twintig','dertig', 'veertig', 'vyftig', 'sestig', 'sewentig', 'tagtig', 'negentig', 'een honderd']

    data_file = open(data_file_name, 'r')
    lines_data = data_file.readlines()
    data_file.close()


    #Converting to lowercase
    for i in range(0, len(lines_data)):
        lines_data[i] = split_nums_letters(lines_data[i])
        for j in range(0, len(lines_data[i])):
            lines_data[i][j] = lines_data[i][j].replace("-", " ")
            lines_data[i][j] = lines_data[i][j].replace("+", " ")
            lines_data[i][j] = lines_data[i][j].replace("%", "persent")
            lines_data[i][j] = lines_data[i][j].replace('\"', "")
            lines_data[i][j] = lines_data[i][j].replace('\'', "")
            lines_data[i][j] = lines_data[i][j].replace("!", "")
            lines_data[i][j] = lines_data[i][j].replace("?", "")
            lines_data[i][j] = lines_data[i][j].replace(".", "")
            lines_data[i][j] = lines_data[i][j].replace(",", "")
            if j > 0: lines_data[i][j] = lines_data[i][j].replace("_", "")
            for k in nums:
                if lines_data[i][j] == k:
                    index = nums.index(k)
                    lines_data[i][j] = num_words[index]
                    break

    lines_data_flattened = [i for j in lines_data for i in j]
    # print(len(lines_data_flattened))               

    unique = set(lines_data_flattened)
    sorted_unique = sorted(unique)
    # print(len(sorted_unique))
    # print(sorted_unique) 

    with open(unique_words_file, 'w') as file:
        for i in range(0, len(sorted_unique)):
            new_line = sorted_unique[i]
            file.write('%s\n' %new_line)
    file.close() 

data_file_name = "TTScorpora.tsv"
data_file_without_index = "no_index_file.txt"
data_unique_words = "unique_words.txt"
preprocess_data(data_file_name, data_file_without_index)
see_all_unique_words(data_file_without_index, data_unique_words)


In [136]:
def pop_invalid_entries (data_file_name, dict_file_name, processed_data_file_name):
    data_file = open(data_file_name, 'r')
    dict_file = open(dict_file_name, 'r')
    lines_data = data_file.readlines()
    lines_dict = dict_file.readlines()
    dict_file.close()
    data_file.close()

    dictionary = []
    for i in range(0, len(lines_dict)):
        lines_dict[i] = lines_dict[i].split()
        dictionary.append(lines_dict[i][0])

    words_to_add = []

    for i in lines_data:
        word_in_dict = 0
        i = i.replace("\n","")
        for j in dictionary:
            if i == j:
                word_in_dict = 1
                break
        if word_in_dict == 0:
            words_to_add.append(i)

    print(f"words_to_add len:{len(words_to_add)}")
    print(f"words_to_add:{(words_to_add)}")
    with open(processed_data_file_name, 'w') as file:
        for i in range(0, len(words_to_add)):
            new_line = words_to_add[i]
            file.write('%s\n' %new_line)
    file.close() 

data_file_name = "TTScorpora.tsv"
dict_file_name = "rcrl_apd.1.4.1.txt"
data_file_without_index = "no_index_file.txt"
data_unique_words = "unique_words.txt"
data_words_to_G2P = "words_to_G2P.txt"

pop_invalid_entries (data_unique_words, dict_file_name, data_words_to_G2P)

words_to_add len:3572
words_to_add:['', 'aangel', 'aarbeie', 'aasvo', 'abandonment', 'abbotsdale', 'aberfeldy', 'able', 'abnormal', 'about', 'abrahams', 'abstractions', 'absurdity', 'abundance', 'accept', 'accidental', 'accordance', 'accounting', 'aces', 'acquaintance', 'acquiescence', 'across', 'act', 'acts', 'actual', 'adamstein', 'added', 'adjectives', 'adonis', 'adriaan', 'adrift', 'advantage', 'advantages', 'adventure', 'adverbs', 'affected', 'afgel', 'afgesonderde', 'afghanistan', 'afl', 'afpeul', 'afrika', 'afrikaans', 'afskryfwaarde', 'afslagitem', 'after', 'afternoon', 'again', 'against', 'aged', 'agents', 'aggeneys', 'aggie', 'agnes', 'ago', 'agony', 'agt honderd en agt en drie', 'agt honderd en dertig', 'agt honderd en drie en nege', 'agt honderd en ses en agt', 'agt honderd en ses en vyf', 'agt honderd en sewe en drie', 'agt honderd en sewe en sewe', 'agt honderd en sewe en vyf', 'agt honderd en sewentien', 'agt honderd en twee en agt', 'agt honderd en twee en drie', 'agt h

In [None]:
from curses.ascii import isdigit


a = ['12-8', '120', '121']
for i in a:
    if i.isdigit(): print(True)

True
True
