In [122]:
from io import open
import unicodedata
import string
import re
import random
import numpy as np
from tqdm import tqdm_notebook as tqdm
from distance import levenshtein
import os
import math
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils import data
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import import_ipynb
import G2P_LSTM as g2p

data_file = 'TTScorpora.tsv'

In [123]:
cfg = g2p.cfg
def dict_sorting (dict_file_name):
    dict_file = open(dict_file_name, 'r')
    lines_dict = dict_file.readlines()
    dict_file.close()

    graphemes = []
    phonemes = []

    for i in range(0, len(lines_dict)):
        lines_dict[i] = lines_dict[i].split()
        graphemes.append([*lines_dict[i][0]])
        phonemes.append(lines_dict[i][1:])
    phonemes = [a for b in phonemes for a in b]
    graphemes = [a for b in graphemes for a in b]
    graphemes = sorted(set(graphemes))
    phonemes = sorted(set(phonemes))
    return graphemes, phonemes

dict_file_name = cfg.dict_name
g_seq, p_seq = dict_sorting(dict_file_name)
cfg.graphemes = ["<pad>", "<unk>", "</s>"] + g_seq
cfg.phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + p_seq

#Index to grapheme and phones for vectors
cfg.graph2index = {g: idx for idx, g in enumerate(cfg.graphemes)}
cfg.index2graph = {idx: g for idx, g in enumerate(cfg.graphemes)}

cfg.phone2index = {p: idx for idx, p in enumerate(cfg.phonemes)}
cfg.index2phone = {idx: p for idx, p in enumerate(cfg.phonemes)}

# print(cfg.graphemes)
# print(cfg.phonemes)
cfg.g_vocab_size = len(cfg.graphemes)
cfg.p_vocab_size = len(cfg.phonemes)
# print(cfg.g_vocab_size, cfg.p_vocab_size)
# print(cfg.phone2index)
# print(cfg.index2phone)

In [124]:

encoder = g2p.Encoder(cfg.embed_dim, cfg.hidden_dim, cfg.g_vocab_size, cfg.n_layers, cfg.dropout)
decoder = g2p.Decoder(cfg.embed_dim, cfg.hidden_dim, cfg.g_vocab_size, cfg.n_layers, cfg.dropout)

model = g2p.G2PModel(encoder, decoder, cfg.device)
print(torch.cuda.is_available())
#model.to(device=cfg.device)

True


In [195]:
def split_nums_letters(seq):

    def convert_num_word(num):
        nums = ['0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','30','40','50','60','70','80','90','100', '1000']
        num_words = ['nul', 'een','twee', 'drie','vier','vyf', 'ses','sewe', 'agt', 'nege','tien','elf','twaalf','dertien','veertien','vyftien','sestien','sewentien', 'agtien', 'negentien', 
            'twintig','dertig', 'veertig', 'vyftig', 'sestig', 'sewentig', 'tagtig', 'negentig', 'honderd', 'duisend']
        a = int(num)
        ans = []
        i=0
        #print(a)
        num_word = ""
        tu_both = 0
        t_added = 0
        h_added = 0
        
        u =  str(int(a%10))
        t =  str(int((a%100)/10))
        h =  str(int((a%1000)/100))
        tu = str(a%100)
        th = str(int(a/1000))
        tt = str(int(a/10000))
        # ht = str(int(a/100000))
        # m =  str(int(a/1000000))

        # x = [u,t,h]
        # b = list(reversed(x))
        #print(f"tt: {tt} th: {th} h: {h} tu: {tu} t: {t} u: {u}")
        if tt:
            for k in nums:
                if tt == k and k!="0":
                    index = nums.index(k)
                    ans.append(num_words[index])
                    num_word = num_word + num_words[index]
                    t_added = 1
                    break
        if th:
            for k in nums:
                if th == k and k!="0":
                    index = nums.index(k)
                    num_word = num_word + num_words[index]
                    t_added =1
                    break
        
        if t_added: num_word += " duisend"

        if h:
            for k in nums:
                if h == k and k!="0":
                    index = nums.index(k)
                    num_word = num_word + " "+ num_words[index] + ' honderd'
                    h_added = 1
                    break
        if tu:
            for k in nums:
                if tu == k:
                    index = nums.index(k)
                    if h_added: num_word = num_word +" en "+ num_words[index]
                    else: num_word = num_word +" "+ num_words[index]
                    tu_both = 1
                    break
        if t:
            for k in nums:
                if t == k and tu_both==0:
                    index = nums.index(k)
                    if h_added: num_word = num_word +" en "+ num_words[index]
                    else: num_word = num_word +" "+ num_words[index]
                    break
        if u:
            for k in nums:
                if u == k and tu_both==0:
                    index = nums.index(k)
                    num_word = num_word +" "+ num_words[index]
                    break
        # #x = [u,t,h,th,tt,ht,m]
        # # for i in range(len(b)):
        #     for k in nums:
        #         if b[i] == k and not(b[i] == '0' and i > 0):
        #             index = nums.index(k)
        #             ans.append(num_words[index])
        #             break
            
        return num_word


    c = re.findall(r'[A-Za-z]+|\d+', seq)
    ans = []
    for j,i in enumerate(c):
        if i.isdigit():
            ans.append(convert_num_word(i))
        else: ans.append(i)

    return ans





In [197]:
stra="co32despee208dy"
print(split_nums_letters(stra))
print(convert_num_word("767"))
print(convert_num_word(1128))

a = 1128
print(int((a%1000)/100))

['co', ' drie twee', 'despee', ' twee honderd en agt', 'dy']
 sewe honderd en ses sewe
een duisend een honderd en twee agt
1


In [198]:
def preprocess_data(data_file_name, nonums_data_file_name):

    nums = ['0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','30','40','50','60','70','80','90','100', '1000']
    num_words = ['nul', 'een','twee', 'drie','vier','vyf', 'ses','sewe', 'agt', 'nege','tien','elf','twaalf','dertien','veertien','vyftien','sestien','sewentien', 'agtien', 'negentien', 
            'twintig','dertig', 'veertig', 'vyftig', 'sestig', 'sewentig', 'tagtig', 'negentig', 'honderd', 'duisend']

    data_file = open(data_file_name, 'r')
    lines_data = data_file.readlines()
    data_file.close()


    #Converting to lowercase
    for i in range(0, len(lines_data)):
        # lines_data[i] = lines_data[i].split()
        lines_data[i] = split_nums_letters(lines_data[i])
        
        for j in range(0, len(lines_data[i])):
            lines_data[i][j] = lines_data[i][j].lower()
            lines_data[i][j] = lines_data[i][j].replace("-", " ")
            lines_data[i][j] = lines_data[i][j].replace("+", " ")
            lines_data[i][j] = lines_data[i][j].replace("%", "persent")
            lines_data[i][j] = lines_data[i][j].replace('"', '')
            lines_data[i][j] = lines_data[i][j].replace('\'', "")
            lines_data[i][j] = lines_data[i][j].replace("!", "")
            lines_data[i][j] = lines_data[i][j].replace("?", "")
            lines_data[i][j] = lines_data[i][j].replace(".", "")
            lines_data[i][j] = lines_data[i][j].replace(",", "")
            if j > 0: lines_data[i][j] = lines_data[i][j].replace("_", "")
            for k in nums:
                if lines_data[i][j] == k:
                    index = nums.index(k)
                    lines_data[i][j] = num_words[index]
                    break
                 

    with open(nonums_data_file_name, 'w') as file:
        for i in range(0, len(lines_data)):
            new_line = " ".join([word for word in lines_data[i][1:]])
            file.write('%s \n' %new_line)
    file.close()

def see_all_unique_words(data_file_name):
        
    def replace_numbers(stra):

        return stra

    nums = ['0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','30','40','50','60','70','80','90','100']
    num_words = ['nul', 'een','twee', 'drie','vier','vyf', 'ses','sewe', 'agt', 'nege','tien','elf','twaalf','dertien','veertien','vyftien','sestien','sewentien', 'agtien', 'negentien', 
            'twintig','dertig', 'veertig', 'vyftig', 'sestig', 'sewentig', 'tagtig', 'negentig', 'een honderd']

    data_file = open(data_file_name, 'r')
    lines_data = data_file.readlines()
    data_file.close()


    #Converting to lowercase
    for i in range(0, len(lines_data)):
        lines_data[i] = lines_data[i].split()
        for j in range(0, len(lines_data[i])):
            lines_data[i][j] = lines_data[i][j].lower()
            for k in nums:
                if lines_data[i][j] == k:
                    index = nums.index(k)
                    lines_data[i][j] = num_words[index]
                    break

    lines_data_flattened = [i for j in lines_data for i in j]
    print(len(lines_data_flattened))               

    unique = set(lines_data_flattened)
    list = sorted(unique)
    print(len(list))
    print(list)  

preprocess_data("TTScorpora.tsv", "nowordsfile")
see_all_unique_words("nowordsfile")


60219
5820
['a', 'aan', 'aanbeveel', 'aanbied', 'aand', 'aandag', 'aandrywing', 'aangaan', 'aangebring', 'aangee', 'aangegee', 'aangehou', 'aangel', 'aangemeld', 'aangenome', 'aangepas', 'aangesien', 'aangestel', 'aangeteken', 'aangetref', 'aanhaling', 'aankoopprys', 'aanleiding', 'aanloklik', 'aanloopbaan', 'aanmekaar', 'aanmoedig', 'aanneem', 'aanneming', 'aanpak', 'aansoek', 'aantal', 'aanteken', 'aanvaar', 'aanval', 'aanvaller', 'aanvangskolwer', 'aanvanklik', 'aar', 'aarbeie', 'aardbewings', 'aarde', 'aardverwarming', 'aasvo', 'abandonment', 'abbotsdale', 'aberfeldy', 'able', 'abnormal', 'about', 'abrahams', 'abstractions', 'absurdity', 'abundance', 'accept', 'accidental', 'accordance', 'accounting', 'aces', 'acquaintance', 'acquiescence', 'across', 'act', 'acts', 'actual', 'adamstein', 'added', 'adjectives', 'administrateur', 'adonis', 'adriaan', 'adrift', 'advantage', 'advantages', 'adventure', 'adverbs', 'af', 'afbetaal', 'afbreek', 'afdeling', 'affected', 'afgee', 'afgekeur', 

In [128]:
print(5%10)

5


In [129]:
import re
a = "twee10"
b = re.split('(\d+)', a)
print(b)
nums = ['0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','30','40','50','60','70','80','90','100']
num_words = ['nul','twee', 'drie','vier', 'ses','sewe', 'agt', 'nege','tien','elf','twaalf','dertien','veertien','vyftien','sestien','sewentien', 'agtien', 'negentien', 
'twintig','dertig', 'veertig', 'vyftig', 'sestig', 'sewentig', 'tagtig', 'negentig', 'een honderd']
contained = [a for a in b if a in nums]
if contained: print("True")
print(contained)

['twee', '10', '']
True
['10']


In [130]:
from curses.ascii import isdigit


a = ['12-8', '120', '121']
for i in a:
    if i.isdigit(): print(True)

True
True
