In [None]:
%load_ext tensorboard

In [2]:
import glob
import pandas as pd
import numpy as np


In [7]:
allFiles = glob.glob("dp/notebooks/lexicons/*")
allFiles.sort()
allDF = []

for f in allFiles:
    filename = f.split("/")[3][:-4]
    df = pd.read_csv(f, dtype=str,
        encoding='utf-8', sep='\t',
        names=['grapheme', 'phoneme'])
    df.insert(2, 'filename', filename)
    allDF.append(df)

df = pd.concat(allDF, ignore_index=True)
df.insert(0, 'lang', 'pt_br')

In [8]:
df['grapheme'] = df['grapheme'].map(str)
df['phoneme'] = df['phoneme'].map(str)

graphemes = ''.join(sorted(list(set(df['grapheme'].sum()))))

In [9]:
graphemes

' !()+,-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªÀÁÂÉÊÍÓÚàáâãçéêíóôõöúûüý'

In [87]:
df['gWords'] = df['grapheme'].str.count(' ') + 1
df['pWords'] = df['phoneme'].str.count(' ') + 1
df['grapheme'] = df['grapheme'].str.split(' ')
df['phoneme'] = df['phoneme'].str.split(' ')

In [88]:
df = (df.groupby('filename', as_index=False)
    .apply(lambda x: x.reset_index(drop=True)).reset_index().drop('level_0', axis=1))
df.level_1 = df.level_1 + 1

In [89]:
df[df.gWords > df.pWords].apply(lambda x:
    x.phoneme.extend([''] * (x.gWords - x.pWords)), axis=1)
df[df.pWords > df.gWords].apply(lambda x:
    x.grapheme.extend([''] * (x.pWords - x.gWords)), axis=1)
dfWords = (df.apply(pd.Series.explode)
    .drop(['lang', 'gWords', 'pWords'], axis=1))

In [6]:
allFilesFixed = glob.glob("dp/notebooks/lexicons_fixed/*")
allFilesFixed.sort()
allDFFixed = []

for f in allFilesFixed:
    filename = f.split("/")[3][:-4]
    dfFixed = pd.read_csv(f, dtype=str,
        encoding='utf-8', sep='\t',
        names=['grapheme', 'phoneme'])
    dfFixed.insert(2, 'filename', filename)
    allDFFixed.append(dfFixed)

dfFixed = pd.concat(allDFFixed, ignore_index=True)
dfFixed.insert(0, 'lang', 'pt_br')

ValueError: No objects to concatenate

In [91]:
dfFixed['gWords'] = dfFixed['grapheme'].str.count(' ') + 1
dfFixed['pWords'] = dfFixed['phoneme'].str.count(' ') + 1
dfFixed['grapheme'] = dfFixed['grapheme'].str.split(' ')
dfFixed['phoneme'] = dfFixed['phoneme'].str.split(' ')

In [92]:
dfFixed = (dfFixed.groupby('filename', as_index=False)
    .apply(lambda x: x.reset_index(drop=True)).reset_index().drop('level_0', axis=1))
dfFixed.level_1 = dfFixed.level_1 + 1

In [93]:
dfFixed[dfFixed.gWords > dfFixed.pWords].apply(lambda x:
    x.phoneme.extend([''] * (x.gWords - x.pWords)), axis=1)
dfFixed[dfFixed.pWords > dfFixed.gWords].apply(lambda x:
    x.grapheme.extend([''] * (x.pWords - x.gWords)), axis=1)
dfFixedWords = (dfFixed.apply(pd.Series.explode)
    .drop(['lang', 'gWords', 'pWords'], axis=1))

In [94]:
phonemes = dfWords['phoneme']
phonemesFixed = dfFixedWords['phoneme']

In [95]:
(phonemes != phonemesFixed).sum()

24653

In [96]:
phon = dfWords[phonemesFixed != phonemes][['filename', 'level_1', 'grapheme', 'phoneme']]
phonFixed = dfFixedWords[phonemes != phonemesFixed][['filename', 'level_1', 'grapheme', 'phoneme']]

In [97]:
phon.to_csv('phon.txt', index=False)
phonFixed.to_csv('phonFixed.txt', index=False)

In [27]:
dfWords.drop_duplicates(subset=['grapheme', 'phoneme']).to_csv('words.txt')

In [12]:
dfFixedWords.drop_duplicates(subset=['grapheme', 'phoneme']).to_csv('words_fixed.txt')

In [3]:
df['grapheme'] = df['grapheme'].map(str)
df['phoneme'] = df['phoneme'].map(str)

graphemes = ''.join(sorted(list(set(df['grapheme'].sum()))))

phonemes = (
    df['phoneme'].str.split("\\")
        .explode().drop_duplicates()
        .sort_values().reset_index(drop=True)
        .values.tolist()
)

phonemes.append('~')

In [4]:
phonemes

['',
 ' ',
 " '",
 "'",
 '.',
 'E',
 'J',
 'L',
 'O',
 'R',
 'S',
 'X',
 'Z',
 'a',
 'a~',
 'b',
 'd',
 'dZ',
 'e',
 'ej',
 'e~',
 'e~j~',
 'f',
 'g',
 'i',
 'i~',
 'j',
 'js',
 'j~',
 'j~s',
 'k',
 'l',
 'm',
 'n',
 'o',
 'ow',
 'o~',
 'p',
 'pau',
 'r',
 's',
 't',
 'tS',
 'u',
 'u~',
 'v',
 'w',
 'w~',
 'z',
 '~']

In [108]:
allFilesFixed = glob.glob("dp/notebooks/lexicons_fixed/*")
allFilesFixed.sort()
allDFFixed = []

for f in allFilesFixed:
    filename = f.split("/")[3][:-4]
    dfFixed = pd.read_csv(f, dtype=str,
        encoding='utf-8', sep='\t',
        names=['grapheme', 'phoneme'])
    dfFixed.insert(2, 'filename', filename)
    allDFFixed.append(dfFixed)

dfFixed = pd.concat(allDFFixed, ignore_index=True)
dfFixed['phoneme'] = dfFixed['phoneme'].str.replace('\\\\pau\\\\',",")
dfFixed['phoneme'] = dfFixed['phoneme'].str.replace('\\','')

In [109]:
dfFixed.to_csv('lex.txt')

In [3]:
def levenshtein(target, predicted):
    d = np.zeros((len(target) + 1) * (len(predicted) + 1), dtype=np.uint8)
    d = d.reshape((len(target) + 1, len(predicted) + 1))
    for i in range(len(target) + 1):
        for j in range(len(predicted) + 1):
            if i == 0:
                d[0][j] = j
            elif j == 0:
                d[i][0] = i

    for i in range(1, len(target) + 1):
        for j in range(1, len(predicted) + 1):
            if target[i - 1] == predicted[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                substitution = d[i - 1][j - 1] + 1
                insertion = d[i][j - 1] + 1
                deletion = d[i - 1][j] + 1
                d[i][j] = min(substitution, insertion, deletion)

    return d[len(target)][len(predicted)], len(target)


In [8]:
target = "e~'ta~w~, ki des'kuw.pa me'LOX 'pa.ra fi.kaR.muZu~'tus"
predicted = "e~'ta~w~pau ki dZi ki 'pla 'mO.La pra'zi u~ 'muj~"

res = levenshtein(target=target, predicted=predicted)


In [9]:
res

(31, 54)

In [None]:
["'", 'm', 'a', '.', 't', 'a', ' ', "'", 's', ' ', "'", 's', ' ', "'", ...]