In [3]:
import pandas as pd

In [7]:
lexique_df = pd.read_csv("../paraphone/data/dictionaries/lexique_383.tsv", delimiter="\t")

In [8]:
short_df = lexique_df[["ortho", "phon", "syll"]]

In [9]:
phonemes = set()
for i, row in short_df.iterrows():
    phonemes.update(set(row["phon"]))
phonemes

{'1',
 '2',
 '5',
 '8',
 '9',
 '@',
 'E',
 'G',
 'N',
 'O',
 'R',
 'S',
 'Z',
 'a',
 'b',
 'd',
 'e',
 'f',
 'g',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '§',
 '°'}

In [11]:
# finding all words with o/O in lexique
o_words = {}
for i, row in short_df.iterrows():
    if {"o", "O"} & set(row["phon"]):
        o_words[row["ortho"]] = row["phon"]

reversed_o_words = {v: k for k, v in o_words.items()}
little_o_pho = {word: pho.replace("O", "o")
                for word, pho in o_words.items()
                if pho.replace("O", "o") != pho}
found_pairs = []
for little_o_word, little_o in little_o_pho.items():
    if little_o in reversed_o_words:
        found_pairs.append((little_o_word, reversed_o_words[little_o]))

In [19]:
import csv

with open("../data/o_pairs_lexique.csv", "w") as pairs_file:
    csv_writer = csv.writer(pairs_file, delimiter="\t")
    for little_o, big_o in found_pairs:
        csv_writer.writerow((
            little_o,
            o_words[little_o],
            big_o,
            o_words[big_o]
        ))


In [24]:
# finding all words with o/O in CMU
o_words = {}
with open("../paraphone/data/dictionaries/cmu_fr.txt") as cmu_fr:
    for row in cmu_fr:
        word, *pho = row.strip().split(" ")
        if word.endswith(")"):
            continue
        if {"au", "oo"} & set(pho):
            o_words[word] = " ".join(pho)

reversed_o_words = {v: k for k, v in o_words.items()}
little_o_pho = {word: pho.replace("oo", "au")
                for word, pho in o_words.items()
                if pho.replace("oo", "au") != pho}
found_pairs = []
for little_o_word, little_o in little_o_pho.items():
    if little_o in reversed_o_words:
        found_pairs.append((little_o_word, reversed_o_words[little_o]))

('abandonner', 'abandonnez')

In [27]:
import csv

with open("../data/o_pairs_cmu_fr.csv", "w") as pairs_file:
    csv_writer = csv.writer(pairs_file, delimiter="\t")
    for little_o, big_o in found_pairs:
        csv_writer.writerow((
            little_o,
            o_words[little_o],
            big_o,
            o_words[big_o]
        ))

In [13]:
import re

regex = re.compile("8(.)")
with_8 = set()
for i, row in short_df.iterrows():
    phones: str = row["phon"]
    with_8.update(set(regex.findall(phones)))
with_8

{'2', '5', '9', '@', 'E', 'O', 'a', 'e', 'i', 'o', 'y', '§'}

In [None]:
# Investigations for the CELEX dict

In [1]:
import csv

all_phonemes_chars = set()
all_phonemes = set()
with open("../paraphone/data/foldings/en/celex.csv") as celex_fold_file:
    reader = csv.DictReader(celex_fold_file, delimiter=",")
    for row in reader:
        phon_celex = row["phon_celex"]
        all_phonemes_chars.update(set(phon_celex))
        all_phonemes.update(phon_celex.split(" "))
all_phonemes_chars.remove(" ")

In [2]:
all_phonemes

{'3:',
 '@',
 '@U',
 'A:',
 'A~:',
 'D',
 'E',
 'I',
 'I@',
 'N',
 'O:',
 'OI',
 'O~:',
 'Q',
 'S',
 'T',
 'U',
 'U@',
 'V',
 'Z',
 'aI',
 'aU',
 'b',
 'd',
 'dZ',
 'eI',
 'f',
 'g',
 'h',
 'i:',
 'j',
 'k',
 'l',
 'm',
 'n',
 'p',
 'r',
 's',
 't',
 'tS',
 'u:',
 'v',
 'w',
 'x',
 'z',
 '{'}

In [34]:
from tqdm import tqdm
import re

clx_phon_re = re.compile(r"\[(.+)\]")

rejected_phonemes_char = set()
total_count, rejected_count = 0, 0
with open("../data/celex.txt") as celex_dic, open("../data/rejected.csv", "w") as rej_file:
    rej_csv = csv.writer(rej_file, delimiter="\t")
    for row in tqdm(celex_dic):
        row = row.strip().split("\\")
        total_count += 1
        re_match = clx_phon_re.findall(row[6])
        phonemized_word = "".join(re_match)
        rejected = set(phonemized_word) - all_phonemes_chars
        if rejected:
            rejected_count += 1
            rejected_phonemes_char.update(rejected)
            rej_csv.writerow((row[1], row[6], rejected))

total_count, rejected_count

160595it [00:00, 450926.01it/s]


(160595, 0)

In [20]:
set(phonemized_word)

{'1'}

In [5]:
from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator
from paraphone.utils import null_logger
from tqdm import tqdm
import csv
import re

clx_phon_re = re.compile(r"\[(.+?)\]")
separator = Separator(phone="", word=None)
backend = EspeakBackend(
    "en-us",
    language_switch="remove-utterance",
    logger=null_logger())

with open("../data/celex.txt") as celex_dic, \
        open("../data/r*_words.csv", "w") as rej_file:
    r_csv = csv.writer(rej_file, delimiter="\t")
    for row in tqdm(celex_dic):
        row = row.strip().split("\\")
        re_match = clx_phon_re.findall(row[8])
        phonemized_word = "".join(re_match)
        if not phonemized_word.endswith("r*"):
            continue

        espeak_phon = backend.phonemize(text=[row[1]],separator=separator, strip=True)
        if espeak_phon[0].endswith("ɛɹ"):
            r_csv.writerow(
                (row[1], phonemized_word, espeak_phon[0])
            )

160595it [00:01, 89175.02it/s] 


In [7]:
sutck_pho_CMU = {'AA0',
 'AA1',
 'AA2',
 'AE0',
 'AE1',
 'AE2',
 'AH1',
 'AH2',
 'AO0',
 'AO1',
 'AO2',
 'AW0',
 'AW1',
 'AY0',
 'AY1',
 'AY2',
 'EH0',
 'EH1',
 'EH2',
 'ER0',
 'ER1',
 'ER2',
 'EY0',
 'EY1',
 'EY2',
 'IH0',
 'IH1',
 'IH2',
 'IY0',
 'IY1',
 'IY2',
 'OW0',
 'OW1',
 'OW2',
 'OY0',
 'OY1',
 'OY2',
 'UH0',
 'UH1',
 'UH2',
 'UW0',
 'UW1',
 'UW2'}
sutck_pho_CMU

{'AA0',
 'AA1',
 'AA2',
 'AE0',
 'AE1',
 'AE2',
 'AH1',
 'AH2',
 'AO0',
 'AO1',
 'AO2',
 'AW0',
 'AW1',
 'AY0',
 'AY1',
 'AY2',
 'EH0',
 'EH1',
 'EH2',
 'ER0',
 'ER1',
 'ER2',
 'EY0',
 'EY1',
 'EY2',
 'IH0',
 'IH1',
 'IH2',
 'IY0',
 'IY1',
 'IY2',
 'OW0',
 'OW1',
 'OW2',
 'OY0',
 'OY1',
 'OY2',
 'UH0',
 'UH1',
 'UH2',
 'UW0',
 'UW1',
 'UW2'}