In [1]:
ds_path = '/home/boris/Projects/Voice_Assistant_for_Voice_Anomaly_Persons/Multi-lingual Phoneme Recognition/data/xls-r_dataset_v4.csv'

In [2]:
from collections import defaultdict, Counter
from typing import Iterable

import pandas as pd
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
import json

f = open("/home/boris/Projects/Voice_Assistant_for_Voice_Anomaly_Persons/Multi-lingual Phoneme Recognition/models/processor/vocab.json")
phonemes: dict[str, int] = json.load(f)
f.close()

phonemes_reverse ={j:i for i, j in phonemes.items()}

In [5]:
df = pd.read_csv(ds_path)

In [6]:
df.iloc[0]['IPA'], df.iloc[0]['text']

('k r a n d i', 'радио')

In [7]:
train_df, test_df = train_test_split(df)

In [8]:
len(train_df), len(test_df)

(1335, 445)

In [9]:
def get_triplets(ipa: list[str], text: str) -> Iterable[tuple[tuple[str, str], str]]:
    for i in range(min(len(ipa)-1, len(text))):
        yield (ipa[i], ipa[i+1]), text[i]

In [10]:
example = get_triplets(train_df.iloc[0]['IPA'].split(), train_df.iloc[0]['text'])

In [11]:
def prepare_text(raw: str) -> str:
    return ''.join(i.lower() for i in raw if i.isalpha())

In [12]:
class HMMState:
    _state: defaultdict[tuple[str, str], Counter]
    def __init__(self) -> None:
        self._state = defaultdict(lambda: Counter())

In [13]:
len(phonemes.keys())

392

In [14]:
letters = set()
phonemes_red = set()
for _, row in df.iterrows():
    for ph, l in get_triplets(row['IPA'].split(), prepare_text(row['text'])):
        letters.add(l)
        phonemes_red.add(ph[0])
        phonemes_red.add(ph[1])
print(letters)
print(phonemes_red)

{'и', 'д', 'ц', 'ш', 'o', 'й', 'т', 't', 'р', 'ё', 'e', 'о', 'л', 'ч', 'б', 'ъ', 'y', 'ж', 'э', 'b', 'ю', 'ь', 'з', 'ы', 'u', 'а', 'у', 'к', 'щ', 'е', 'н', 'с', 'ф', 'п', 'м', 'c', 'в', 'х', 'я', 'г'}
{'ə', 'k', 'æ', 'ɟ', 'ɑ', 'ø', 'o', 'd', 't', 'ɪ', 'h', 'sʲ', 'e', 'ɑɨ', 'ts', 'ʌ', 'ɯ', 'iː', 'v', 'ɑː', 'j', 'y', 'r', 'ʒ', 'p', 'dʒ', 'q', 'b', 'ŋ', 'uː', 'ɚ', 's', 'ç', 'z', 'ɬ', 'u', 'x', 'ɕ', 'ʁ', 'ð', 'ɨ', 'i', 't[', 'ɣ', 'ɾ', 'f', 'n', 'ʎ', 'ɹ', 'ɔ', 'aɪ', 'eɪ', 'ʉ', 'aː', 'ɭ', 'l', 'ʃ', 'w', 'oɪ', 'tʃ', 'eː', 'œ', 'ɔɨ', 'a', 'm', 'ɲ', 'ʊ', 'oʊ', 'θ', 'tʲ', 'ɛ', 'ɡ', 'oː'}


In [15]:
sorted_ab = sorted(letters)
sorted_ph = sorted(phonemes_red)

In [16]:
hmm_state_np = np.zeros((len(sorted_ph)**2,len(sorted_ab)), dtype=float)

In [17]:
hmm_state = HMMState()

In [18]:
t=0
for _, row in train_df.iterrows():
    for ph, l in get_triplets(row['IPA'].split(), prepare_text(row['text'])):
        bigram_index = sorted_ph.index(ph[0])*len(sorted_ph) + sorted_ph.index(ph[1])
        letter = sorted_ab.index(l)
        hmm_state_np[bigram_index, :] = hmm_state_np[bigram_index, :]*t/(t+1)
        hmm_state_np[bigram_index, letter] += 1/(t+1)
        t+=1

In [19]:
t

30915

In [20]:
2/3,1/3

(0.6666666666666666, 0.3333333333333333)

In [21]:
3/4,1/4

(0.75, 0.25)

In [22]:
hmm_state_np.min(), hmm_state_np.max()

(0.0, 0.9524713943836081)

In [45]:
np.save('hmm2k.npy', hmm_state_np)

evaluating

In [23]:
def predict(ipa: str):
    test_ipa = ipa.split()
    k=2
    result = []
    for i in range(len(test_ipa)-1):
        p1, p2 = test_ipa[i], test_ipa[i+1]
        p1=sorted_ph.index(p1)
        p2=sorted_ph.index(p2)
        raw_probs = hmm_state_np[ p1*len(sorted_ph)+ p2]
        pbs = []
        for ix, p in enumerate(raw_probs):
            pbs.append((sorted_ab[ix], p))
            pbs = sorted(pbs, key=lambda x: -x[1])
            if len(pbs)>k:
                pbs.pop()
        # print(pbs)
        result.append(pbs)
    return result

In [24]:
from Levenshtein import ratio

In [25]:
def get_ratios(prediction: list[list[tuple[str, float]]], actual: str):
    return ratio(actual, ''.join(x[0][0] for x in prediction))

In [42]:
1-ratio('abc', 'abc')

0.0

In [26]:
test_ipa = 's a l n e tʃ u n i k d e n'

In [27]:
prediction = predict(test_ipa)
prediction

[[('о', 0.0040671743126617175), ('с', 0.003048064478658715)],
 [('о', 0.008982258084513575), ('я', 0.008141906869474073)],
 [('п', 0.005647576651187478), ('о', 0.0015788877311761523)],
 [('о', 0.0402859165214735), ('а', 0.014269044481870385)],
 [('ч', 0.005221815360348069), ('е', 0.0008150513600599255)],
 [('ч', 0.00024209121134581663), ('у', 5.770007501009751e-05)],
 [('а', 0.0021814841188397367), ('н', 0.0019928932106240966)],
 [('н', 0.04099674019032559), ('е', 0.02001048283648116)],
 [('к', 0.008560269466992298), ('е', 0.006117210314857043)],
 [('д', 0.0010205671378321418), ('л', 0.0004652307685292817)],
 [('е', 0.046886604360172744), ('д', 0.005644232124571832)],
 [('н', 0.014896529343130026), ('е', 0.011039223890060058)]]

In [28]:
text='солнечныйдень'

top 1 CER

In [29]:
get_ratios(prediction, text)

0.48

top 2 CER

In [30]:
from itertools import product

In [31]:
prediction

[[('о', 0.0040671743126617175), ('с', 0.003048064478658715)],
 [('о', 0.008982258084513575), ('я', 0.008141906869474073)],
 [('п', 0.005647576651187478), ('о', 0.0015788877311761523)],
 [('о', 0.0402859165214735), ('а', 0.014269044481870385)],
 [('ч', 0.005221815360348069), ('е', 0.0008150513600599255)],
 [('ч', 0.00024209121134581663), ('у', 5.770007501009751e-05)],
 [('а', 0.0021814841188397367), ('н', 0.0019928932106240966)],
 [('н', 0.04099674019032559), ('е', 0.02001048283648116)],
 [('к', 0.008560269466992298), ('е', 0.006117210314857043)],
 [('д', 0.0010205671378321418), ('л', 0.0004652307685292817)],
 [('е', 0.046886604360172744), ('д', 0.005644232124571832)],
 [('н', 0.014896529343130026), ('е', 0.011039223890060058)]]

In [37]:
def get_substrs(p):
    if len(p)==1:
        return (i[0] for i in p[0])
    try:
        substrs = get_substrs(p[1:])
    except RecursionError:
        substrs = (i[0] for i in p[1])
    return (i[0]+j for i in p[0] for j in substrs)

In [33]:
def get_top2_cer(prediction, text):
    s=0
    for i in get_substrs(prediction):
        s=max(get_ratios(i, text), s)
    return s

In [38]:
rat=0
rat2=0
total=0
for k, i in train_df.iterrows():
    prediction=predict(i['IPA'])
    rat += get_ratios(prediction, i['text'])
    rat2 += get_top2_cer(prediction, i['text'])
    total+=1
    print(total, rat/total, rat2/total, end='                          \r')

1335 0.3952874449388913 0.40554023273926626                           

Получаем top 1 CER на тренировочной выборке 0.39, top 2 - 0.41

In [43]:
rat=0
rat2=0
total=0
for k, i in test_df.iterrows():
    prediction=predict(i['IPA'])
    rat += get_ratios(prediction, i['text'])
    rat2 += get_top2_cer(prediction, i['text'])
    total+=1
    print(total, rat/total, rat2/total, end='                          \r')

445 0.3673291647224549 0.3781902074702358                            

Получаем top 1 CER на тестовой выборке 0.36, top 2 - 0.37

In [None]:
import random

In [91]:
l=list(range(9))
random.shuffle(l)
l

[8, 1, 0, 5, 3, 2, 4, 7, 6]

In [95]:
def rc(p):
    l = []
    j=0
    for i in p:
        j+=1
        print(len(l), j, end='   \r')
        l.append(i)
        if random.random()<0.02 or len(l)>10000:
            c = random.choice(l)
            l.remove(c)
            if random.random()<0.01:
                for i in range(min(random.randint(1,20), len(l))):
                    c = random.choice(l)
                    l.remove(c)
                    yield c

def get_all_texts(inp: list[list[tuple[str, float]]], p=1, s=''):
    mp=0
    for ixs_ in range(10000000):#rc(product(range(len(inp[0])),repeat=len(inp))):
        
        ixs = list(random.randrange(0, len(inp[0])) for _ in range(len(inp)))
        

        s = ''
        p = 0.0
        for i1, i2 in enumerate(ixs):
            s+=inp[i1][i2][0]
            p+=inp[i1][i2][1]
            # if p<0.001*len(s):#/len(s)<0.1:
            #     break
        if p>=mp:
            yield s, p
            mp=p
mr=0
cer = 0
mcr =0
for i in get_all_texts(result):
    r = ratio(i[0], text)
    cer += i[1]*r
    if  r > mr or i[1]*r>mcr:
        mr = max(r, mr)
        mcr=max(r,mcr)
        print(i, i[1]*r, r)

('солеичсееиде', 0.18381928793867508) 0.10293880124565806 0.56


KeyboardInterrupt: 

Top 5 CER is about 0.7.

In [98]:
most_probable_phrase = [i[0][0] for i in result]
most_probable_phrase2 = [i[1][0] for i in result]

In [99]:
''.join(most_probable_phrase2)

'солеччснидне'

In [108]:
ratio(''.join(most_probable_phrase), 'солнечныйдень')

0.31999999999999995

In [105]:
mr=0
for i in df['text']:
    for j in (most_probable_phrase, most_probable_phrase2):
        i1=i
        i = ''.join( [k.lower() for k in i if k!=' '])
        r = ratio(i, j)
        if r> mr:
            mr=r
            print(i1, r)


радио 0.23529411764705888
это для обучения или экзамена 0.2702702702702703
этодляобученияилиэкзамена 0.32432432432432434
мне нужно добавить медицина 0.38888888888888884
снежинки 0.4
затрапезный вид 0.46153846153846156
артист невидим 0.48
на заднем сидении 0.5185185185185186
маленькие женщины 0.5714285714285714
солнечныйдень 0.64


In [None]:
print(ratio("зотсттуоднддеррсстазнновонеоноскреттеемаркоскеемнттел", 'записьаудиодляраспознаваниескрытыемарковскиемодели'))
print(ratio('тсстираовонненнаррткбитесстдделссраовннннимткрриркччсст', 'записьаудиодляраспознаваниескрытыемарковскиемодели'))

0.5436893203883495
0.34285714285714286


In [None]:
print(ratio("тсстираовонненнаррткбитесстдделссраовннннимткрриркччсст", 'записьаудиодляраспознаваниескрытыемарковскиемодели'))
print(ratio("тсстираовонненнаррткбитесстдделссраовннннимткрриркччсст", "тестированиенадругомтекстедлясравненияметриккачества"))

0.34285714285714286
0.6168224299065421
