In [2]:
import nltk
from nltk import CFG
from nltk import grammar, parse
from nltk.parse import load_parser
from nltk.parse.generate import generate
import random
import re
import operator
import editdistance

In [3]:
s5 = '/projects/speech/ASR/kaldi/egs/librispeech/s5-mats'
data = s5 + '/data'
exp = s5 + '/exp'
train21 = data + '/train21'

g1_path = s5 + '/grammar/g1.fcfg'

truth_path = train21 + '/truth'

p1 = load_parser(g1_path, trace=0,cache=False)

def sentence_parses(s,parser):
    try:
        if next(parser.parse(s)):
            return True
    except:
        return False

p1

<nltk.parse.featurechart.FeatureChartParser at 0x7f444823afa0>

#### Illustrate the parser
Trees don't work with this Python and NLTK version.  Notice that the grammar uses non-capitalized terminals.

In [4]:
print(next(p1.parse('every vowel is capitalized'.split())))

(S[SEM=<all n.(exists c.(vowel(c) & char(n,c)) -> exists c.(capital(n) & char(n,c)))>]
  (DP[NUM='sg', SEM=<\Q.all n.(exists c.(vowel(c) & char(n,c)) -> Q(n))>, STR='yes']
    (Det[NUM='sg', SEM=<\P Q.all n.(P(n) -> Q(n))>, STR='yes'] every)
    (N[NUM='sg', SEM=<\n.exists c.(vowel(c) & char(n,c))>] vowel))
  (VP[NUM='sg', SEM=<\n.exists c.(capital(n) & char(n,c))>]
    (V[NUM='sg', SEM=<\n.exists c.char(n,c)>] is)
    (A[SEM=<\n.exists c.(capital(n) & char(n,c))>] capitalized)))


#### Best sentences
Some of this modified from Elise Kronblicher's 2021 project report.

Read into a list the results of `lattice-to-nbest` and `nbest-to-linear`, with an $n$ of 100. See the earlier notebook.

In [5]:
! head -4 $LIBRISPEECH/exp/tri6b/decode_tgsmall_train21F.si/text.100

aiden-a-1-1 AT LEAST ONE VOWEL IS CAPITALIZED 
aiden-a-1-2 AT LEAST ONE VOWELS CAPITALIZED 
aiden-a-10-1 THERE IS EXACTLY ONE THE T 
aiden-a-10-2 THERE IS EXACTLY AT ONE T 


In [7]:
best_100_path = exp + '/tri6b/decode_tgsmall_train21F.si/text.100'
with open(best_100_path) as f:
    best_100_lines=f.readlines()
f.close()

There are a lot, because there are up to 100 readings for each audio.

In [12]:
print(len(best_100_lines))
best_100_lines[0:8]

20724


['aiden-a-1-1 AT LEAST ONE VOWEL IS CAPITALIZED \n',
 'aiden-a-1-2 AT LEAST ONE VOWELS CAPITALIZED \n',
 'aiden-a-10-1 THERE IS EXACTLY ONE THE T \n',
 'aiden-a-10-2 THERE IS EXACTLY AT ONE T \n',
 'aiden-a-10-3 THERE IS A EXACTLY ONE THE T \n',
 'aiden-a-10-4 THERE IS THE EXACTLY ONE THE T \n',
 'aiden-a-10-5 THERE IS A EXACTLY AT ONE T \n',
 'aiden-a-10-6 THERE IS EXACTLY ONE T \n']

Create lists of UIDs with a parse, and the readings that parsed.  (Elise added the first option if there was no reading that parsed, this is left out here.)
This run took ten minutes.

In [14]:
idx=[]
for s in best_100_lines:
    id='-'.join(s.split('-')[0:3])
    split=s.split()
    idx+=[[id,split[0],[s.lower() for s in split[1:]]]]

good_uid=[]
good_reading=[]
for i in idx:
    if i[0] not in good_uid:
        if sentence_parses(i[2],p1):
            good_uid+=[i[0]]
            good_reading+=[' '.join(w.upper() for w in i[2])]
#for i in idx:
#    if i[0] not in files:
#        files+=[i[0]]
#        sents+=[' '.join(w.upper() for w in i[2])]
print(good_uid[0:3],good_reading[0:3])

['aiden-a-1', 'aiden-a-2', 'aiden-a-3'] ['AT LEAST ONE VOWEL IS CAPITALIZED', 'SOME VOWEL IS NOT CAPITALIZED', 'EVERY GLIDE PRECEDES THE VOWEL']


In [15]:
print(len(good_uid))

182


This is kind of medium, a reading that parses if found for 182 out of 349 items.

In [17]:

# This saves the results of the slow code above.  It has the best reading for each uid.
# It leaves out the items that did not have a parsing option.
f = open(train21 + 'recognized', "w")
for i in range(len(good_uid)):
    f.write(good_uid[i]+" "+good_reading[i]+"\n")
f.close()



## Code for models

In [18]:
import nltk
from nltk import grammar, parse, load_parser
from typing import Callable, List, Set
import re

###This code from Akash Aryal's model generator 

vowels = ['a', 'e', 'i', 'o', 'u']
fricatives = ['v', 'f', 's', 'z', 'h', 'th', 'sh', 'zh']
              
#def vowel(word,i):
#    return word[i].lower() in vowels
#get_vowel = lambda word: f'vowel => {set([i+1 for i in range(len(word)) if vowel(word,i)])}'

def capital(word, i):
    return word[i].isupper()
get_capital = lambda word: f'capital => {set([i+1 for i in range(len(word)) if capital(word,i)])}'

def less_than(i, j):
    return i<j
get_less_than = lambda word: f'le => {set([(i+1,j+1) for i in range(len(word)) for j in range(len(word)) if i!=j and less_than(i+1,j+1)])}'

def adjacent(i,j):
    return abs(i-j) == 1
get_adjacent = lambda word: f'ad => {set([(i+1,j+1) for i in range(len(word)) for j in range(len(word)) if adjacent(i+1,j+1)])}'

get_even =  lambda word: f'even => {set([i+1 for i in range(len(word)) if (i+1)%2==0])}'

get_odd = lambda word: f'odd => {set([i+1 for i in range(len(word)) if (i+1)%2!=0])}'

# @323
def voiced(word):
    word=word.lower()
    v=[]
    for i in range(len(word)):
        if i != len(word)-1:
            if (word[i] == 'n' and word[i+1]=='g') or (word[i] == 's' and word[i+1] == 'z'):
                v.append((i+1, word[i]))
                v.append((i+2, word[i+1]))
        if word[i] == 'z':
            if ((i,'s') not in v):
                v.append((i+1, 'z'))
        if word[i] == 'g':
            if ((i,'n') not in v):
                v.append((i+1, 'g'))
        if word[i] == 'n':
            if ((i+2,'g') not in v):
                v.append((i+1,'n'))
        if word[i] in ['a', 'e', 'i', 'o', 'u', 'b', 'd', 'j', 'l', 'm', 'r', 'v', 'w', 'y']:
            v.append((i+1, word[i]))
    return v
            
get_voiced= lambda w: f'voiced => {set([(i+1,w[i].lower()) for i in range(len(w)) if (i+1, w[i].lower()) in voiced(w)])}'

def centered(word,i):
    if (len(word)%2==0):
        return len(word)//2 == i or i == len(word)//2 + 1
    else:
        return len(word)//2 + 1 == i

get_centered = lambda word: f'cent => {set([i+1 for i in range(len(word)) if centered(word,i+1)])}'

get_mirrored = lambda w: f'mirrored => {set(i+1 for i in range(len(w)) if w[i].lower() == w[len(w)-1-i].lower())}'

get_glide =  lambda w: f'glide => {set(i+1 for i in range(len(w)) if w[i].lower() == "w" or w[i].lower() == "y")}'

# from @325
def fricatives(word):
    word.lower()
    frics = []
    for i in range(len(word)):
        if i != len(word)-1:
            if word[i] in ['t', 's', 'z'] and word[i+1] == 'h':
                frics.append((i+1, word[i]))
                frics.append((i+2, 'h'))
        if word[i] == 'h':
            if ((i, 't') not in frics) and ((i, 's') not in frics) and ((i, 'z') not in frics):
                frics.append((i+1, 'h'))
        if word[i] in ['s', 'z']:
            if ((i+2, 'h') not in frics):
                frics.append((i+1, word[i]))
        if word[i] in ['v', 'f']:
            frics.append((i+1, word[i]))
        return frics
get_fricative = lambda w: f'fricative => {set([(i+1,w[i].lower()) for i in range(len(w)) if (i+1, w[i].lower()) in fricatives(w)])}'

##Code by Angela Liu from HW3
def to_model_str(word: str, special_rels: List[Callable[[str], str]]=[]) -> str:
    """
    Creates the string form of the model for the input word. This string is meant to be passed to `nltk.Valuation.fromstring`.
    By default, the function will only add the relations mapping i => i for i from 1 to the length of `word` and a relation 
    mapping char => the set of tuples (i, word[i]). The `special_rels` function allows you to specify additional relations to 
    be added to the valuation string.
    
    :param word: The word to create a model string for.
    :param special_rels: A list of functions that when called return a string of the form {relation_name} => {relation_contents}. Defaults to the empty list.
    :returns: a string representing the model for word
    """
    n = len(word)
    model_str = []
    char = []
    for i in range(1, n+1):
        model_str.append(f'{i} => {i}')
        char.append((i, word[i-1]))
    model_str.append(f'char => {set(char)}'.lower())
    return '\n'.join(model_str + [rel(word) for rel in special_rels]).replace("'", "")
    # Angela Liu

def emptysets(val:nltk.sem.evaluate.Valuation):
    val.update([(k,set()) for (k,v) in val.items() if v == 'set()'])
    



In [24]:
print(get_capital('aDvOcAtEf'))
print(to_model_str("bAtSf",[get_capital, get_less_than, get_adjacent, get_even, get_odd, 
                                                   get_voiced, get_centered, get_mirrored, get_glide, get_fricative]))

capital => {8, 2, 4, 6}
1 => 1
2 => 2
3 => 3
4 => 4
5 => 5
char => {(4, s), (2, a), (1, b), (5, f), (3, t)}
capital => {2, 4}
le => {(2, 4), (1, 2), (3, 4), (1, 5), (1, 4), (2, 3), (4, 5), (2, 5), (1, 3), (3, 5)}
ad => {(1, 2), (2, 1), (3, 4), (4, 3), (5, 4), (2, 3), (4, 5), (3, 2)}
even => {2, 4}
odd => {1, 3, 5}
voiced => {(1, b), (2, a)}
cent => {3}
mirrored => {3}
glide => set()
fricative => set()


### Truth checking
From E.K.

In [25]:
truth_path = train21 + '/truth'
result_path = s5 + '/result/result1'
print(truth_path)
print(result_path)

/projects/speech/ASR/kaldi/egs/librispeech/s5-mats/data/train21/truth
/projects/speech/ASR/kaldi/egs/librispeech/s5-mats/result/result1


In [26]:
def check_truth(truthpath,resultpath,grammarpath):
    """
    Compares the truth values from user's generated tests to those produced from from the truth page. Iterates through the truth file
    one sentence at a time
     
    
    :param: Results are in the same format as a text file"
    """
    
    ##User needs to provide their own truth and text paths
    
    gram_lst=[]
    after_assign=[]
    after_calc=[]
    before_calc=[]
    text = open(resultpath)
    truth = open(truthpath)
    sentences = []
    truthTests = []
    for l, t in zip(text, truth):
        truthTests.append(t.strip('\n').split(" "))
        sentences.append(l.split(" ", 1)[0])
        sentences.append(l.split(" ", 1)[1].strip('\n').strip())
    
    pr = nltk.load_parser(grammarpath, trace=0,cache=False)
    #Magic, do not touch! Cannot parse sentences without this line!
    get_vowel = lambda w: f'vowel => {set(re.findall(r"[AEIOUaeiou]", w))}'.lower()
    
    numCorrect = 0 
    numGuessedCorrect=0
    numTests = 0
    for t in truthTests:
        words = []
        truths = []
        sentence = sentences[sentences.index(t[0])+1].lower()
        print(sentence)
        
        for i in range(1, len(t)):
                if (i % 2 == 1):
                    words.append(t[i])
                    
                else :
                    if(t[i] == 't'): 
                        truths.append('True')
                        numTests+=1
                    elif(t[i] == 'f'): 
                        truths.append('False')
                        numTests+=1
                    elif(t[i] == 'u'): truths.append('Undefined')
                    else: print("Check the format of your truth file, "+t[i])
        try:
            formula = next(pr.parse(sentence.split())).label()['SEM']
            gram_lst+=[sentence]
               
            equals = lambda w: f'eq => {set([(i+1,i+1) for i in range(len(w))])}'
            vals = [nltk.Valuation.fromstring(to_model_str(w, [get_vowel, get_capital, get_less_than, get_adjacent, get_even, get_odd, 
                                                   get_voiced, get_centered, get_mirrored, get_glide, get_fricative])) for w in words]
            assignments = [nltk.Assignment(val.domain) for val in vals]
            for val in vals: emptysets(val)
            models = [nltk.Model(val.domain, val) for val in vals]
            after_assign+=[sentence]
            for w, a, m, t in zip(words, assignments, models, truths):
                
                calculated = m.evaluate(str(formula),a)
                
                #I left in some useful print statements if you want to see a breakdown of the results
                #print(f'{w}\nexpected: {t} calculated: {calculated}\n----------------\n')
                if(t != "Undefined"):
                    #print(f"truth file: {t} calculated: {calculated}")
                    if(t == str(calculated)): numCorrect+=1
                
            
        except:
            for t in truths:
                if t=='False': numGuessedCorrect+=1
#             print("\nYour sentence '" + sentence + "' is ungrammatical")
            
        
    print(f"\n{numCorrect+numGuessedCorrect} out of {numTests}\nTruth value error rate: {1-(numCorrect+numGuessedCorrect)/numTests}")
    print(numCorrect)
    print(numGuessedCorrect)
    print(numTests)
    

In [27]:
check_truth(truth_path,result_path,g1_path)

there is exactly one the t
at least one vowel is capitalized
some vowel is not capitalized
every glide precedes the vowel
the letter or that precedes that letter five is a vowel
letter three is unique
the penultimate letter is not a vowel
no consonant is unique
there is at least one glide
only letter two is unique
all mirrored letters are not capitalized
some unique letter is capitalized
there is at least one vowel
more than one p is mirrored
every even letter is a vowel
a letter three follows some the unique consonant
only letter two immediately precedes a vowel
every penultimate letter is a vowel
there is some glide that follows letter two
no unique vowel is centered
there is more than one vowel
at least one letter all most glides
every consonant is penultimate or consonantal
every vowel is unique and final
letter one is unique
the letter one is all voiced four more k i only
letter or two centered or repeated
more than one vowel is capital
no consonant immediately follows vowels
some