# Get connection to the local sqlite database to prep for creating dataframe.


In [1]:
import sqlite3, os

con = sqlite3.connect("./Helper/data/wordsdb.db")
cur = con.cursor()
for row in cur.execute('SELECT * FROM words limit 5'):
    print(row)

(0, "'ALLO", 'AA L OW', 2, 'sp')
(1, "'BOUT", 'B AW T', 1, 'p')
(2, "'CAUSE", 'K AH Z', 1, 'u')
(3, "'COURSE", 'K AO R S', 1, 'p')
(4, "'CUSE", 'K Y UW Z', 1, 'p')


# Now to create the DataFrame object for us in analysis

In [2]:
import pandas as pd

df = pd.read_sql_query("SELECT * FROM words", con)
df.head()

Unnamed: 0,index,WORD,PRONUNCIATION,SYLLABLES,SCANSION
0,0,'ALLO,AA L OW,2,sp
1,1,'BOUT,B AW T,1,p
2,2,'CAUSE,K AH Z,1,u
3,3,'COURSE,K AO R S,1,p
4,4,'CUSE,K Y UW Z,1,p


In [3]:
df.count()

index            134316
WORD             134316
PRONUNCIATION    134316
SYLLABLES        134316
SCANSION         134316
dtype: int64

## Avenues for analysis:
- Count of words 
    - total 
    - per starting letter 
    - per syllable count
    - per rhyme
    - per scansion foot pattern).
- Most common rhymes: for each number of syllables match.. i.e. last syllable, last 2 syllables etc..
- Occurence of stress patterns (by common feet used in poetry)

In [4]:
df.nunique()

index            134316
WORD             134316
PRONUNCIATION    114157
SYLLABLES            11
SCANSION            287
dtype: int64

We can see there are 134316 distinct entries in the database.

It appears we have just under 20,000 words with matching definitions. Finding which pronunciation(s) is most common would be interesting to see.

There are 11 different syllable counts, what words(s) has the highest syllable count?

In [5]:
df.loc[df['SYLLABLES'].idxmax()]

index                                                       117251
WORD                            SUPERCALIFRAGILISTICEXPIALIDOCIOUS
PRONUNCIATION    S UW P ER K AE L AH F R AE JH AH L IH S T IH K...
SYLLABLES                                                       14
SCANSION                                            susupusususupu
Name: 117251, dtype: object

In [6]:
# Do any other words match Supercalifragilisticexpialidocious's syllable count?
# df.dtypes

filter = df['SYLLABLES']
df.loc[filter == 14]

Unnamed: 0,index,WORD,PRONUNCIATION,SYLLABLES,SCANSION
117251,117251,SUPERCALIFRAGILISTICEXPIALIDOCIOUS,S UW P ER K AE L AH F R AE JH AH L IH S T IH K...,14,susupusususupu


## Let's start getting the count of various things

In [7]:
word_count = df['WORD'].count()
word_count

134316

In [8]:
unique_starting_characters = set(word[0] for word in df['WORD'])
unique_starting_characters

{"'",
 '3',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z'}

In [9]:
# There's a word that starts with three?!
df.loc[df['WORD'].str.startswith('3')]

Unnamed: 0,index,WORD,PRONUNCIATION,SYLLABLES,SCANSION
16,16,3-D,TH R IY D IY,2,ps
17,17,3D,TH R IY D IY,2,ps


That checks out. And look, a trochee (two stressed syllables in a row)!

In [10]:
# Finding the count of each starting letter. 
grouped_by_starting_letter =  df.groupby(df.WORD.str[0])
grouped_by_starting_letter.size()

WORD
'       16
3        2
A     7326
B     9749
C    10776
D     7756
E     4760
F     5240
G     5731
H     6457
I     3406
J     1677
K     4166
L     5524
M     9563
N     3220
O     2985
P     8242
Q      463
R     7311
S    14011
T     5660
U     1805
V     2341
W     4400
X       80
Y      731
Z      918
dtype: int64

In [11]:
grouped_by_syllable_count = df.groupby(df['SYLLABLES'])
grouped_by_syllable_count.size()

SYLLABLES
1     16974
2     61626
3     37259
4     13568
5      3916
6       822
7       131
8        16
9         2
12        1
14        1
dtype: int64

so that's why two-syllable words take longer in the app.

## Analysis of rhyme occurrence

As we can see, there are 11 different syllable counts. For this we'll need a function that finds each unique 1 syllable rhyme, 2 syllable rhyme etc... Then we can use that list to get counts of how many times the rhyme occurs.

Luckily, the app has a way to find these rhymes, finding all of the possibilities will require a few steps.

Tentative steps to take:
- get list of unique syllable counts
- For each number of syllables trying to match, return a DataFrame of all the words with that many, or more, syllables.
- Of those words, see which end with the same rhyme pattern.

In [15]:
# the relevant functions from Poetry and Lyric Helper

# def syllables_to_list(word_object: words) -> list:
#     """convert syllables of a word to a list of syllables to use for matching rhymes"""
#     pronunciation = word_object.PRONUNCIATION.split()
#     return pronunciation


# def syllable_to_match(pronunciation_list: list) -> str:
#     """Parses syllables list to find last syllable"""
#     rhyme = ''
#     i = len(pronunciation_list) - 1
#     while i >= 0:
#         if pronunciation_list[i][0] in ['A', 'E', 'I', 'O', 'U']:
#             rhyme = ' '.join(pronunciation_list[i:])
#             # pronunciation_list = pronunciation_list[:i]
#             break
#         else:
#             i -= 1
#     return rhyme


# def match_syllable(word_object: words, syllable: str, syllable_count_matches: list) -> list:
#     # results = words.query.filter(words.PRONUNCIATION.endswith(syllable), words.WORD != word_object.WORD).all()
#     results = [word for word in syllable_count_matches if
#                word.PRONUNCIATION.endswith(syllable) and word.WORD != word_object.WORD]
#     return  results


# def get_rhyme_dict(word_object: words, syllable_count_matches: list) -> dict:
#     """Given a word, return a dictionary with number of syllables rhymed as the key
#     and matching words as values"""
#     syllable_count = word_object.SYLLABLES
#     pronunciation_list = syllables_to_list(word_object)
#     rhyme = ''
#     results_dict = {}
#     i = 0
#     while i < syllable_count:
#         if i == 0:
#             temp = syllable_to_match(pronunciation_list)
#             rhyme = temp
#             results_dict[i+1] = match_syllable(word_object, rhyme, syllable_count_matches)
#             num_indexes_to_remove = len(temp.split())
#             pronunciation_list = pronunciation_list[:-num_indexes_to_remove]
#             i += 1
#         else:
#             temp = syllable_to_match(pronunciation_list)
#             rhyme = temp + ' ' + rhyme
#             value_list = match_syllable(word_object, rhyme, syllable_count_matches)
#             for word in value_list:
#                 if word in results_dict[i]:
#                     results_dict[i].remove(word)
#                 else:
#                     value_list.append(word)
#             results_dict[i+1] = value_list
#             if len(results_dict[i+1]) == 0:
#                 results_dict.pop(i+1)
#                 break
#             num_indexes_to_remove = len(temp.split())
#             pronunciation_list = pronunciation_list[:-num_indexes_to_remove]
#             i += 1
#     return results_dict


# def get_close_matches_rhyme(word_object: words, syllable_count_matches: list) -> dict:
#     """Given a word, searches the database and returns a dict of word objects where
#     rhyme matches are found at various syllable counts."""
#     rhyme_matches = get_rhyme_dict(word_object, syllable_count_matches)
#     close_matches_rhymes = {}
#     # for num in range(word_object.SYLLABLES): # number of syllables
#     for num in range(len(list(rhyme_matches.keys()))): # number of keys, to avoid out of range when matches weren't found.
#         syllable_match_list = details_list_to_word_list(syllable_count_matches)
#         rhyme_list = details_list_to_word_list(rhyme_matches[num + 1])
#         close_matches_rhymes[num+1] = [word for word in syllable_match_list if word in rhyme_list]
#         if len(close_matches_rhymes[num+1]) == 0:
#             close_matches_rhymes.pop(num+1)
#     return close_matches_rhymes

### First order of business is to get the list of all possible rhymes across the range of syllables to be matched.

In [12]:
syllable_count = sorted(df.SYLLABLES.unique())
syllable_count

[1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 14]

Then create a function to create a DataFrame with only that syllable count or higher.

In [13]:
def get_syllable_match_df(num_of_syllables):
    return df.loc[df['SYLLABLES'] >= num_of_syllables]

get_syllable_match_df(5).head()

Unnamed: 0,index,WORD,PRONUNCIATION,SYLLABLES,SCANSION
26,26,A42128,EY F AO R T UW W AH N T UW EY T,6,pppppp
96,96,ABBATIELLO,AA B AA T IY EH L OW,5,uuupu
114,114,ABBREVIATED,AH B R IY V IY EY T IH D,5,upusu
116,116,ABBREVIATING,AH B R IY V IY EY T IH NG,5,upusu
117,117,ABBREVIATION,AH B R IY V IY EY SH AH N,5,usupu


Actually, we can probably use Pandas groups for matching rhymes & counts down the road.

In [14]:
grouped_by_syllable_count_size = df.groupby(df['SYLLABLES'])
grouped_by_syllable_count_size

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x10eb6d1e0>

Groups may be superfluous.

** let's return dataframes with >= number of syllables. Then parse the same number of syllables as the selected number. 
- first find all single syllable rhyme patterns.
- then find all second syllable rhyme patterns, instead of looping through like we're trying to find various syllable matches like in the app.
- return all those rhyme patterns to a list, then use that list to find words that str.endswith(pattern)

In [15]:
def get_pronunciation_string(word_row) -> str:
    """Convert word row to just a string of its pronunciation"""
    return word_row['PRONUNCIATION'].item()

def syllables_to_list(word: str) -> list:
    """convert syllables of a word to a list of syllables to use for matching rhymes"""
    pronunciation = word.split()
    return pronunciation

test = df.loc[df['WORD'] == 'BRETHREN']
print(type(test))
# print(test)
pronunciation = test['PRONUNCIATION'].item()
print(pronunciation)
pronunciation = get_pronunciation_string(test)
print(syllables_to_list(pronunciation))

<class 'pandas.core.frame.DataFrame'>
B R EH DH R AH N
['B', 'R', 'EH', 'DH', 'R', 'AH', 'N']


Now we have functions to;
- return a DataFrame with words matching syllable counts we need.
- return a row's pronunciation
- return a that pronunciation as a list.

From here we can write the function to find the rhyme pattern we need.

Then use that pattern to match rows in the DataFrame.

** a potential problem with going the route I am is that if there are any words of higher syllable counts that end with a sound not found in any of the lower syllable count words, the final analysis may not be accurate. Something to think about, thought I may be overthinking it. 

** Nevermind, I will just let the functions take the time to parse the last syllable of higher count words as well, instead of just parsing only words matching the syllable count passed in to syllable_match function.

In [16]:
# since we're not always taking the last syllable, we'll also pass in a number of syllables to grab
# so it will be slightly different than the code commented out above.

def syllable_to_match(pronunciation_list: list, num_of_syllables: int) -> str:
    """Parses a pronunciation list """
    rhyme = ''
    syllables_found = 0
    i = len(pronunciation_list) - 1
    while i >= 0 and syllables_found < num_of_syllables:
        if pronunciation_list[i][0] in ['A', 'E', 'I', 'O', 'U']:
            syllables_found += 1
            i -= 1
            if syllables_found == num_of_syllables:
                rhyme = ' '.join(pronunciation_list[i+1:])
                break
        else:
            i -= 1
    return rhyme

In [17]:
test_word = df.loc[df['WORD'] == 'SUPERCALIFRAGILISTICEXPIALIDOCIOUS']
test_word = get_pronunciation_string(test_word)
test_word = syllables_to_list(test_word)
# print(test_word)

# let's see how it works

print(syllable_to_match(test_word, 14))

UW P ER K AE L AH F R AE JH AH L IH S T IH K EH K S P IY AE L AH D OW SH AH S


Almost all the pieces. Now to write a function that returns a list of rhyme patterns found across syllable counts.

In [18]:
syllable_count_list = list(grouped_by_syllable_count.groups.keys())
print(syllable_count_list)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 14]


In [19]:
def get_list_of_rhyme_patterns(dataframe, syllable_cnt_list) -> set:
    output = set()
    for count in syllable_cnt_list:
        syl_df = get_syllable_match_df(count)
        for i in range(len(syl_df)):
            row = syl_df.iloc[[i]]
            pronunciation = get_pronunciation_string(row)
            pronunciation = syllables_to_list(pronunciation)
            rhyme_pattern = syllable_to_match(pronunciation, count)
            output.add(rhyme_pattern)
    return output

test_lst = [9, 14]
test_rhyme_pattern_list = get_list_of_rhyme_patterns(df, test_lst)
print(test_rhyme_pattern_list)

{'UW P ER K AE L AH F R AE JH AH L IH S T IH K EH K S P IY AE L AH D OW SH AH S', 'AH S T AE B L IH SH M AH N T EH R IY AH N IH Z AH M', 'IY IH N S T IH T UW SH AH N AH L AH Z EY SH AH N', 'AH L IH S T IH K EH K S P IY AE L AH D OW SH AH S', 'EH K S T R AH T EH R AH T AO R IY AE L AH T IY'}


Let's make the complete set of rhyme patterns found in our database!

In [20]:
rhyme_patterns = get_list_of_rhyme_patterns(df, syllable_count_list)

In [22]:
print(type(rhyme_patterns))
print(len(rhyme_patterns))

<class 'set'>
96360


96360 different rhyme patterns cumulatively found across the various syllable counts! That's quite high.

If we wanted to see the most common syllable matches for each syllable count, we could return the rhyme_patterns as a dict instead of a set. It may be worthwhile to do that, just for easily analysis later.

In [23]:
def get_list_of_rhyme_patterns(dataframe, syllable_cnt_list) -> dict:
    output = {}
    for count in syllable_cnt_list:
        value_set = set()
        syl_df = get_syllable_match_df(count)
        for i in range(len(syl_df)):
            row = syl_df.iloc[[i]]
            pronunciation = get_pronunciation_string(row)
            pronunciation = syllables_to_list(pronunciation)
            rhyme_pattern = syllable_to_match(pronunciation, count)
            value_set.add(rhyme_pattern)
        output[count] = sorted(value_set)
    return output

test_lst = [9, 14]
test_rhyme_pattern_list = get_list_of_rhyme_patterns(df, test_lst)
print(test_rhyme_pattern_list)

{9: ['AH L IH S T IH K EH K S P IY AE L AH D OW SH AH S', 'AH S T AE B L IH SH M AH N T EH R IY AH N IH Z AH M', 'EH K S T R AH T EH R AH T AO R IY AE L AH T IY', 'IY IH N S T IH T UW SH AH N AH L AH Z EY SH AH N'], 14: ['UW P ER K AE L AH F R AE JH AH L IH S T IH K EH K S P IY AE L AH D OW SH AH S']}


That looks much better! And it will make it easier to analyze later.

In [24]:
rhyme_patterns = get_list_of_rhyme_patterns(df, syllable_count_list)

In [25]:
# our keys should match the syllable count list
print(rhyme_patterns.keys())

# let's make sure we returned the same number of patterns.
sum(len(lst) for lst in rhyme_patterns.values())

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 14])


96360

Now to get the count of rhymes per pattern so we can see which rhyme patterns are most common.

The question is whether to only match words with the same number of syllables or the same or higher number of syllables.

Or, perhaps both.

In [26]:
def get_exact_syllable_match_df(num_of_syllables):
    return df.loc[df['SYLLABLES'] == num_of_syllables]

In [131]:
def get_rhyme_counts(rhyme_patterns, dataframe) -> dict:
    output_dict = {}
    for key in rhyme_patterns:
        df = get_exact_syllable_match_df(key)
        output_dict[key] = []
        for pattern in rhyme_patterns[key]:
            number_of_matches = len(df.loc[df['PRONUNCIATION'].str.endswith(pattern) | df['PRONUNCIATION'].str.fullmatch(pattern)].index)
            temp_tuple = (pattern, number_of_matches)
            output_dict[key].append(temp_tuple)
    return output_dict

In [29]:
for key in rhyme_patterns:
    print(len(rhyme_patterns[key]))

1397
34930
38724
15728
4488
925
141
20
4
2
1


In [132]:
# quick check for the above function
df.loc[df['PRONUNCIATION'].str.endswith('AE L AH T IY')]
# len(df.index)

Unnamed: 0,index,WORD,PRONUNCIATION,SYLLABLES,SCANSION
263,263,ABNORMALITY,AE B N AO R M AE L AH T IY,5,supuu
915,915,ACTUALITY,AE K CH UW AE L AH T IY,5,supus
11867,11867,BISEXUALITY,B AY S EH K SH UW AE L AH T IY,6,suupuu
15777,15777,BRUTALITY,B R UW T AE L AH T IY,4,upuu
23499,23499,COMMONALITY,K AA M AH N AE L AH T IY,5,supuu
24122,24122,CONFIDENTIALITY,K AA N F AH D EH N SH IY AE L AH T IY,7,susupuu
24209,24209,CONGENIALITY,K AH N JH IY N IY AE L AH T IY,6,usupuu
26785,26785,CRITICALITY,K R IH T AH K AE L AH T IY,5,supuu
32107,32107,DIMENSIONALITY,D IH M EH N SH AH N AE L AH T IY,6,usupuu
34854,34854,DUALITY,D UW AE L AH T IY,4,upuu


In [133]:
# test_dict = get_rhyme_counts(rhyme_patterns, df)
exact_dict = get_rhyme_counts(rhyme_patterns, df)
print(exact_dict.keys())

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 14])


In [112]:
print(test_dict)

{1: [('EY', 198), ('OW', 198), ('UW', 198), ('IY', 178), ('IY Z', 156), ('AY', 149), ('AA', 135), ('EH', 134), ('EY N', 131), ('UW Z', 128), ('EY Z', 123), ('AA R', 119), ('AO R', 118), ('EH R', 116), ('AA K', 107), ('IH N', 107), ('IY L', 107), ('ER', 106), ('AO', 99), ('AE N', 98), ('EH N', 98), ('AE K', 97), ('AW', 97), ('EH L', 96), ('OW L', 96), ('EY L', 92), ('OW Z', 90), ('AA N', 84), ('AE M', 81), ('IY N', 81), ('AE', 77), ('AY Z', 77), ('EH K', 77), ('EY N Z', 77), ('AE K S', 76), ('OW N', 76), ('AO L', 75), ('IH K', 75), ('IH L', 74), ('IH R', 74), ('EY L Z', 73), ('IH T S', 70), ('IY D', 69), ('AA T', 68), ('IY K', 68), ('AA T S', 67), ('AH M', 66), ('EY D', 66), ('IY T S', 66), ('AA L', 65), ('OW L Z', 65), ('AY N', 64), ('EY T', 64), ('EH R Z', 63), ('IH', 62), ('IY S', 62), ('AA P', 61), ('AE N Z', 61), ('IY T', 61), ('OW K', 61), ('AE P', 59), ('AH N', 59), ('AE S', 57), ('IH NG K', 57), ('IH T', 57), ('AA M', 56), ('AY T', 56), ('IH P', 56), ('UW L', 56), ('AE T', 55), 

In [134]:
# print(test_dict[8])
test_df = get_exact_syllable_match_df(8)
test_df.head()
# test_df.loc[test_df['PRONUNCIATION'].str.endswith('N S T IH T UW SH AH N AH L AH Z EY SH AH N') | df['PRONUNCIATION'] == 'IH N S T IH T UW SH AH N AH L AH Z EY SH AH N']

Unnamed: 0,index,WORD,PRONUNCIATION,SYLLABLES,SCANSION
6892,6892,AUTHORITARIANISM,AH TH AO R AH T EH R IY AH N IH Z AH M,8,usupuusu
25938,25938,COUNTERREVOLUTIONARY,K AW N T ER R EH V AH L UW SH AH N EH R IY,8,suuupusu
30524,30524,DEOXYRIBONUCLEIC,D IY AA K S IY R AY B OW N UW K L EY IH K,8,ususuupu
36391,36391,EGALITARIANISM,IY G AE L AH T EH R IY AH N IH Z AH M,8,usupuusu
38345,38345,ENTREPRENEURIALISM,EH N T R AH P R AH N UW R IY AH L IH Z AH M,8,suuspusu


In [72]:
pat = 'AE N AH S TH IY Z IY AA L AH JH IH S'
df.loc[df['PRONUNCIATION'].str.endswith(pat) | df['PRONUNCIATION'].str.match(pat)]

Unnamed: 0,index,WORD,PRONUNCIATION,SYLLABLES,SCANSION
3842,3842,ANAESTHESIOLOGIST,AE N AH S TH IY Z IY AA L AH JH IH S T,7,susupuu
3843,3843,ANAESTHESIOLOGISTS,AE N AH S TH IY Z IY AA L AH JH IH S T S,7,susupuu
3844,3844,ANAESTHESIOLOGISTS(1),AE N AH S TH IY Z IY AA L AH JH IH S,7,susupuu
4089,4089,ANESTHESIOLOGIST,AE N AH S TH IY Z IY AA L AH JH IH S T,7,susupuu
4090,4090,ANESTHESIOLOGISTS,AE N AH S TH IY Z IY AA L AH JH IH S T S,7,susupuu
4091,4091,ANESTHESIOLOGISTS(1),AE N AH S TH IY Z IY AA L AH JH IH S,7,susupuu


In [62]:
test_dict[9]

[('AH L IH S T IH K EH K S P IY AE L AH D OW SH AH S', 0),
 ('AH S T AE B L IH SH M AH N T EH R IY AH N IH Z AH M', 0),
 ('EH K S T R AH T EH R AH T AO R IY AE L AH T IY', 1),
 ('IY IH N S T IH T UW SH AH N AH L AH Z EY SH AH N', 1)]

For this test_dict, there are some tuple with zero matches, this is due to only matching based on number of syllables and not how we assembled the rhyme patterns earlier (pulling regardless of syllable count in case of unique patterns found in longer words). When matching regardless of syllable count, these will at least have a value of 1 for occurrences.

In [68]:
# sort(test_dict[1],key=lambda x : x[1], reverse=True)
test_dict[1].sort(key = lambda x : x[1], reverse = True)
print(test_dict[1])

[('EY', 198), ('OW', 198), ('UW', 198), ('IY', 178), ('IY Z', 156), ('AY', 149), ('AA', 135), ('EH', 134), ('EY N', 131), ('UW Z', 128), ('EY Z', 123), ('AA R', 119), ('AO R', 118), ('EH R', 116), ('AA K', 107), ('IH N', 107), ('IY L', 107), ('ER', 106), ('AO', 99), ('AE N', 98), ('EH N', 98), ('AE K', 97), ('AW', 97), ('EH L', 96), ('OW L', 96), ('EY L', 92), ('OW Z', 90), ('AA N', 84), ('AE M', 81), ('IY N', 81), ('AE', 77), ('AY Z', 77), ('EH K', 77), ('EY N Z', 77), ('AE K S', 76), ('OW N', 76), ('AO L', 75), ('IH K', 75), ('IH L', 74), ('IH R', 74), ('EY L Z', 73), ('IH T S', 70), ('IY D', 69), ('AA T', 68), ('IY K', 68), ('AA T S', 67), ('AH M', 66), ('EY D', 66), ('IY T S', 66), ('AA L', 65), ('OW L Z', 65), ('AY N', 64), ('EY T', 64), ('EH R Z', 63), ('IH', 62), ('IY S', 62), ('AA P', 61), ('AE N Z', 61), ('IY T', 61), ('OW K', 61), ('AE P', 59), ('AH N', 59), ('AE S', 57), ('IH NG K', 57), ('IH T', 57), ('AA M', 56), ('AY T', 56), ('IH P', 56), ('UW L', 56), ('AE T', 55), ('AY

In [136]:
# let's sort those lists!

for key in exact_dict:
    exact_dict[key].sort(key = lambda x:x[1], reverse=True)

print(exact_dict[3])

[('AH L IH N S K IY', 19), ('AO R IY AH', 19), ('EH S AH L ER', 15), ('EH N IH NG ER', 14), ('EH R IY ER', 14), ('AH S IH N S K IY', 13), ('EH R IH NG T AH N', 13), ('IH S AH L ER', 13), ('AY N IH NG ER', 12), ('EH R IY AH N', 12), ('AA L IH S AH N', 11), ('EH R IH K S AH N', 11), ('AH CH IH N S K IY', 10), ('AH T ER IH NG', 10), ('AY D AH L ER', 10), ('AY S IH N JH ER', 10), ('EH L IH NG ER', 10), ('EH N IH S AH N', 10), ('IH L IY AH N', 10), ('OW L IH NG ER', 10), ('AA S AH L ER', 9), ('AE L ER IY', 9), ('AE T ER IH NG', 9), ('AH B IH N S K IY', 9), ('AH JH IH N S K IY', 9), ('AH L IY N AH', 9), ('AH M B AH L IH NG', 9), ('EH R IH NG ER', 9), ('EY Z IY ER', 9), ('IH L IH N S K IY', 9), ('IY AE N AH', 9), ('OW Z IY ER', 9), ('AH K AO F S K IY', 8), ('AW ER IH NG', 8), ('EH M ER IY', 8), ('EH R IH T IY', 8), ('IH L ER IY', 8), ('IH L IH NG ER', 8), ('IH NG G AH L ER', 8), ('IH S IH N JH ER', 8), ('IH T AH K ER', 8), ('AA L AH N D ER', 7), ('AA L IH NG ER', 7), ('AA N IY AH', 7), ('AA R

In [138]:
limit = 10
top_ten_exact_dict = {}
for key in exact_dict:
    top_ten_exact_dict[key]=[]
    if len(exact_dict[key]) >= 10:
        for num in range(limit):
            top_ten_exact_dict[key].append(exact_dict[key][num])
    else:
        for num in range(len(exact_dict[key])):
            top_ten_exact_dict[key].append(exact_dict[key][num])
            
print(top_ten_exact_dict[6])
        

[('AA N AE G R IH K AH L CH ER AH L', 2), ('AA N D IH S K R IH M IH N EY SH AH N', 2), ('AA N IH N F L EY SH AH N EH R IY', 2), ('AA N M AE N Y AH F AE K CH ER IH NG', 2), ('AA R K IY AH L AA JH IH K AH L', 2), ('AH L T R AH N AE SH AH N AH L AH S T S', 2), ('AY OW T EH K N AA L AH JH IY Z', 2), ('EH L F AO R G AH N IH Z EY SH AH N', 2), ('EH N AH F IH SH IY EH R IY Z', 2), ('EH N D OW K R AH N AA L AH JH AH S T S', 2)]


In [139]:
# let's make a dataframe from the results dict

# exact_top_ten_df = pd.concat({k:pd.Series(v) for k, v in top_ten_exact_dict.items()}).unstack().astype(float).sort_index()
# exact_top_ten_df.columns = 'col1  col2  col3'.split()

L = [(k, *t) for k, v in top_ten_exact_dict.items() for t in v]
exact_top_ten_df = pd.DataFrame(L, columns=['Syllable Count','Rhyme Pattern','Occurrences'])

In [140]:
exact_top_ten_df

Unnamed: 0,Syllable Count,Rhyme Pattern,Occurrences
0,1,UW,188
1,1,IY Z,155
2,1,OW,149
3,1,IY,142
4,1,EY,134
...,...,...,...
82,9,AH L IH S T IH K EH K S P IY AE L AH D OW SH AH S,0
83,9,AH S T AE B L IH SH M AH N T EH R IY AH N IH Z...,0
84,12,AE N T AY D IH S AH S T AE B L IH SH M AH N T ...,1
85,12,AE L AH F R AE JH AH L IH S T IH K EH K S P IY...,0


In [141]:
# we know the occurrences with 0s & 1s are either not found or matching themselves. so let's drop those rows.

exact_top_ten_df = exact_top_ten_df[exact_top_ten_df['Occurrences'] > 1]

In [142]:
exact_top_ten_df.tail()
# exact_top_ten_df.head()

Unnamed: 0,Syllable Count,Rhyme Pattern,Occurrences
63,7,AE N AH S TH IY Z IY AA L AH JH IH S T S,2
64,7,AE N AH S TH IY Z IY AA L AH JH IY,2
65,7,EH L AH K AH M Y UW N AH K EY SH AH N Z,2
66,7,EY D IY OW AE K T IH V AH T IY,2
67,7,IY N AE SH AH N AH L IH Z EY SH AH N,2


This dataframe is still not the most accurate. As we saw above, 'ANAESTHESIOLOGIST' has multiple spellings. This does not mean the word is easy to match with. For the longer syllable words I may check manually to see when a word is matching itself / pluralized self.

Let's persist some of this work to avoid having to re-run code during the next exploration.

In [143]:
import pickle

with open(r"Exploration/exactTopTen.pickle", 'wb') as f:
    pickle.dump(exact_top_ten_df, f)

In [144]:
with open(r"Exploration/exactTopTen.pickle", "rb") as f:
    test_pickle = pickle.load(f)
test_pickle

Unnamed: 0,Syllable Count,Rhyme Pattern,Occurrences
0,1,UW,188
1,1,IY Z,155
2,1,OW,149
3,1,IY,142
4,1,EY,134
...,...,...,...
63,7,AE N AH S TH IY Z IY AA L AH JH IH S T S,2
64,7,AE N AH S TH IY Z IY AA L AH JH IY,2
65,7,EH L AH K AH M Y UW N AH K EY SH AH N Z,2
66,7,EY D IY OW AE K T IH V AH T IY,2


In [104]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(test_pickle)

    Syllable Count                             Rhyme Pattern  Occurrences
0                1                                        EY          198
1                1                                        OW          198
2                1                                        UW          198
3                1                                        IY          178
4                1                                      IY Z          156
5                1                                        AY          149
6                1                                        AA          135
7                1                                        EH          134
8                1                                      EY N          131
9                1                                      UW Z          128
10               2                                   EH R IY           97
11               2                                     AY ER           87
12               2                    

In [129]:
def see_pattern_matches(pattern, syl):
    """just for checking our top_ten dict"""
    df = get_exact_syllable_match_df(syl)
    return df.loc[df['PRONUNCIATION'].str.endswith(pattern) | df['PRONUNCIATION'].str.fullmatch(pattern)]


pat = "AY B IH R IY AH"
see_pattern_matches(pat, 4)

Unnamed: 0,index,WORD,PRONUNCIATION,SYLLABLES,SCANSION
57869,57869,IBERIA,AY B IH R IY AH,4,upuu
69967,69967,LIBERIA,L AY B IH R IY AH,4,spsu
110236,110236,SIBERIA,S AY B IH R IY AH,4,upuu


Rows to delete:
