# Get connection to the local sqlite database to prep for creating dataframe.


In [4]:
import sqlite3, os

con = sqlite3.connect("./Helper/data/wordsdb.db")
cur = con.cursor()
for row in cur.execute('SELECT * FROM words limit 5'):
    print(row)

(0, "'ALLO", 'AA L OW', 2, 'sp')
(1, "'BOUT", 'B AW T', 1, 'p')
(2, "'CAUSE", 'K AH Z', 1, 'u')
(3, "'COURSE", 'K AO R S', 1, 'p')
(4, "'CUSE", 'K Y UW Z', 1, 'p')


# Now to create the DataFrame object for us in analysis

In [5]:
import pandas as pd

df = pd.read_sql_query("SELECT * FROM words", con)
df.head()

Unnamed: 0,index,WORD,PRONUNCIATION,SYLLABLES,SCANSION
0,0,'ALLO,AA L OW,2,sp
1,1,'BOUT,B AW T,1,p
2,2,'CAUSE,K AH Z,1,u
3,3,'COURSE,K AO R S,1,p
4,4,'CUSE,K Y UW Z,1,p


In [6]:
df.count()

index            134316
WORD             134316
PRONUNCIATION    134316
SYLLABLES        134316
SCANSION         134316
dtype: int64

## Avenues for analysis:
- Count of words 
    - total 
    - per starting letter 
    - per syllable count
    - per rhyme
    - per scansion foot pattern).
- Most common rhymes: for each number of syllables match.. i.e. last syllable, last 2 syllables etc..
- Occurence of stress patterns (by common feet used in poetry)

In [7]:
df.nunique()

index            134316
WORD             134316
PRONUNCIATION    114157
SYLLABLES            11
SCANSION            287
dtype: int64

We can see there are 134316 distinct entries in the database.

It appears we have just under 20,000 words with matching definitions. Finding which pronunciation(s) is most common would be interesting to see.

There are 11 different syllable counts, what words(s) has the highest syllable count?

In [8]:
df.loc[df['SYLLABLES'].idxmax()]

index                                                       117251
WORD                            SUPERCALIFRAGILISTICEXPIALIDOCIOUS
PRONUNCIATION    S UW P ER K AE L AH F R AE JH AH L IH S T IH K...
SYLLABLES                                                       14
SCANSION                                            susupusususupu
Name: 117251, dtype: object

In [9]:
# Do any other words match Supercalifragilisticexpialidocious's syllable count?
# df.dtypes

filter = df['SYLLABLES']
df.loc[filter == 14]

Unnamed: 0,index,WORD,PRONUNCIATION,SYLLABLES,SCANSION
117251,117251,SUPERCALIFRAGILISTICEXPIALIDOCIOUS,S UW P ER K AE L AH F R AE JH AH L IH S T IH K...,14,susupusususupu


## Let's start getting the count of various things

In [10]:
word_count = df['WORD'].count()
word_count

134316

In [11]:
unique_starting_characters = set(word[0] for word in df['WORD'])
unique_starting_characters

{"'",
 '3',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z'}

In [12]:
# There's a word that starts with three?!
df.loc[df['WORD'].str.startswith('3')]

Unnamed: 0,index,WORD,PRONUNCIATION,SYLLABLES,SCANSION
16,16,3-D,TH R IY D IY,2,ps
17,17,3D,TH R IY D IY,2,ps


That checks out. And look, a trochee (two stressed syllables in a row)!

In [13]:
# Finding the count of each starting letter. 
grouped_by_starting_letter =  df.groupby(df.WORD.str[0])
grouped_by_starting_letter.size()

WORD
'       16
3        2
A     7326
B     9749
C    10776
D     7756
E     4760
F     5240
G     5731
H     6457
I     3406
J     1677
K     4166
L     5524
M     9563
N     3220
O     2985
P     8242
Q      463
R     7311
S    14011
T     5660
U     1805
V     2341
W     4400
X       80
Y      731
Z      918
dtype: int64

In [82]:
grouped_by_syllable_count = df.groupby(df['SYLLABLES'])
grouped_by_syllable_count.size()

SYLLABLES
1     16974
2     61626
3     37259
4     13568
5      3916
6       822
7       131
8        16
9         2
12        1
14        1
dtype: int64

so that's why two-syllable words take longer in the app.

## Analysis of rhyme occurrence

As we can see, there are 11 different syllable counts. For this we'll need a function that finds each unique 1 syllable rhyme, 2 syllable rhyme etc... Then we can use that list to get counts of how many times the rhyme occurs.

Luckily, the app has a way to find these rhymes, finding all of the possibilities will require a few steps.

Tentative steps to take:
- get list of unique syllable counts
- For each number of syllables trying to match, return a DataFrame of all the words with that many, or more, syllables.
- Of those words, see which end with the same rhyme pattern.

In [15]:
# the relevant functions from Poetry and Lyric Helper

# def syllables_to_list(word_object: words) -> list:
#     """convert syllables of a word to a list of syllables to use for matching rhymes"""
#     pronunciation = word_object.PRONUNCIATION.split()
#     return pronunciation


# def syllable_to_match(pronunciation_list: list) -> str:
#     """Parses syllables list to find last syllable"""
#     rhyme = ''
#     i = len(pronunciation_list) - 1
#     while i >= 0:
#         if pronunciation_list[i][0] in ['A', 'E', 'I', 'O', 'U']:
#             rhyme = ' '.join(pronunciation_list[i:])
#             # pronunciation_list = pronunciation_list[:i]
#             break
#         else:
#             i -= 1
#     return rhyme


# def match_syllable(word_object: words, syllable: str, syllable_count_matches: list) -> list:
#     # results = words.query.filter(words.PRONUNCIATION.endswith(syllable), words.WORD != word_object.WORD).all()
#     results = [word for word in syllable_count_matches if
#                word.PRONUNCIATION.endswith(syllable) and word.WORD != word_object.WORD]
#     return  results


# def get_rhyme_dict(word_object: words, syllable_count_matches: list) -> dict:
#     """Given a word, return a dictionary with number of syllables rhymed as the key
#     and matching words as values"""
#     syllable_count = word_object.SYLLABLES
#     pronunciation_list = syllables_to_list(word_object)
#     rhyme = ''
#     results_dict = {}
#     i = 0
#     while i < syllable_count:
#         if i == 0:
#             temp = syllable_to_match(pronunciation_list)
#             rhyme = temp
#             results_dict[i+1] = match_syllable(word_object, rhyme, syllable_count_matches)
#             num_indexes_to_remove = len(temp.split())
#             pronunciation_list = pronunciation_list[:-num_indexes_to_remove]
#             i += 1
#         else:
#             temp = syllable_to_match(pronunciation_list)
#             rhyme = temp + ' ' + rhyme
#             value_list = match_syllable(word_object, rhyme, syllable_count_matches)
#             for word in value_list:
#                 if word in results_dict[i]:
#                     results_dict[i].remove(word)
#                 else:
#                     value_list.append(word)
#             results_dict[i+1] = value_list
#             if len(results_dict[i+1]) == 0:
#                 results_dict.pop(i+1)
#                 break
#             num_indexes_to_remove = len(temp.split())
#             pronunciation_list = pronunciation_list[:-num_indexes_to_remove]
#             i += 1
#     return results_dict


# def get_close_matches_rhyme(word_object: words, syllable_count_matches: list) -> dict:
#     """Given a word, searches the database and returns a dict of word objects where
#     rhyme matches are found at various syllable counts."""
#     rhyme_matches = get_rhyme_dict(word_object, syllable_count_matches)
#     close_matches_rhymes = {}
#     # for num in range(word_object.SYLLABLES): # number of syllables
#     for num in range(len(list(rhyme_matches.keys()))): # number of keys, to avoid out of range when matches weren't found.
#         syllable_match_list = details_list_to_word_list(syllable_count_matches)
#         rhyme_list = details_list_to_word_list(rhyme_matches[num + 1])
#         close_matches_rhymes[num+1] = [word for word in syllable_match_list if word in rhyme_list]
#         if len(close_matches_rhymes[num+1]) == 0:
#             close_matches_rhymes.pop(num+1)
#     return close_matches_rhymes

### First order of business is to get the list of all possible rhymes across the range of syllables to be matched.

In [16]:
syllable_count = sorted(df.SYLLABLES.unique())
syllable_count

[1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 14]

Then create a function to create a DataFrame with only that syllable count or higher.

In [17]:
def get_syllable_match_df(num_of_syllables):
    return df.loc[df['SYLLABLES'] >= num_of_syllables]

get_syllable_match_df(5).head()

Unnamed: 0,index,WORD,PRONUNCIATION,SYLLABLES,SCANSION
26,26,A42128,EY F AO R T UW W AH N T UW EY T,6,pppppp
96,96,ABBATIELLO,AA B AA T IY EH L OW,5,uuupu
114,114,ABBREVIATED,AH B R IY V IY EY T IH D,5,upusu
116,116,ABBREVIATING,AH B R IY V IY EY T IH NG,5,upusu
117,117,ABBREVIATION,AH B R IY V IY EY SH AH N,5,usupu


Actually, we can probably use Pandas groups for matching rhymes & counts down the road.

In [19]:
grouped_by_syllable_count_size = df.groupby(df['SYLLABLES'])
grouped_by_syllable_count_size

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1261b1ff0>

Groups may be superfluous.

** let's return dataframes with >= number of syllables. Then parse the same number of syllables as the selected number. 
- first find all single syllable rhyme patterns.
- then find all second syllable rhyme patterns, instead of looping through like we're trying to find various syllable matches like in the app.
- return all those rhyme patterns to a list, then use that list to find words that str.endswith(pattern)

In [99]:
def get_pronunciation_string(word_row) -> str:
    """Convert word row to just a string of its pronunciation"""
    return word_row['PRONUNCIATION'].item()

def syllables_to_list(word: str) -> list:
    """convert syllables of a word to a list of syllables to use for matching rhymes"""
    pronunciation = word.split()
    return pronunciation

test = df.loc[df['WORD'] == 'BRETHREN']
print(type(test))
# print(test)
pronunciation = test['PRONUNCIATION'].item()
print(pronunciation)
pronunciation = get_pronunciation_string(test)
print(syllables_to_list(pronunciation))

<class 'pandas.core.frame.DataFrame'>
B R EH DH R AH N
['B', 'R', 'EH', 'DH', 'R', 'AH', 'N']


Now we have functions to;
- return a DataFrame with words matching syllable counts we need.
- return a row's pronunciation
- return a that pronunciation as a list.

From here we can write the function to find the rhyme pattern we need.

Then use that pattern to match rows in the DataFrame.

** a potential problem with going the route I am is that if there are any words of higher syllable counts that end with a sound not found in any of the lower syllable count words, the final analysis may not be accurate. Something to think about, thought I may be overthinking it. 

** Nevermind, I will just let the functions take the time to parse the last syllable of higher count words as well, instead of just parsing only words matching the syllable count passed in to syllable_match function.

In [62]:
# since we're not always taking the last syllable, we'll also pass in a number of syllables to grab
# so it will be slightly different than the code commented out above.

def syllable_to_match(pronunciation_list: list, num_of_syllables: int) -> str:
    """Parses a pronunciation list """
    rhyme = ''
    syllables_found = 0
    i = len(pronunciation_list) - 1
    while i >= 0 and syllables_found < num_of_syllables:
        if pronunciation_list[i][0] in ['A', 'E', 'I', 'O', 'U']:
            syllables_found += 1
            i -= 1
            if syllables_found == num_of_syllables:
                rhyme = ' '.join(pronunciation_list[i+1:])
                break
        else:
            i -= 1
    return rhyme

In [113]:
test_word = df.loc[df['WORD'] == 'SUPERCALIFRAGILISTICEXPIALIDOCIOUS']
test_word = get_pronunciation_string(test_word)
test_word = syllables_to_list(test_word)
# print(test_word)

# let's see how it works

print(syllable_to_match(test_word, 14))

UW P ER K AE L AH F R AE JH AH L IH S T IH K EH K S P IY AE L AH D OW SH AH S


Almost all the pieces. Now to write a function that returns a list of rhyme patterns found across syllable counts.

In [87]:
syllable_count_list = list(grouped_by_syllable_count.groups.keys())
print(syllable_count_list)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 14]


In [122]:
def get_list_of_rhyme_patterns(dataframe, syllable_cnt_list) -> set:
    output = set()
    for count in syllable_cnt_list:
        syl_df = get_syllable_match_df(count)
        for i in range(len(syl_df)):
            row = syl_df.iloc[[i]]
            pronunciation = get_pronunciation_string(row)
            pronunciation = syllables_to_list(pronunciation)
            rhyme_pattern = syllable_to_match(pronunciation, count)
            output.add(rhyme_pattern)
    return output

test_lst = [9, 14]
test_rhyme_pattern_list = get_list_of_rhyme_patterns(df, test_lst)
print(test_rhyme_pattern_list)

{'EH K S T R AH T EH R AH T AO R IY AE L AH T IY', 'UW P ER K AE L AH F R AE JH AH L IH S T IH K EH K S P IY AE L AH D OW SH AH S', 'AH S T AE B L IH SH M AH N T EH R IY AH N IH Z AH M', 'AH L IH S T IH K EH K S P IY AE L AH D OW SH AH S', 'IY IH N S T IH T UW SH AH N AH L AH Z EY SH AH N'}


Let's make the complete set of rhyme patterns found in our database!

In [125]:
rhyme_patterns = get_list_of_rhyme_patterns(df, syllable_count_list)

In [126]:
print(type(rhyme_patterns))
print(len(rhyme_patterns))

<class 'set'>
96360


96360 different rhyme patterns cumulatively found across the various syllable counts! That's quite high.

If we wanted to see the most common syllable matches for each syllable count, we could return the rhyme_patterns as a dict instead of a set. It may be worthwhile to do that, just for easily analysis later.

In [127]:
def get_list_of_rhyme_patterns(dataframe, syllable_cnt_list) -> dict:
    output = {}
    for count in syllable_cnt_list:
        value_set = set()
        syl_df = get_syllable_match_df(count)
        for i in range(len(syl_df)):
            row = syl_df.iloc[[i]]
            pronunciation = get_pronunciation_string(row)
            pronunciation = syllables_to_list(pronunciation)
            rhyme_pattern = syllable_to_match(pronunciation, count)
            value_set.add(rhyme_pattern)
        output[count] = sorted(value_set)
    return output

test_lst = [9, 14]
test_rhyme_pattern_list = get_list_of_rhyme_patterns(df, test_lst)
print(test_rhyme_pattern_list)

{9: ['AH L IH S T IH K EH K S P IY AE L AH D OW SH AH S', 'AH S T AE B L IH SH M AH N T EH R IY AH N IH Z AH M', 'EH K S T R AH T EH R AH T AO R IY AE L AH T IY', 'IY IH N S T IH T UW SH AH N AH L AH Z EY SH AH N'], 14: ['UW P ER K AE L AH F R AE JH AH L IH S T IH K EH K S P IY AE L AH D OW SH AH S']}


That looks much better! And it will make it easier to analyze later.

In [128]:
rhyme_patterns = get_list_of_rhyme_patterns(df, syllable_count_list)

In [132]:
print(rhyme_patterns.keys())
sum(len(lst) for dct in rhyme_patterns.values() for lst in dct.values())
se

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 14])


AttributeError: 'list' object has no attribute 'values'