In [None]:
import os
import re
import time
import requests
import itertools
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer

# Functions 

Get rhymes for a word from the Datamuse API.

In [None]:
def get_rhymes(word, timeout=30):
    """Query the Datamuse API for words related to ``word`` via ``rel_rhy``.

    Parameters
    ----------
    word : str
        The word to look up. It is lower-cased before being sent.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang a long scraping loop forever.

    Returns
    -------
    The decoded JSON body of the response. Callers in this notebook treat it
    as a list of dicts that each carry a ``'word'`` key.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when the timeout elapses.
    """
    # Pass the word through `params` so requests URL-encodes it; formatting it
    # straight into the URL breaks for words containing '&', '#', '+', etc.
    response = requests.get(
        "https://api.datamuse.com/words",
        params={"rel_rhy": word.lower()},
        timeout=timeout,
    )
    # Fail loudly on an error status instead of trying to parse an error body.
    response.raise_for_status()

    return response.json()

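Iterate over the words in the dictionary file, fetch the rhymes for each word, and turn every rhyme response into all possible word pairs, stopping once roughly 400,000 rhyming pairs have been collected.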

In [None]:
# Build the positive (rhyming) examples.
# Every non-blank line of en.dict is sent to get_rhymes; the returned words
# plus the query itself are expanded into all unordered pairs. Pairs that come
# from the same query share a rhyme_group_id.
# NOTE(review): assumes en.dict holds one plain word per line — confirm.
rhyme_data = []
rhyme_id = 1        # running id, one per generated pair
rhyme_group_id = 1  # id of the group the *next* successful query will get

# to_pickle / to_csv do not create missing directories themselves.
os.makedirs('data/rhymes', exist_ok=True)

with open("en.dict","r") as dict_file:
    for line in dict_file:
        file_contents = line.strip()
        if file_contents == "":
            # A blank line is not end-of-file: skip it rather than stopping,
            # otherwise everything after it would be silently ignored.
            continue
        rhyme_response = get_rhymes(file_contents)
        if len(rhyme_response) > 0:
            rhyming_words = [rhyme['word'] for rhyme in rhyme_response] + [file_contents]
            # create an entry for all possible rhyme pairs returned
            for rhyme_pair in itertools.combinations(rhyming_words, 2):
                rhyme_data.append(
                    {
                        'rhyme_id': rhyme_id,
                        'rhyme_group_id': rhyme_group_id,
                        'word_a': rhyme_pair[0],
                        'word_b': rhyme_pair[1],
                        'rhyme': 1
                    }
                )
                rhyme_id += 1
            rhyme_group_id += 1
            # Checkpoint only right after the counter advanced. Checking on
            # every iteration would re-pickle the whole frame for each word
            # without rhymes while the counter sits on a multiple of 10.
            if rhyme_group_id % 10 == 0:
                rhyme_df = pd.DataFrame(rhyme_data)
                rhyme_df.to_pickle('data/rhymes/rhyme_df.pkl')
        if len(rhyme_data) > 400000:
            break

# Final checkpoint so the pickle is not up to nine groups behind the CSV.
pd.DataFrame(rhyme_data).to_pickle('data/rhymes/rhyme_df.pkl')

# convert to dataframe
# Explicit columns keep drop_duplicates from raising KeyError when nothing
# was collected (e.g. an empty dictionary file).
rhyme_df = pd.DataFrame(
    rhyme_data,
    columns=['rhyme_id', 'rhyme_group_id', 'word_a', 'word_b', 'rhyme'],
)
rhyme_df = rhyme_df.drop_duplicates(subset=['word_a', 'word_b'], keep='first')
rhyme_df.to_csv('data/rhymes/rhyme_df.csv', index=False)

In [None]:
# Rebuild the de-duplicated CSV, e.g. after the scraping loop was interrupted.
# Prefer the in-memory records when they exist; on a fresh kernel fall back to
# the checkpoint pickle. Previously the pickle was read and then immediately
# overwritten from rhyme_data, so the checkpoint was never actually used and
# the cell raised NameError after a kernel restart.
try:
    rhyme_df = pd.DataFrame(rhyme_data)
except NameError:
    # Only load pickles you created yourself: unpickling can execute code.
    rhyme_df = pd.read_pickle("data/rhymes/rhyme_df.pkl")
rhyme_df = rhyme_df.drop_duplicates(subset=['word_a', 'word_b'], keep='first')
rhyme_df.to_csv('data/rhymes/rhyme_df.csv', index=False)

In [None]:
# Reload the rhyme pairs from disk and show a few pairs of one example group.
# keep_default_na=False stops pandas from turning real words such as "null"
# or "nan" into missing values while parsing the CSV.
rhyme_df = pd.read_csv('data/rhymes/rhyme_df.csv', keep_default_na=False)
example_group = rhyme_df.loc[rhyme_df['rhyme_group_id']==261]
# Cap the sample size: sample(5) raises when the group holds fewer than 5 rows.
example_group.sample(min(5, len(example_group))).reset_index(drop=True)

Repeat the process for non-rhyming words to get another 400,000 pairs. When two words are paired at random there is a small chance that they actually rhyme, but we consider that chance low enough that it shouldn't matter.

In [None]:
# Build the negative (non-rhyming) examples: keep every word_a, but replace
# the word_b values of each rhyme group with words drawn from other groups.
NON_RHYME_SEED = 42  # base seed so the generated pairs are reproducible

non_rhyme_df = rhyme_df.copy()
group_ids = rhyme_df['rhyme_group_id']
for rhyme_group in list(group_ids.drop_duplicates()):

    in_group = group_ids == rhyme_group
    pairs_in_group = int(in_group.sum())

    # Draw from the untouched rhyme_df, not from non_rhyme_df: groups that
    # were already shuffled may hold words that originally belonged to this
    # very group, which would re-create true rhymes labelled as 0.
    candidate_words = rhyme_df.loc[~in_group, 'word_b']

    other_rhyme_samples = list(
        candidate_words.sample(
            pairs_in_group,
            # Only sample with replacement when this group is larger than
            # everything else combined; otherwise sample() would raise.
            replace=pairs_in_group > len(candidate_words),
            # A distinct but deterministic seed per group.
            random_state=NON_RHYME_SEED + int(rhyme_group),
        )
    )

    non_rhyme_df.loc[in_group, 'word_b'] = other_rhyme_samples

non_rhyme_df['rhyme'] = 0
# A word can occur in several groups, so a draw may pair a word with itself;
# such a pair is not a valid non-rhyme example.
non_rhyme_df = non_rhyme_df.loc[non_rhyme_df['word_a'] != non_rhyme_df['word_b']]
non_rhyme_df = non_rhyme_df.drop_duplicates(subset=['word_a', 'word_b'], keep='first')
non_rhyme_df.to_csv('data/rhymes/non_rhyme_df.csv', index=False)
print('Done!')