In [10]:
# import numpy as np
import pandas as pd
# from tqdm.notebook import tqdm
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
# from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Subtract
# from tensorflow.keras.callbacks import ModelCheckpoint, TerminateOnNaN, CSVLogger
# tqdm.pandas()

# Fixed number of token ids per phrase: tokenize_inputs left-pads shorter
# sequences with zeros and keeps only the last MAX_LEN ids of longer ones.
# NOTE(review): presumably must match the input length the saved model was
# trained with — confirm against the training notebook.
MAX_LEN = 64

In [11]:
def tokenize_inputs(phrase_a, phrase_b, tokenizer):
    """Encode a pair of phrases as one fixed-width tensor of token ids.

    Both phrases are converted with ``tokenizer.texts_to_sequences`` and each
    resulting sequence is forced to exactly ``MAX_LEN`` ids: shorter sequences
    are left-padded with zeros, longer ones keep only their last ``MAX_LEN``
    ids.

    Args:
        phrase_a: First phrase (text).
        phrase_b: Second phrase (text).
        tokenizer: Object exposing ``texts_to_sequences``; the notebook passes
            a fitted Keras ``Tokenizer``.

    Returns:
        A ``tf.float64`` constant with one row per phrase (``phrase_a`` first,
        then ``phrase_b``), each row ``MAX_LEN`` ids long.
    """
    sequences = tokenizer.texts_to_sequences([phrase_a, phrase_b])

    def fit_to_width(ids):
        # A negative repeat count produces an empty list, so over-long
        # sequences receive no padding and are simply trimmed by the tail
        # slice; sequences already MAX_LEN long pass through unchanged.
        zero_prefix = [0] * (MAX_LEN - len(ids))
        return (zero_prefix + ids)[-MAX_LEN:]

    return tf.constant([fit_to_width(ids) for ids in sequences], dtype=tf.float64)

# Load the positive (rhyming) and negative (non-rhyming) word pairs, discarding
# any row that is missing either word or the label.
rhyme_df = pd.read_csv('data/rhymes/rhyme_df.csv').dropna(
    subset=['word_a', 'word_b', 'rhyme']
)
non_rhyme_df = pd.read_csv('data/rhymes/non_rhyme_df.csv').dropna(
    subset=['word_a', 'word_b', 'rhyme']
)

# Balanced, seeded sample: 400k rows from each class, rhymes first.
# NOTE(review): the tokenizer below is re-fitted on this sample, so the seed,
# sample size and order presumably have to match what was used when the saved
# model was trained — confirm against the training notebook.
df = pd.concat(
    [
        rhyme_df.sample(n=400_000, random_state=123),
        non_rhyme_df.sample(n=400_000, random_state=123),
    ]
)
# The full frames are no longer needed; release them from the kernel.
del rhyme_df, non_rhyme_df

# Character-level, lower-cased vocabulary built from both words of every pair
# (each row contributes the string word_a + word_b).
tokenizer = Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(df['word_a'] + df['word_b'])

# Previously trained rhyme classifier.
model = load_model("models/rhyme_model.hdf5")

In [12]:
# Hand-picked word pairs used as a quick sanity check of the loaded model.
samples = [
   ["soap", "hope"],
   ["cat", "hat"], 
   ["sliver", "cleaver"],
   ["inspire", "desire"],
   ["tomato", "salad"],
]

# tokenize_inputs returns one (2, MAX_LEN) tensor per pair; converting the list
# stacks them into a single (n_pairs, 2, MAX_LEN) tensor.
sample_tokens = tf.convert_to_tensor(
    [tokenize_inputs(word_a, word_b, tokenizer) for word_a, word_b in samples]
)
# The two words of each pair are fed to the model as separate inputs.
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])

# Convert each score to a built-in float BEFORE rounding: round() on a NumPy
# float32 scalar returns another float32, which prints with conversion noise
# (e.g. 0.6202999949455261 instead of 0.6203).
# NOTE(review): pred[0] assumes the model emits one score per pair — confirm.
raw_scores = [float(pred[0]) for pred in sample_pred]
predictions = [round(score, 4) for score in raw_scores]

for (word_a, word_b), raw_score, shown_score in zip(samples, raw_scores, predictions):
    # Threshold on the unrounded score so that rounding for display can never
    # flip the label of a prediction sitting just above 0.5.
    label = 'Rhyme' if raw_score > 0.5 else 'Non-rhyme'
    print(f"Lyric 1: {word_a}")
    print(f"Lyric 2: {word_b}")
    print(f"{label}({shown_score})")
    print("---------------\n")

Lyric 1: soap
Lyric 2: hope
Rhyme(0.6202999949455261)
---------------

Lyric 1: cat
Lyric 2: hat
Rhyme(0.9830999970436096)
---------------

Lyric 1: sliver
Lyric 2: cleaver
Rhyme(0.9830999970436096)
---------------

Lyric 1: inspire
Lyric 2: desire
Rhyme(0.9830999970436096)
---------------

Lyric 1: tomato
Lyric 2: salad
Non-rhyme(0.0003000000142492354)
---------------

