In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Subtract
from tensorflow.keras.callbacks import ModelCheckpoint, TerminateOnNaN, CSVLogger
# Register tqdm's pandas integration (adds `progress_apply` to pandas objects).
tqdm.pandas()

# Number of tokens every tokenized phrase is padded or truncated to; also the model's input length.
MAX_LEN = 64

# Functions

In [None]:
def tokenize_inputs(phrase_a, phrase_b, tokenizer):
    """Turn two phrases into a pair of fixed-length token sequences.

    Args:
        phrase_a: First text to tokenize.
        phrase_b: Second text to tokenize.
        tokenizer: Object exposing ``texts_to_sequences``; it is called once
            with both phrases.

    Returns:
        A ``tf.float64`` constant built from the two sequences, each exactly
        ``MAX_LEN`` tokens long. Shorter sequences are left-padded with 0;
        longer ones keep only their last ``MAX_LEN`` tokens.
    """
    sequences = tokenizer.texts_to_sequences([phrase_a, phrase_b])
    fixed_length = [
        # Left-pad with zeros when short, then keep the trailing MAX_LEN tokens.
        ([0] * max(MAX_LEN - len(seq), 0) + seq)[-MAX_LEN:]
        for seq in sequences
    ]
    return tf.constant(fixed_length, dtype=tf.float64)

In [None]:
def create_model():
    """Build and compile the shared-LSTM rhyme classifier.

    Two inputs of shape ``(MAX_LEN, 1)`` are passed through the same LSTM
    layer instance, so both words are encoded with shared weights. The
    difference of the two encodings is fed through three Dense + Dropout
    stages and a final single-unit sigmoid layer.

    Returns:
        A Keras ``Model`` taking ``[word_a_input_tokens, word_b_input_tokens]``,
        compiled with binary cross-entropy loss, the Adam optimizer and an
        accuracy metric.
    """
    input_a = Input(shape=(MAX_LEN, 1), name='word_a_input_tokens')
    input_b = Input(shape=(MAX_LEN, 1), name='word_b_input_tokens')

    # A single LSTM instance is applied to both inputs -> shared weights.
    shared_encoder = LSTM(64, return_sequences=False, activation="relu", name="common_lstm_layer")
    encoded_a = shared_encoder(input_a)
    encoded_b = shared_encoder(input_b)

    # Despite the layer name, the two encodings are subtracted, not concatenated.
    hidden = Subtract(name="concatenate_lastm_outputs")([encoded_a, encoded_b])

    # Dense layers before the final classification, each followed by dropout.
    dense_stack = (
        (64, "first_dense_layer"),
        (32, "second_dense_layer"),
        (8, "third_dense_layer"),
    )
    for units, layer_name in dense_stack:
        hidden = Dense(units, activation="relu", name=layer_name)(hidden)
        hidden = Dropout(0.5)(hidden)

    rhyme_probability = Dense(1, activation="sigmoid", name="classification_layer")(hidden)

    model = Model(inputs=[input_a, input_b], outputs=rhyme_probability)
    model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

# Import data 

Load the data from the rhyme and non-rhyme files we created previously.

In [None]:
# Rows missing either word or the label are unusable, so drop them on load.
required_columns = ['word_a', 'word_b', 'rhyme']
rhyme_df = pd.read_csv('data/rhymes/rhyme_df.csv').dropna(subset=required_columns)
non_rhyme_df = pd.read_csv('data/rhymes/non_rhyme_df.csv').dropna(subset=required_columns)

# Balanced dataset: the same number of rows is sampled from each class
# (rhymes first, then non-rhymes), with a fixed seed for reproducibility.
df = pd.concat([
    frame.sample(400_000, random_state=123)
    for frame in (rhyme_df, non_rhyme_df)
])
# The full frames are no longer needed once the sample is taken.
del rhyme_df, non_rhyme_df

df.head()

# Tokenize inputs

Create and fit a tokenizer on the dataset.

In [None]:
# Character-level, lower-cased vocabulary fitted on both words of every pair.
tokenizer = Tokenizer(lower=True, char_level=True)
paired_words = df['word_a'] + df['word_b']
tokenizer.fit_on_texts(paired_words)

# One tokenized (word_a, word_b) pair per row.
df['word_tokens'] = df.apply(
    lambda pair: tokenize_inputs(pair['word_a'], pair['word_b'], tokenizer),
    axis=1,
)

# Split data into train (60%), test (30%), and validation (10%) sets

Use stratified random sampling to create train, test and validation datasets.

In [None]:
# Stage 1: keep 60% for training and hold out the remaining 40%.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    list(df['word_tokens']),
    list(df['rhyme']),
    test_size=0.4,
    stratify=df['rhyme'],
    random_state=123,
)
# Stage 2: a quarter of the hold-out (10% overall) becomes the validation set;
# the other three quarters (30% overall) form the test set.
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.25,
    stratify=y_holdout,
    random_state=123,
)

In [None]:
# Convert the train and validation splits from Python lists to tensors.
X_train, y_train, X_val, y_val = (
    tf.convert_to_tensor(split)
    for split in (X_train, y_train, X_val, y_val)
)

# Train model

Build the model with `create_model()` and fit it on the training data, validating on the validation set after each epoch.

In [None]:
model = create_model()

# save_best_only=True makes the checkpoint honour `monitor`: the file is only
# rewritten when val_loss improves. Without it the file is overwritten after
# every epoch, so the model reloaded for testing would simply be the last
# epoch rather than the best one.
model_checkpoint = ModelCheckpoint(
    "models/rhyme_model.hdf5", monitor="val_loss", save_best_only=True
)
# Abort training as soon as the loss becomes NaN.
terminate_on_nan = TerminateOnNaN()
# Append per-epoch metrics to a CSV log file.
csv_logger = CSVLogger('training.log')

# Index 0 / 1 along the second axis select the word_a / word_b token
# sequences, which feed the model's two inputs.
history = model.fit(
    [X_train[:, 0], X_train[:, 1]],
    y_train,
    batch_size=128,
    epochs=100,
    callbacks=[model_checkpoint, terminate_on_nan, csv_logger],
    validation_data=([X_val[:, 0], X_val[:, 1]], y_val)
)


# Test model

In [None]:
# load the model
# Reload the checkpoint written during training.
model = load_model("models/rhyme_model.hdf5")

X_test, y_test = tf.convert_to_tensor(X_test), tf.convert_to_tensor(y_test)

# Threshold the sigmoid outputs at 0.5 to obtain boolean class predictions.
y_pred = model.predict([X_test[:, 0], X_test[:, 1]]) > 0.5


In [1]:
# Text report of the test-set predictions; printed so its line breaks render.
test_report = classification_report(y_test, y_pred)
print(test_report)

NameError: name 'classification_report' is not defined

# Run model on some examples 

In [None]:
# Hand-picked word pairs to try the model on.
# NOTE: every inner list must be followed by a comma; a missing one makes
# Python parse the next list literal as a subscript and raise TypeError.
samples = [
   ["soap", "hope"],
   ["cat", "hat"],
   ["sliver", "cleaver"],
   ["inspire", "desire"],
   ["tomato", "salad"],
]

# Tokenize each pair exactly as the training data was tokenized.
sample_tokens = tf.convert_to_tensor(
    [tokenize_inputs(word_a, word_b, tokenizer) for word_a, word_b in samples]
)
sample_pred = model.predict([sample_tokens[:, 0], sample_tokens[:, 1]])
# One sigmoid output per pair, rounded for display.
predictions = [round(pred[0], 4) for pred in sample_pred]
for (word_a, word_b), prediction in zip(samples, predictions):
    print(f"Word 1: {word_a}")
    print(f"Word 2: {word_b}")
    print(f"{'Rhyme' if prediction > 0.5 else 'Non-rhyme'}({prediction})")
    print("---------------\n")