<a href="https://colab.research.google.com/github/GuidoGiacomoMussini/Text_Mining-Lyrics_Analysis/blob/main/4_Rhymes_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

In [None]:
from google.colab import drive
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import re
import string
import itertools
import pickle
import random
from tqdm import tqdm as progress_bar
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, concatenate
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
# Mount Google Drive at /content/drive; the data and model paths used by this
# notebook all live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the two pickled inputs produced upstream: the rhyming words and the
# matching rhyme-group indices (rhymes[i] and idx[i] are used in parallel below).
# NOTE: pickle.load can execute arbitrary code, so only open files you created yourself.
files_path = '/content/drive/MyDrive/Colab Notebooks/Text Mining/Files/'

with open(files_path + 'rhymes_list.pickle', 'rb') as rhymes_file:
    rhymes = pickle.load(rhymes_file)

with open(files_path + 'rhymes_idx.pickle', 'rb') as idx_file:
    idx = pickle.load(idx_file)

Create the dataset

In [None]:
# Aggregate the words that rhyme: one row per rhyme group, with the words of the
# group joined by spaces. The per-item frames are collected in a list and
# concatenated once, because calling pd.concat inside the loop re-copies the
# accumulated frame at every iteration (quadratic cost).
grouped_frames = []
for i in range(len(rhymes)):
    df = pd.DataFrame({"x": rhymes[i], "idx": idx[i]})
    grouped_frames.append(df.groupby('idx')['x'].apply(' '.join).reset_index())

if grouped_frames:
    df_grp = pd.concat(grouped_frames, ignore_index = True)
else:
    # No input at all: keep the expected columns so the code below still runs.
    df_grp = pd.DataFrame(columns = ['idx', 'x'])


# Structure the dataset as word1 - word2 - label, using every combination of
# words that rhyme inside a group. Each pair is also stored reversed
# (w1 - w2 -> w2 - w1) to double the observations.
coppie, coppie_rev = [], []
for parole in df_grp['x']:
    parole_lista = parole.split()
    for w1, w2 in itertools.combinations(parole_lista, 2):
        coppie.append((w1, w2))
        coppie_rev.append((w2, w1))

# Concatenate the pairs with the reversed pairs and drop exact duplicates.
df_train = pd.DataFrame(coppie, columns=['x1', 'x2'])
df_rev = pd.DataFrame(coppie_rev, columns=['x1', 'x2'])
df_train = pd.concat([df_train, df_rev], ignore_index = True).drop_duplicates(subset = ['x1', 'x2'])
df_train['label'] = 0 # 0 indicates that the two words rhyme

# Create examples of words that don't rhyme: twice as many random pairs as
# rhyming pairs.
# - The vocabulary is sorted so that, together with the seeded generator, the
#   sampled pairs are the same on every run (set iteration order is not stable).
# - A pair is rejected when both words are identical or when it is already
#   present among the rhyming pairs; otherwise the same input would appear
#   with both labels.
NEGATIVE_SAMPLING_SEED = 42
rng = random.Random(NEGATIVE_SAMPLING_SEED)  # local generator: global `random` state is untouched

all_words = sorted(set(df_train.x1))
rhyming_pairs = set(zip(df_train.x1, df_train.x2))
n_negatives = len(df_train) * 2
# Upper bound on draws so the loop always terminates, even when (almost) every
# pair of words rhymes and valid negatives are rare or impossible.
max_attempts = n_negatives * 20

coppie = []
attempts = 0
while len(coppie) < n_negatives and attempts < max_attempts:
    attempts += 1
    w1, w2 = rng.choice(all_words), rng.choice(all_words)
    if w1 != w2 and (w1, w2) not in rhyming_pairs:
        coppie.append((w1, w2))

df_not_rhymes = pd.DataFrame(coppie, columns=['x1', 'x2'])
df_not_rhymes['label'] = 1 # 1 indicates that the two words do not rhyme


# Concatenate rhyming and non-rhyming examples into the final dataset.
df_train = pd.concat([df_train, df_not_rhymes], ignore_index = True)
df_train.sample(min(5, len(df_train)))

In [None]:
def word_to_features(word, num_features):
  '''
  Encode the last `num_features` characters of `word` as integer code points.

  Parameters:
    word: the string to encode.
    num_features: length of the returned list (must be >= 0).

  Returns:
    A list of exactly `num_features` integers: `ord()` of each of the last
    `num_features` characters, left-padded with 0 when the word is shorter.

  Raises:
    ValueError: if `num_features` is negative.
  '''
  if num_features < 0:
    raise ValueError("num_features must be non-negative")
  if num_features == 0:
    # word[-0:] would be the WHOLE word, so the zero case is handled explicitly.
    return []

  padding_value = 0
  # For words shorter than num_features this slice is simply the whole word.
  tail = word[-num_features:]
  padding = [padding_value] * (num_features - len(tail))

  return padding + [ord(char) for char in tail]

In [None]:
# Encode the last `num_features` letters of each word as model input.
num_features = 3
# Fixed seed so the train/val/test partition (and therefore the reported
# metrics) is the same on every Restart & Run All.
SPLIT_SEED = 42

X1_features = np.array([word_to_features(word, num_features) for word in df_train.x1])
X2_features = np.array([word_to_features(word, num_features) for word in df_train.x2])
y = df_train.label.values

# Train / validation / test split: 80% train, then the remaining 20% is cut in
# half, giving 10% validation and 10% test.
X1_train, X1_temp, X2_train, X2_temp, y_train, y_temp = train_test_split(
    X1_features, X2_features, y, test_size=0.2, random_state=SPLIT_SEED
)
X1_val, X1_test, X2_val, X2_test, y_val, y_test = train_test_split(
    X1_temp, X2_temp, y_temp, test_size=0.5, random_state=SPLIT_SEED
)

Model

In [None]:
# Two inputs, one per word, each carrying `num_features` character codes.
input_x1 = Input(shape=(num_features,))
input_x2 = Input(shape=(num_features,))

# Join both encodings, pass them through one hidden ReLU layer, and end with a
# single sigmoid unit for the binary label.
merged_inputs = concatenate([input_x1, input_x2])
hidden = Dense(256, activation='relu')(merged_inputs)
output = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=[input_x1, input_x2], outputs=output)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop once the validation loss has not improved for 30 epochs and roll back
# to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=30,
    restore_best_weights=True,
)

In [None]:
# Train for at most 150 epochs; early stopping watches the validation split.
model.fit(
    [X1_train, X2_train],
    y_train,
    epochs=150,
    batch_size=64,
    validation_data=([X1_val, X2_val], y_val),
    callbacks=[early_stopping],
)

In [None]:
# Evaluate the model on the test set
print("TESTING")
test_metrics = model.evaluate([X1_test, X2_test], y_test)
# evaluate returns the loss followed by the compiled metrics (here: accuracy).
test_loss, test_accuracy = test_metrics
print(f"Test Accuracy: {test_accuracy}")

TESTING
Test Accuracy: 0.9050472974777222


In [None]:
# Persist the trained rhyme detector to Drive (save_format='tf').
model_dir = '/content/drive/MyDrive/Colab Notebooks/Text Mining/Models/Rhyme_detector'
model.save(model_dir, save_format='tf')