In [7]:
import pandas as pd
import numpy as np
 
# TensorFlow
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate
from tensorflow.keras.layers import Dropout, Dense, Lambda, Multiply, Subtract, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Activation, Reshape
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam


# Scikit-learn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Text preprocessing
from nltk.tokenize import word_tokenize
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Misc.
import os
import joblib
import random
import time
from tqdm import tqdm_notebook as tqdm
import pretty_midi

SEED = 42
%matplotlib inline

In [8]:
import nltk

# Fetch the NLTK resources requested by this notebook.
for _nltk_package in ('punkt', 'wordnet'):
    nltk.download(_nltk_package)
# Kept as the last expression so the cell still displays the download status.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/liavba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/liavba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/liavba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# Column names are supplied explicitly, so the first row of the file is read as data.
cols = ['Singer', 'Song Name', 'Lyrics']
df_train = pd.read_csv('datasets/lyrics_train_set.csv', header=None, names=cols)

# The test split is not loaded yet:
# df_test = pd.read_csv('datasets/lyrics_test_set.csv', names=cols)

In [11]:
# Parse one example MIDI file so its contents can be inspected in the next cell.
pm = pretty_midi.PrettyMIDI(midi_file='datasets/midi_files/aladdin_-_A_whole_new_world.mid')
pm

<pretty_midi.pretty_midi.PrettyMIDI at 0x7fdf0fc7ce10>

In [12]:
# Quick look at the parsed MIDI file. Each label names the instrument index
# that is actually read (the note count previously claimed "Instrument 3"
# while reading pm.instruments[0]).
print('There are {} time signature changes'.format(len(pm.time_signature_changes)))
print('There are {} instruments'.format(len(pm.instruments)))
print('Instrument 0 has {} notes'.format(len(pm.instruments[0].notes)))
print('Instrument 4 has {} pitch bends'.format(len(pm.instruments[4].pitch_bends)))
print('Instrument 5 has {} control changes'.format(len(pm.instruments[5].control_changes)))

There are 1 time signature changes
There are 9 instruments
Instrument 3 has 227 notes
Instrument 4 has 0 pitch bends
Instrument 5 has 0 control changes


In [13]:
# Raw lyrics of the first training song (the 'Lyrics' column is column 2).
l = df_train['Lyrics'].iloc[0]
l

'goodbye norma jean & though i never knew you at all & you had the grace to hold yourself & while those around you crawled & they crawled out of the woodwork & and they whispered into your brain & they set you on the treadmill & and they made you change your name & and it seems to me you lived your life & like a candle in the wind & never knowing who to cling to & when the rain set in & and i would liked to have known you & but i was just a kid & your candle burned out long before & your legend ever did & loneliness was tough & the toughest role you ever played & hollywood created a superstar & and pain was the price you paid & even when you died & oh the press still hounded you & all the papers had to say & was that marilyn was found in the nude & and it seems to me you lived your life & like a candle in the wind & never knowing who to cling to & when the rain set in & and i would liked to have known you & but i was just a kid & your candle burned out long before & your legend ever di

In [57]:
from nltk.tokenize import word_tokenize

# Ordered (pattern, replacement) rules, compiled once instead of on every call.
# Suffix rules are anchored with a word boundary so that they only fire at the
# END of a word: an apostrophe that starts a word (e.g. "'til") is no longer
# rewritten into " notil".
_CONTRACTION_RULES = (
    # specific
    (re.compile(r"\bwon't\b"), "will not"),
    (re.compile(r"\bcan't\b"), "can not"),
    # general
    (re.compile(r"n't\b"), " not"),
    (re.compile(r"'re\b"), " are"),
    (re.compile(r"'s\b"), " is"),
    (re.compile(r"'d\b"), " would"),
    (re.compile(r"'ll\b"), " will"),
    (re.compile(r"'t\b"), " not"),
    (re.compile(r"'ve\b"), " have"),
    (re.compile(r"'m\b"), " am"),
    # dropped g ("bein'" -> "being"): needs a letter before "in'" and no letter after
    (re.compile(r"(?<=[a-zA-Z])in'(?![a-zA-Z])"), "ing"),
    (re.compile(r"\by'all\b"), "you all"),
)

# Everything except ASCII letters, the '&' line marker and spaces is removed.
_NON_LYRIC_CHARS = re.compile('[^a-zA-Z& ]')


def decontracted(phrase):
    """Expand English contractions in ``phrase`` and strip punctuation.

    The contraction rules are applied in order (specific forms first, then
    generic suffixes). Afterwards every character that is not an ASCII
    letter, ``&`` or a space is deleted.

    Args:
        phrase: Raw lyric text.

    Returns:
        The cleaned string; case is left untouched.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)

    # punctuation
    return _NON_LYRIC_CHARS.sub('', phrase)

def preprocess_lyrics(data):
    """Turn a raw lyric string into a list of lower-cased tokens.

    The text is first cleaned with ``decontracted`` and then split with
    NLTK's ``word_tokenize``. No token is filtered out: the earlier
    alphabetic-only and stop-word filters are disabled.

    Args:
        data: Raw lyric text.

    Returns:
        list[str]: The tokens in their original order, lower-cased.
    """
    cleaned = decontracted(data)
    return [token.lower() for token in word_tokenize(cleaned)]

In [15]:
# Raw lyrics of training song 8, used as the preprocessing example below.
df_train['Lyrics'].iloc[8]

"[chorus:]   & oh i'm bein' followed by a moonshadow moon shadow moonshadow---   & leapin and hoppin' on a moonshadow moonshadow moonshadow---   &    & and if i ever lose my hands lose my plough lose my land   & oh if i ever lose my hands oh if i won't have to work no more.   &    & and if i ever lose my eyes if my colours all run dry   & yes if i ever lose my eyes oh if i won't have to cry no more.   &    & [chorus]   &    & and if i ever lose my legs i won't moan and i won't beg   & yes if i ever lose my legs oh if i won't have to walk no more.   &    & and if i ever lose my mouth all my teeth north and south   & yes if i ever lose my mouth oh if i won't have to talk...   &    & did it take long to find me? i asked the faithful light.   & did it take long to find me? and are you gonna stay the night?   &    & [chorus]   & moonshadow moonshadow moonshadow moonshadow. &"

In [86]:
# Run the preprocessing on training song 8 to inspect the result.
string = df_train['Lyrics'].iloc[8]
tokenized_string = preprocess_lyrics(string)

def pretty_lyrics(tokenized_string):
    """Print a token list as readable lyrics.

    Every ``'&'`` token is printed as a line break (``print('\\n')``, i.e. a
    newline plus print's own trailing newline); every other token is printed
    followed by a single space.

    Args:
        tokenized_string: Iterable of string tokens.
    """
    for word in tokenized_string:
        if token_is_break := (word == '&'):
            print('\n')
            continue
        print(word, end=' ')

# Print the tokenized example song back out to eyeball the preprocessing result.
pretty_lyrics(tokenized_string)

chorus 

oh i am being followed by a moonshadow moon shadow moonshadow 

leapin and hopping on a moonshadow moonshadow moonshadow 



and if i ever lose my hands lose my plough lose my land 

oh if i ever lose my hands oh if i will not have to work no more 



and if i ever lose my eyes if my colours all run dry 

yes if i ever lose my eyes oh if i will not have to cry no more 



chorus 



and if i ever lose my legs i will not moan and i will not beg 

yes if i ever lose my legs oh if i will not have to walk no more 



and if i ever lose my mouth all my teeth north and south 

yes if i ever lose my mouth oh if i will not have to talk 



did it take long to find me i asked the faithful light 

did it take long to find me and are you gon na stay the night 



chorus 

moonshadow moonshadow moonshadow moonshadow 



In [47]:
def _with_end_token(tokens, end_token='$', line_token='&'):
    """Return ``tokens`` terminated by ``end_token``.

    A trailing ``line_token`` is replaced by the end token. If the song does
    not end with a line marker, its last word is kept and the end token is
    simply appended (the previous unconditional ``[:-1]`` dropped that word).
    """
    if tokens and tokens[-1] == line_token:
        tokens = tokens[:-1]
    return tokens + [end_token]


# One token list per song, each closed with the '$' end-of-song marker.
lyrics = df_train['Lyrics'].apply(lambda s: _with_end_token(preprocess_lyrics(s)))


b. Create embeddings

In [18]:
# Build the word -> integer-id vocabulary from the preprocessed songs.
# `lyrics` holds one token list per song here; presumably this is why the
# '&' and '$' marker tokens end up in the vocabulary instead of being removed
# by the Tokenizer's default punctuation filter — verify against the Keras docs.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lyrics)

In [19]:
# Replace every song's token list with its sequence of integer word ids.
# NOTE(review): this overwrites `lyrics`, so the cell is not idempotent —
# re-running it (or the tokenizer-fitting cell) without first re-running the
# preprocessing cell feeds ids instead of tokens to the tokenizer.
lyrics = tokenizer.texts_to_sequences(lyrics)

In [20]:
# Local path of the pretrained word2vec binary used for the embedding layer.
EMBEDDING_FILE = './GoogleNews-vectors-negative300.bin'

# Download and unpack the vectors only when the .bin file is not already present,
# so re-running the notebook does not fetch the archive again.
if not os.path.isfile(EMBEDDING_FILE):
    # -c resumes a partially downloaded archive
    !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
    # -d decompresses, -f overwrites an existing output file
    !gzip -f -d GoogleNews-vectors-negative300.bin.gz

In [21]:
from gensim import models

# Pretrained word2vec vectors (300 dimensions each).
embeddings_index = models.KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
embed_size = 300
word_index = tokenizer.word_index

# Vocabulary size, plus one extra row because the tokenizer's ids start at 1
# (row 0 is the padding id produced by pad_sequences).
nb_words = len(word_index)
max_features = nb_words + 1

# Start from small random vectors in [-0.1, 0.1) so that words missing from
# word2vec still get a usable initialisation.
embedding_matrix = (np.random.rand(max_features, embed_size) - 0.5) / 5.0

# Overwrite the random row of every word that has a pretrained vector.
# The former `if i >= max_features: continue` guard was removed: every id in
# word_index is at most len(word_index), so it could never trigger.
not_in_word2vec = 0
for word, i in word_index.items():
    if word in embeddings_index:
        embedding_vector = embeddings_index.get_vector(word)
        embedding_matrix[i] = embedding_vector
    else:
        not_in_word2vec += 1

print(f'{not_in_word2vec} out of {len(word_index)} has no embedings from word2vec')

634 out of 7286 has no embedings from word2vec


### Approach 1: predict the next word from all the words that precede it in the song

In [22]:
# Build (prefix -> next word) pairs: for every position in a song the input is
# everything before that position and the target is the word at that position.
train_x, train_y = [], []
for lyric in lyrics:
    for i in range(1, len(lyric)):
        prefix, target = lyric[:i], lyric[i]
        train_x.append(prefix)
        train_y.append(target)

# Pad the prefixes to a common length and one-hot encode the target ids.
train_x = pad_sequences(train_x)
train_y = to_categorical(train_y)
train_x.shape, train_y.shape

((191915, 1577), (191915, 7287))

### Approach 2: predict the next word from a fixed-size sliding window of words

In [23]:
# as_strided is taken from its public home, numpy.lib.stride_tricks; the old
# np.lib.index_tricks path only worked through an incidental re-export.
ast = np.lib.stride_tricks.as_strided


def generate_sliding_window(arr, window_size=5, window_stride=1, last_window=False):
    """Return overlapping windows over a 1-D sequence as a 2-D array.

    Row ``k`` holds ``arr[k*window_stride : k*window_stride + window_size]``.

    Args:
        arr: 1-D sequence (list or array) to slide over.
        window_size: Number of elements per window.
        window_stride: Step between the starts of consecutive windows.
        last_window: When False (default) one window fewer is produced, so
            with stride 1 every window is followed by at least one more
            element of ``arr``; when True that extra window is included.

    Returns:
        Array of shape ``(windows_num, window_size)`` with ``arr``'s dtype.
        Non-empty results are strided views, not copies. An empty
        ``(0, window_size)`` array is returned when ``arr`` is too short to
        hold a single window (this used to raise on a negative shape).
    """
    arr = np.ascontiguousarray(arr)
    arr_len = arr.shape[0]
    item_stride, = arr.strides
    extra_window = 1 if last_window else 0
    windows_num = ((arr_len - window_size) // window_stride) + extra_window

    if windows_num <= 0:
        return np.empty((0, window_size), dtype=arr.dtype)

    return ast(arr, (windows_num, window_size), (item_stride * window_stride, item_stride))

In [24]:
# Build (window -> next word) pairs: each input is `window_size` consecutive
# word ids and the target is the id that follows that window in the song.
window_size = 10
train_x, train_y = [], []

for lyric in lyrics:
    windows = generate_sliding_window(lyric, window_size)
    targets = lyric[window_size:]
    train_x.append(windows)
    train_y.append(targets)

# Stack the per-song pieces and one-hot encode the target ids.
train_x = np.concatenate(train_x)
train_y = np.concatenate(train_y)
train_y = to_categorical(train_y)
train_x.shape, train_y.shape

((186380, 10), (186380, 7287))

# Building the model

In [41]:
# Model input length = number of columns of whichever `train_x` was built last
# (the two dataset cells above both assign `train_x`).
seq_len = train_x.shape[1]

def init_simple():
    """Build and compile the next-word prediction network.

    Architecture: word-id input of length ``seq_len`` -> embedding layer
    initialised from ``embedding_matrix`` -> two stacked 100-unit LSTMs ->
    100-unit ReLU dense layer with 50% dropout -> softmax over the
    ``max_features`` vocabulary ids.

    Relies on the module-level ``seq_len``, ``max_features``, ``embed_size``
    and ``embedding_matrix``.

    Returns:
        A Keras ``Model`` compiled with categorical cross-entropy and Adam.
    """
    word_ids = Input(shape=(seq_len,))

    embedded = Embedding(
        max_features,
        embed_size,
        weights=[embedding_matrix],
        input_length=seq_len,
        name='word_embd',
    )(word_ids)

    # The first LSTM returns the full sequence so the second one can consume it.
    hidden = LSTM(100, return_sequences=True)(embedded)
    hidden = LSTM(100)(hidden)

    hidden = Dense(100, activation="relu")(hidden)
    hidden = Dropout(0.5)(hidden)
    next_word_probs = Dense(max_features, activation="softmax", name='out')(hidden)

    network = Model(word_ids, next_word_probs)

    # The embedding weights stay trainable. To freeze them, note that the layer
    # is registered as 'word_embd' (not 'embd').

    network.compile(loss='categorical_crossentropy', optimizer=Adam())
    return network

In [42]:
def get_callbacks(model_name):
    """Create the Keras callbacks used while fitting.

    Args:
        model_name: Base name of the checkpoint file written to ``./models``.

    Returns:
        list: ``[checkpoint, reduceLR]`` — a checkpoint that keeps only the
        weights with the lowest ``val_loss``, and a scheduler that halves the
        learning rate after 5 epochs without ``val_loss`` improvement (never
        below 1e-6). Early stopping is constructed but left out of the list.
    """
    monitored_metric, metric_mode = 'val_loss', 'min'

    checkpoint = ModelCheckpoint(
        fr'./models/{model_name}.h5',
        monitor=monitored_metric,
        save_best_only=True,
        mode=metric_mode,
    )
    # Built for convenience only: it is not part of the returned list.
    earlystop = EarlyStopping(
        monitor=monitored_metric,
        mode=metric_mode,
        verbose=0,
        patience=6,
    )
    reduceLR = ReduceLROnPlateau(
        monitor='val_loss',
        mode='min',
        patience=5,
        factor=0.5,
        min_lr=1e-6,
        verbose=0,
    )

    active_callbacks = [checkpoint, reduceLR]
    return active_callbacks

In [43]:

def train_model(model, train_x, train_y, use_saved=False, params_dict=None):
    """Fit ``model`` (or reuse a stored run) and return the best checkpoint.

    Args:
        model: Compiled Keras model to fit. Ignored when ``use_saved`` is True.
        train_x: Training inputs passed to ``model.fit``.
        train_y: Training targets passed to ``model.fit``.
        use_saved: When True, skip fitting and load the stored history file
            instead; the checkpoint is loaded from disk in both cases.
        params_dict: Hyper-parameters. Its ``key_value`` pairs are folded into
            the file names, and the keys ``batch_size``, ``epochs`` and
            ``validation_split`` are required whenever the model is fitted.

    Returns:
        tuple: ``(model, history)`` — the model loaded from
        ``./models/<model_name>.h5`` and the training-history dict.

    Raises:
        ValueError: If fitting is requested but ``params_dict`` is missing one
            of the required keys. Previously the default ``params_dict=None``
            failed later with an opaque ``TypeError``.
    """
    # Validate before touching the file system or building callbacks.
    if not use_saved:
        required_keys = ('batch_size', 'epochs', 'validation_split')
        provided = params_dict if params_dict is not None else {}
        missing = [key for key in required_keys if key not in provided]
        if missing:
            raise ValueError(
                'params_dict must define ' + ', '.join(missing) + ' when use_saved is False'
            )

    os.makedirs('./models', exist_ok=True)

    # The hyper-parameters are part of the file names so that different runs
    # do not overwrite each other's checkpoint and history.
    params = ''
    if params_dict is not None:
        params = '_'.join(f'{key}_{val}' for key, val in params_dict.items())
    model_name = 'simple_model' + f'_{params}'
    history_path = fr'./models/{model_name}_history.sav'

    if use_saved:
        # joblib files are pickle-based: only load history files written by this notebook.
        history = joblib.load(history_path)
    else:
        callbacks = get_callbacks(model_name)
        fit_result = model.fit(
            x=train_x,
            y=train_y,
            batch_size=params_dict['batch_size'],
            epochs=params_dict['epochs'],
            validation_split=params_dict['validation_split'],
            callbacks=callbacks,
            verbose=1,
        )

        history = fit_result.history
        joblib.dump(history, history_path)

    # Always hand back the checkpoint on disk (the best epoch according to the
    # checkpoint callback), not the in-memory model from the last epoch.
    model = load_model(fr'./models/{model_name}.h5')

    return model, history

In [44]:
# Build a fresh, untrained network and list its layers and parameter counts.
model = init_simple()
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 10)]              0         
_________________________________________________________________
word_embd (Embedding)        (None, 10, 300)           2186100   
_________________________________________________________________
lstm_3 (LSTM)                (None, 10, 100)           160400    
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
out (Dense)                  (None, 7287)              7359

In [46]:
# Hyper-parameters for the run; train_model also folds them into the
# checkpoint and history file names.
params_dict = {'batch_size': 32, 'epochs': 20, 'validation_split': 0.3}
# use_saved=True skips fitting and reloads the stored history and checkpoint from ./models.
# NOTE(review): in the history recorded with this notebook, val_loss rises every
# epoch while loss keeps falling; since only the lowest-val_loss checkpoint is
# saved, the reloaded model is presumably the first-epoch weights — confirm.
model, history= train_model(model, train_x, train_y, use_saved=True, params_dict=params_dict)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


(<tensorflow.python.keras.engine.functional.Functional at 0x7fd82849dc90>,
 {'loss': [4.04681396484375,
   3.8957934379577637,
   3.766421318054199,
   3.6478662490844727,
   3.5469417572021484,
   3.4546384811401367,
   3.275069236755371,
   3.1757025718688965,
   3.1060564517974854,
   3.047346591949463,
   2.9901671409606934,
   2.886751890182495,
   2.831249475479126,
   2.7937979698181152,
   2.7621686458587646,
   2.727421283721924,
   2.670431613922119,
   2.6432433128356934,
   2.6246190071105957,
   2.602755069732666],
  'val_loss': [5.68879508972168,
   5.854903221130371,
   6.051053524017334,
   6.230398654937744,
   6.38828706741333,
   6.7080078125,
   7.040964126586914,
   7.41554594039917,
   7.668073654174805,
   8.089550018310547,
   8.245061874389648,
   8.77805233001709,
   8.925065994262695,
   9.131375312805176,
   9.445945739746094,
   9.541833877563477,
   9.882682800292969,
   10.099723815917969,
   10.402018547058105,
   10.391733169555664],
  'lr': [0.001,
   

In [92]:
 

def generate_song(model, seed, window_size, stop_token, tokenizer, max_len):
    """Sample a song word by word, starting from a seed text.

    Args:
        model: Trained next-word model; ``model.predict`` must return one
            probability per vocabulary id for a ``(1, window_size)`` input.
        seed: Raw seed text; it is cleaned with ``preprocess_lyrics``.
        window_size: Number of most recent word ids fed to the model.
        stop_token: End-of-song token (as text); generation stops when it is sampled.
        tokenizer: Fitted Keras ``Tokenizer`` used for the id <-> word mapping.
        max_len: Maximum number of words generated after the seed.

    Returns:
        list[str]: The seed tokens followed by the generated words (the stop
        token itself is not included).

    Raises:
        KeyError: If ``stop_token`` is not in the tokenizer's vocabulary.
    """
    stop_token = tokenizer.word_index[stop_token]

    def get_next_word(seed):
        # Re-normalise in float64: the float32 softmax output may not sum to
        # exactly 1, which np.random.choice can reject.
        probs = np.asarray(model.predict(seed, verbose=0)[0], dtype=np.float64)
        probs /= probs.sum()
        # The candidate ids come from the model output itself rather than from
        # the global `max_features`.
        chosen_idx = np.random.choice(len(probs), p=probs)
        chosen_word = tokenizer.sequences_to_texts([[chosen_idx]])[0]

        return chosen_idx, chosen_word

    seed_tokens = preprocess_lyrics(seed)
    song = list(seed_tokens)
    # Pass the token LIST to the tokenizer. Re-joining it into a string made
    # the tokenizer apply its punctuation filter, which silently dropped the
    # '&' line-break markers from the seed.
    seed = tokenizer.texts_to_sequences([seed_tokens])
    # Left-pad with 0 (or keep only the last tokens) to fit the model's input length.
    seed = pad_sequences(seed, maxlen=window_size)

    for _ in range(max_len):
        idx, word = get_next_word(seed)
        if idx == stop_token:
            break
        song.append(word)
        # Slide the context window: drop the oldest id, append the sampled one.
        seed = np.concatenate([seed[:, 1:], [[idx]]], axis=1)

    return song
    


In [100]:

# Sample a song from the seed text using a 10-word context window, '$' as the
# stop token and at most 1000 generated words, then print it line by line.
song = generate_song(model, 'goodbye guy shani my love', 10, '$', tokenizer, 1000)
pretty_lyrics(song)

goodbye guy shani my love away 

oh i feel now about 

he the same 

did he know waiting back a working times 

everybody says you used to live 

want to show her it is not something 

then i thrill let him find rolling and knew his alone or our way 

will we love somebody looking because 

and i am sorry oh and love 

when i want to be with you 

i feel you try again like we feel 

was what we said then place two and all 

i am watching in the big pretty who wants you too 

on the toes in in mine 

in getting else of a fancy world 

i never find my genie in the man if i want you what could not make her a shubop 

will you make me so good back this love 

to time about to stop 

bring me a game down and see your scales and pullin 

you see a could call my twist eyes 

In [82]:
# Scratch check of the seed encoding used by generate_song:
# text -> tokens -> word ids -> left-padded window of `window_size` ids.
seed_tokens = preprocess_lyrics('close your eyes')
seed_ids = tokenizer.texts_to_sequences([" ".join(seed_tokens)])
seed = pad_sequences(seed_ids, maxlen=window_size)

seed

array([[  0,   0,   0,   0,   0,   0,   0, 306,  17, 105]], dtype=int32)

In [83]:
# Scratch check of the window shift used in generate_song: drop the oldest id
# and append a new element on the right.
# NOTE(review): `word` is not assigned in this cell. The only visible
# module-level assignment is the loop variable of the embedding-matrix loop, so
# this presumably relies on leftover kernel state. Because it is a string, the
# recorded result is a string array; generate_song appends the sampled id instead.
np.concatenate([seed[:,1:], [[word]]], axis=1)


array([['0', '0', '0', '0', '0', '0', '306', '17', '105', 'cabbage']],
      dtype='<U11')