In [1]:
import pandas as pd
import numpy as np
 
# TensorFlow
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate
from tensorflow.keras.layers import Dropout, Dense, Lambda, Multiply, Subtract, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Activation, Reshape
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam


# Scikit-learn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Text preprocessing
from nltk.tokenize import word_tokenize
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Misc.
import os
import joblib
import random
import time
from tqdm import tqdm_notebook as tqdm
import pretty_midi

SEED = 42
%matplotlib inline

In [2]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/naorko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/naorko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/naorko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
cols = ['Singer', 'Song Name', 'Lyrics']
df_train = pd.read_csv('datasets/lyrics_train_set.csv', names=cols)
# df_test = pd.read_csv('datasets/lyrics_test_set.csv', names=cols)

In [4]:
df_train

Unnamed: 0,Singer,Song Name,Lyrics
0,elton john,candle in the wind,goodbye norma jean & though i never knew you a...
1,gerry rafferty,baker street,winding your way down on baker street & lite i...
2,gerry rafferty,right down the line,you know i need your love & you've got that ho...
3,2 unlimited,tribal dance,come on check it out ya'll & (come on come on!...
4,2 unlimited,let the beat control your body,let the beat control your body & let the beat ...
...,...,...,...
610,don henley,dirty laundry,i make my living off the evening news & just g...
611,don henley,new york minute,harry got up & dressed all in black & went dow...
612,bob dylan,subterranean homesick blues,johnny's in the basement & mixing up the medic...
613,goldfinger,mable,i met her sunday that was yesterday & the girl...


In [5]:
df_test

NameError: name 'df_test' is not defined

In [5]:
pm = pretty_midi.PrettyMIDI('datasets/midi_files/aladdin_-_A_whole_new_world.mid')
pm

<pretty_midi.pretty_midi.PrettyMIDI at 0x7fca1165ce80>

In [6]:
print('There are {} time signature changes'.format(len(pm.time_signature_changes)))
print('There are {} instruments'.format(len(pm.instruments)))
print('Instrument 3 has {} notes'.format(len(pm.instruments[0].notes)))
print('Instrument 4 has {} pitch bends'.format(len(pm.instruments[4].pitch_bends)))
print('Instrument 5 has {} control changes'.format(len(pm.instruments[5].control_changes)))

There are 1 time signature changes
There are 9 instruments
Instrument 3 has 227 notes
Instrument 4 has 0 pitch bends
Instrument 5 has 0 control changes


In [8]:
l = df_train.iloc[0, 2]
l

'goodbye norma jean & though i never knew you at all & you had the grace to hold yourself & while those around you crawled & they crawled out of the woodwork & and they whispered into your brain & they set you on the treadmill & and they made you change your name & and it seems to me you lived your life & like a candle in the wind & never knowing who to cling to & when the rain set in & and i would liked to have known you & but i was just a kid & your candle burned out long before & your legend ever did & loneliness was tough & the toughest role you ever played & hollywood created a superstar & and pain was the price you paid & even when you died & oh the press still hounded you & all the papers had to say & was that marilyn was found in the nude & and it seems to me you lived your life & like a candle in the wind & never knowing who to cling to & when the rain set in & and i would liked to have known you & but i was just a kid & your candle burned out long before & your legend ever di

In [4]:
from nltk.tokenize import word_tokenize

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"in\'", "ing", phrase)
    phrase = re.sub(r"y\'all", "you all", phrase)
    
    # punctions
    regex = re.compile('[^a-zA-Z& ]')
    phrase = regex.sub('', phrase)
    
    return phrase

def preprocess_lyrics(data):
    data = decontracted(data)
    tokens = word_tokenize(data)
    data_arr = []
    
    for t in tokens:
        # Use only words, character combinations and numbers 
#         if not t.isalpha(): 
#             continue
            
        # Lower case word
        t = t.lower()
        
#         # Remove stop words
#         if t in sw: 
#             continue
        
        data_arr.append(t)
    
    return data_arr

In [7]:
df_train.iloc[8,2]

"[chorus:]   & oh i'm bein' followed by a moonshadow moon shadow moonshadow---   & leapin and hoppin' on a moonshadow moonshadow moonshadow---   &    & and if i ever lose my hands lose my plough lose my land   & oh if i ever lose my hands oh if i won't have to work no more.   &    & and if i ever lose my eyes if my colours all run dry   & yes if i ever lose my eyes oh if i won't have to cry no more.   &    & [chorus]   &    & and if i ever lose my legs i won't moan and i won't beg   & yes if i ever lose my legs oh if i won't have to walk no more.   &    & and if i ever lose my mouth all my teeth north and south   & yes if i ever lose my mouth oh if i won't have to talk...   &    & did it take long to find me? i asked the faithful light.   & did it take long to find me? and are you gonna stay the night?   &    & [chorus]   & moonshadow moonshadow moonshadow moonshadow. &"

In [8]:
string = df_train.iloc[8,2]
tokenized_string = preprocess_lyrics(string)

for token in tokenized_string:
    if token == '&':
        print('\n')
    else:
        print(token, end=' ')

chorus 

oh i am being followed by a moonshadow moon shadow moonshadow 

leapin and hopping on a moonshadow moonshadow moonshadow 



and if i ever lose my hands lose my plough lose my land 

oh if i ever lose my hands oh if i will not have to work no more 



and if i ever lose my eyes if my colours all run dry 

yes if i ever lose my eyes oh if i will not have to cry no more 



chorus 



and if i ever lose my legs i will not moan and i will not beg 

yes if i ever lose my legs oh if i will not have to walk no more 



and if i ever lose my mouth all my teeth north and south 

yes if i ever lose my mouth oh if i will not have to talk 



did it take long to find me i asked the faithful light 

did it take long to find me and are you gon na stay the night 



chorus 

moonshadow moonshadow moonshadow moonshadow 



In [5]:
lyrics = df_train['Lyrics'].apply(lambda s: preprocess_lyrics(s))
lyrics

0      [goodbye, norma, jean, &, though, i, never, kn...
1      [winding, your, way, down, on, baker, street, ...
2      [you, know, i, need, your, love, &, you, have,...
3      [come, on, check, it, out, ya, will, &, come, ...
4      [let, the, beat, control, your, body, &, let, ...
                             ...                        
610    [i, make, my, living, off, the, evening, news,...
611    [harry, got, up, &, dressed, all, in, black, &...
612    [johnny, is, in, the, basement, &, mixing, up,...
613    [i, met, her, sunday, that, was, yesterday, &,...
614    [you, all, know, me, still, the, same, og, but...
Name: Lyrics, Length: 615, dtype: object

b. Create embeddings

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lyrics)

In [7]:
lyrics = tokenizer.texts_to_sequences(lyrics)

In [8]:
EMBEDDING_FILE = './GoogleNews-vectors-negative300.bin'

if not os.path.isfile(EMBEDDING_FILE):
    !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
    !gzip -f -d GoogleNews-vectors-negative300.bin.gz

In [9]:
from gensim import models

embeddings_index = models.KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
embed_size = 300
word_index = tokenizer.word_index
max_features = len(word_index) + 1

nb_words = len(word_index)
embedding_matrix = (np.random.rand(nb_words+1, embed_size) - 0.5) / 5.0

not_in_word2vec = 0
for word, i in word_index.items():
    if i >= max_features: continue
    if word in embeddings_index:
        embedding_vector = embeddings_index.get_vector(word)
        embedding_matrix[i] = embedding_vector
    else:
        not_in_word2vec += 1
        
print(f'{not_in_word2vec} out of {len(word_index)} has no embedings from word2vec')

634 out of 7285 has no embedings from word2vec


### Trying one word to whole song but one word

In [10]:
train_x, train_y = [], []
for lyric in lyrics:
    for i in range(1, len(lyric)):
        train_x.append(lyric[:i])
        train_y.append(*lyric[i:i+1])
        
train_x = pad_sequences(train_x)
train_y = to_categorical(train_y)
train_x.shape, train_y.shape

((191915, 1577), (191915, 7286))

### Trying sliding window of words

In [10]:
ast=np.lib.index_tricks.as_strided
def generate_sliding_window(arr, window_size=5, window_stride=1, last_window=False):
    last_window = 1 if last_window else 0
    arr = np.ascontiguousarray(arr)
    arr_len = arr.shape[0]
    s, = arr.strides
    windows_num = ((arr_len-window_size)//window_stride) + last_window
    
    return ast(arr, (windows_num, window_size), (s*window_stride, s))

In [11]:
train_x, train_y = [], []
window_size = 10

for lyric in lyrics:
    train_x.append(generate_sliding_window(lyric, window_size))
    train_y.append(lyric[window_size:])
        
train_x = np.concatenate(train_x)
train_y = to_categorical(np.concatenate(train_y))
train_x.shape, train_y.shape

((186380, 10), (186380, 7286))

# building the model

In [12]:
seq_len = train_x.shape[1]

def init_simple():
    inp = Input(shape=(seq_len,))
    
    embd = Embedding(max_features, 
                      embed_size, 
                      weights=[embedding_matrix],
                      input_length=seq_len,
                      name='word_embd')(inp)
    
    lstm = LSTM(128)(embd)
    

    X = Dense(100, activation="relu")(lstm)
    X = Dropout(0.5)(X)
    out = Dense(max_features, activation="softmax", name = 'out')(X)

    model = Model(inp, out)
    
#     model.get_layer('embd').trainable = False

    model.compile(loss='categorical_crossentropy', optimizer=Adam())
    
    return model

In [13]:
model = init_simple()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 10)]              0         
_________________________________________________________________
word_embd (Embedding)        (None, 10, 300)           2185800   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               219648    
_________________________________________________________________
dense (Dense)                (None, 100)               12900     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
out (Dense)                  (None, 7286)              735886    
Total params: 3,154,234
Trainable params: 3,154,234
Non-trainable params: 0
___________________________________________________

In [15]:
model.fit(train_x, train_y, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

KeyboardInterrupt: 

In [17]:
model.fit(train_x, train_y, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fb549c72a90>