In [29]:
!pip install pretty_midi

Collecting pretty_midi
  Downloading pretty_midi-0.2.9.tar.gz (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 6.0 MB/s eta 0:00:01
Collecting mido>=1.1.16
  Downloading mido-1.2.9-py2.py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 59 kB/s s eta 0:00:01
Building wheels for collected packages: pretty-midi
  Building wheel for pretty-midi (setup.py) ... [?25ldone
[?25h  Created wheel for pretty-midi: filename=pretty_midi-0.2.9-py3-none-any.whl size=5591952 sha256=6ae2e0c9ac02a53188789f433a81a99b309fba3c1ac53900ed0acbb47e3224c2
  Stored in directory: /home/naorko/.cache/pip/wheels/2a/5a/e3/30eeb9a99350f3f7e21258fcb132743eef1a4f49b3505e76b6
Successfully built pretty-midi
Installing collected packages: mido, pretty-midi
Successfully installed mido-1.2.9 pretty-midi-0.2.9


In [99]:
import pandas as pd
import numpy as np
 
# TensorFlow
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate
from tensorflow.keras.layers import Dropout, Dense, Lambda, Multiply, Subtract, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Activation, Reshape
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Scikit-learn
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# XGBoost
from xgboost import XGBRegressor

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Misc.
import os
import joblib
import random
import time
from tqdm import tqdm_notebook as tqdm

SEED = 42
%matplotlib inline

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

In [1]:
import pretty_midi
import pandas as pd
import numpy as np

In [2]:
cols = ['Singer', 'Song Name', 'Lyrics']
df_train = pd.read_csv('datasets/lyrics_train_set.csv', names=cols)
df_test = pd.read_csv('datasets/lyrics_test_set.csv', names=cols)

In [3]:
df_train

Unnamed: 0,Singer,Song Name,Lyrics
0,elton john,candle in the wind,goodbye norma jean & though i never knew you a...
1,gerry rafferty,baker street,winding your way down on baker street & lite i...
2,gerry rafferty,right down the line,you know i need your love & you've got that ho...
3,2 unlimited,tribal dance,come on check it out ya'll & (come on come on!...
4,2 unlimited,let the beat control your body,let the beat control your body & let the beat ...
...,...,...,...
610,don henley,dirty laundry,i make my living off the evening news & just g...
611,don henley,new york minute,harry got up & dressed all in black & went dow...
612,bob dylan,subterranean homesick blues,johnny's in the basement & mixing up the medic...
613,goldfinger,mable,i met her sunday that was yesterday & the girl...


In [4]:
df_test

Unnamed: 0,Singer,Song Name,Lyrics
0,the bangles,eternal flame,close your eyes give me your hand darling & d...
1,billy joel,honesty,if you search for tenderness & it isn't hard ...
2,cardigans,lovefool,dear i fear we're facing a problem & you love...
3,aqua,barbie girl,hiya barbie & hi ken! & do you want to go for...
4,blink 182,all the small things,all the small things & true care truth brings...


In [5]:
pm = pretty_midi.PrettyMIDI('datasets/midi_files/aladdin_-_A_whole_new_world.mid')
pm

<pretty_midi.pretty_midi.PrettyMIDI at 0x7fca1165ce80>

In [6]:
print('There are {} time signature changes'.format(len(pm.time_signature_changes)))
print('There are {} instruments'.format(len(pm.instruments)))
print('Instrument 3 has {} notes'.format(len(pm.instruments[0].notes)))
print('Instrument 4 has {} pitch bends'.format(len(pm.instruments[4].pitch_bends)))
print('Instrument 5 has {} control changes'.format(len(pm.instruments[5].control_changes)))

There are 1 time signature changes
There are 9 instruments
Instrument 3 has 227 notes
Instrument 4 has 0 pitch bends
Instrument 5 has 0 control changes


In [8]:
l = df_train.iloc[0, 2]
l

'goodbye norma jean & though i never knew you at all & you had the grace to hold yourself & while those around you crawled & they crawled out of the woodwork & and they whispered into your brain & they set you on the treadmill & and they made you change your name & and it seems to me you lived your life & like a candle in the wind & never knowing who to cling to & when the rain set in & and i would liked to have known you & but i was just a kid & your candle burned out long before & your legend ever did & loneliness was tough & the toughest role you ever played & hollywood created a superstar & and pain was the price you paid & even when you died & oh the press still hounded you & all the papers had to say & was that marilyn was found in the nude & and it seems to me you lived your life & like a candle in the wind & never knowing who to cling to & when the rain set in & and i would liked to have known you & but i was just a kid & your candle burned out long before & your legend ever di

In [26]:
from nltk.stem.porter import PorterStemmer
# from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')


# wml = WordNetLemmatizer()
ps = PorterStemmer()
sw = set(stopwords.words('english'))

def stemming(data):
    tokens = word_tokenize(data)
    stemmed_data = []
    
    for t in tokens:
        # Use only words, character combinations and numbers 
        if not t.isalpha() or len(t) == 1: 
            continue
            
        # Lower case word
        t = t.lower()
        
#         # Remove stop words
#         if t in sw: 
#             continue
        
#         # Stem word
#         stem_t = ps.stem(t) 
# #         stem_t = wml.lemmatize(t)
        
        stemmed_data.append(t)
        
    return ' '.join(stemmed_data)

def preprocess_lyrics(data):
#     cleared_text = camel_case_split(striphtml(data)).replace('  ', ' ')
    stemmed_text = stemming(data)
    return stemmed_text

[nltk_data] Downloading package stopwords to /home/naorko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/naorko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [57]:
df_train.iloc[8,2]

"[chorus:]   & oh i'm bein' followed by a moonshadow moon shadow moonshadow---   & leapin and hoppin' on a moonshadow moonshadow moonshadow---   &    & and if i ever lose my hands lose my plough lose my land   & oh if i ever lose my hands oh if i won't have to work no more.   &    & and if i ever lose my eyes if my colours all run dry   & yes if i ever lose my eyes oh if i won't have to cry no more.   &    & [chorus]   &    & and if i ever lose my legs i won't moan and i won't beg   & yes if i ever lose my legs oh if i won't have to walk no more.   &    & and if i ever lose my mouth all my teeth north and south   & yes if i ever lose my mouth oh if i won't have to talk...   &    & did it take long to find me? i asked the faithful light.   & did it take long to find me? and are you gonna stay the night?   &    & [chorus]   & moonshadow moonshadow moonshadow moonshadow. &"

In [28]:
preprocess_lyrics(df_train.iloc[8,2])

'chorus oh bein followed by moonshadow moon shadow moonshadow leapin and hoppin on moonshadow moonshadow moonshadow and if ever lose my hands lose my plough lose my land oh if ever lose my hands oh if wo have to work no more and if ever lose my eyes if my colours all run dry yes if ever lose my eyes oh if wo have to cry no more chorus and if ever lose my legs wo moan and wo beg yes if ever lose my legs oh if wo have to walk no more and if ever lose my mouth all my teeth north and south yes if ever lose my mouth oh if wo have to talk did it take long to find me asked the faithful light did it take long to find me and are you gon na stay the night chorus moonshadow moonshadow moonshadow moonshadow'

In [72]:
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"in\'", "ing", phrase)
    
    # punctions
    regex = re.compile('[^a-zA-Z& ]')
    #First parameter is the replacement, second parameter is your input string
    phrase = regex.sub('', phrase)
    
    return phrase

In [69]:
df_train.iloc[8,2]

"[chorus:]   & oh i'm bein' followed by a moonshadow moon shadow moonshadow---   & leapin and hoppin' on a moonshadow moonshadow moonshadow---   &    & and if i ever lose my hands lose my plough lose my land   & oh if i ever lose my hands oh if i won't have to work no more.   &    & and if i ever lose my eyes if my colours all run dry   & yes if i ever lose my eyes oh if i won't have to cry no more.   &    & [chorus]   &    & and if i ever lose my legs i won't moan and i won't beg   & yes if i ever lose my legs oh if i won't have to walk no more.   &    & and if i ever lose my mouth all my teeth north and south   & yes if i ever lose my mouth oh if i won't have to talk...   &    & did it take long to find me? i asked the faithful light.   & did it take long to find me? and are you gonna stay the night?   &    & [chorus]   & moonshadow moonshadow moonshadow moonshadow. &"

In [76]:
string = df_train.iloc[8,2]
tokenized_string = nltk.word_tokenize(decontracted(string))
for token in tokenized_string:
    if token == '&':
        print()
    else:
        print(token, end=' ')

chorus 
oh i am being followed by a moonshadow moon shadow moonshadow 
leapin and hopping on a moonshadow moonshadow moonshadow 

and if i ever lose my hands lose my plough lose my land 
oh if i ever lose my hands oh if i will not have to work no more 

and if i ever lose my eyes if my colours all run dry 
yes if i ever lose my eyes oh if i will not have to cry no more 

chorus 

and if i ever lose my legs i will not moan and i will not beg 
yes if i ever lose my legs oh if i will not have to walk no more 

and if i ever lose my mouth all my teeth north and south 
yes if i ever lose my mouth oh if i will not have to talk 

did it take long to find me i asked the faithful light 
did it take long to find me and are you gon na stay the night 

chorus 
moonshadow moonshadow moonshadow moonshadow 


In [104]:
lyrics = df_train['Lyrics'].apply(lambda s: nltk.word_tokenize(decontracted(s)))
lyrics

0      [goodbye, norma, jean, &, though, i, never, kn...
1      [winding, your, way, down, on, baker, street, ...
2      [you, know, i, need, your, love, &, you, have,...
3      [come, on, check, it, out, ya, will, &, come, ...
4      [let, the, beat, control, your, body, &, let, ...
                             ...                        
610    [i, make, my, living, off, the, evening, news,...
611    [harry, got, up, &, dressed, all, in, black, &...
612    [johnny, is, in, the, basement, &, mixing, up,...
613    [i, met, her, sunday, that, was, yesterday, &,...
614    [yall, know, me, still, the, same, og, but, i,...
Name: Lyrics, Length: 615, dtype: object

b. Create embeddings

In [88]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer().
tokenizer.fit_on_texts(lyrics)

In [105]:
lyrics = tokenizer.texts_to_sequences(lyrics)
lyrics

[[214,
  2380,
  2381,
  1,
  316,
  2,
  54,
  201,
  3,
  69,
  26,
  1,
  3,
  122,
  4,
  1275,
  5,
  169,
  309,
  1,
  286,
  266,
  103,
  3,
  2022,
  1,
  42,
  2022,
  50,
  16,
  4,
  2940,
  1,
  6,
  42,
  2382,
  155,
  17,
  969,
  1,
  42,
  404,
  3,
  21,
  4,
  3978,
  1,
  6,
  42,
  239,
  3,
  257,
  17,
  170,
  1,
  6,
  9,
  450,
  5,
  10,
  3,
  1085,
  17,
  82,
  1,
  38,
  8,
  1021,
  13,
  4,
  415,
  1,
  54,
  510,
  88,
  5,
  1787,
  5,
  1,
  35,
  4,
  290,
  404,
  13,
  1,
  6,
  2,
  60,
  1276,
  5,
  29,
  533,
  3,
  1,
  31,
  2,
  53,
  33,
  8,
  619,
  1,
  17,
  1021,
  1788,
  50,
  123,
  144,
  1,
  17,
  2023,
  106,
  114,
  1,
  695,
  53,
  1600,
  1,
  4,
  3979,
  2024,
  3,
  106,
  812,
  1,
  813,
  2941,
  8,
  1601,
  1,
  6,
  467,
  53,
  4,
  1140,
  3,
  1141,
  1,
  193,
  35,
  3,
  1086,
  1,
  28,
  4,
  2383,
  112,
  3980,
  3,
  1,
  26,
  4,
  1200,
  122,
  5,
  56,
  1,
  53,
  15,
  2384,
  53,
  261,
  13,


In [100]:
EMBEDDING_FILE = './GoogleNews-vectors-negative300.bin'

if not os.path.isfile(EMBEDDING_FILE):
    !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
    !gzip -f -d GoogleNews-vectors-negative300.bin.gz

In [106]:
from gensim import models

embeddings_index = models.KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
embed_size = 300
word_index = tokenizer.word_index
max_features = len(word_index) + 1

nb_words = len(word_index)
embedding_matrix = (np.random.rand(nb_words+1, embed_size) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= max_features: continue
    if word in embeddings_index:
        embedding_vector = embeddings_index.get_vector(word)
        embedding_matrix[i] = embedding_vector

In [107]:
embedding_matrix.shape

(7286, 300)