# Everything below has been transfered to a python script

In [1]:
import langdetect
import pandas as pd
import numpy as np
import re

In [2]:
data = pd.read_csv('data/lyrics.csv')
data.drop(['index'], axis=1, inplace=True)
data.head()

Unnamed: 0,song,year,artist,genre,lyrics
0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [3]:
def find_null_cols (df):
    """
    :param df: dataframe of songs
    :type  df: pandas.core.frame.DataFrame
    """

    for col in df.columns:
        print ('Column', col, 'has null:', df[col].isnull().values.any())

find_null_cols (data)

Column song has null: True
Column year has null: False
Column artist has null: False
Column genre has null: False
Column lyrics has null: True


In [4]:
data.dropna(inplace=True)

In [5]:
data.head()

Unnamed: 0,song,year,artist,genre,lyrics
0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [6]:
find_null_cols (data)

Column song has null: False
Column year has null: False
Column artist has null: False
Column genre has null: False
Column lyrics has null: False


In [7]:
data['artist'].value_counts()

dolly-parton                               744
elton-john                                 676
chris-brown                                628
barbra-streisand                           607
bob-dylan                                  596
bee-gees                                   591
eddy-arnold                                591
eminem                                     578
ella-fitzgerald                            571
david-bowie                                570
american-idol                              568
dean-martin                                560
b-b-king                                   543
celine-dion                                540
elvis-costello                             522
bruce-springsteen                          509
beach-boys                                 495
bill-anderson                              466
eric-clapton                               461
frank-zappa                                457
chumbawamba                                423
chicago      

In [8]:
def clean_str (s):
    s = s.lower()
    s = re.sub('[\s]', ' ', s)
    return s

In [9]:
def create_str(df):
    """
    Create one big string with all the lyrics together.
    
    :param df: Dataframe of lyrics
    :type  df: pandas.core.frame.DataFrame
    :returns: string
    :rtype:   str
    """
    
    lyrics = ''
    for song in df['cleaned lyrics']:
        lyrics += song
    return lyrics

In [44]:
def get_lyrics_of_category(category, description, data):
    """
    Get all the lyrics from a certain category meeting a description.

    Example: 
        category    Description
        year        2010
        artist      Drake
        genre       Pop
    
    :param category: A column name from data
    :type  category: str
    :param description: A subset of category
    :type  description: str
    :param data: Dataframe of songs
    :type  data: pandas.core.frame.DataFrame
    :returns: A string with all the lyrics together.
    :rtype:   str
    """
    
    lyric_data = data[data[category] == description]
    lyric_data['cleaned lyrics'] = lyric_data['lyrics'].apply(
        lambda x: clean_str(x)
    )
    lyrics = create_str(lyric_data)
    return (lyrics)

In [45]:
bob_dylan_lyrics = get_lyrics_of_category('artist', 'bob-dylan', data)
bob_dylan_lyrics[:500]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [43]:
len (bob_dylan_lyrics)

762621

# Training and Text Generation

In [16]:
def vectorizing_seq (text, maxlen, step):    
    """
    
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :returns: (Numpy boolean array of shape 
                    (Number of sequences, maxlen, number of distinct character),
               Numpy boolean array of shape 
                    (Number of sequences, number of distinct character),
               dictionary mapping a character to its integer placeholder)
    :rtype:   (numpy.ndarray, 
               numpy.ndarray, 
               dict)     
    """
    
    sentences = [] # hold extracted sequences
    next_chars = [] # hold next characters for each corresponding sentence

    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_chars.append(text[i + maxlen])

    print('Number of sequences:', len(sentences))

    chars = sorted(list(set(text)))
    print('Unique characters:', len(chars))
    char_indices = dict((char, chars.index(char)) for char in chars)
    print('Vectorization...')

    # one hot encoding the characters into binary arrays
    x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
    y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1
        
    return x, y, char_indices

In [17]:
# maxlen = 60
# x, y, char_indices = vectorizing_seq(bob_dylan_lyrics, maxlen=60, step=200)
# chars = list (char_indices.keys())

In [46]:
from keras import layers
from keras.models import Sequential
from keras import optimizers


def create_model(x, y, maxlen, epochs, chars):
    """
    Creates and trains a model.
    :param x: Numpy boolean array of shape 
                    (Number of sequences, maxlen, number of distinct character)
    :type  x: numpy.ndarray
    :param y: Numpy boolean array of shape 
                    (Number of sequences, number of distinct character)
    :type  y: numpy.ndarray
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param epochs: number of training iterations
    :type  epochs: int
    :param chars: list of unique characters
    :type  chars: list
    :returns: trained keras model
    :rtype:   keras.engine.sequential.Sequential
    """

    model = Sequential()
    model.add(layers.GRU(
        32,
        return_sequences=True,
        input_shape=(maxlen, len(chars)))
    )
    model.add(layers.GRU(
        64,
        input_shape=(maxlen, len(chars)))
    )
    model.add(layers.Dense(len(chars), activation='softmax'))

    print(model.summary())

    optimizer = optimizers.RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    model.fit(x, y, batch_size=128, epochs=epochs)

    return (model)

In [47]:
def train_model_from_lyrics(lyrics, maxlen=60, step=20, epochs=10):
    """
    Given lyrics, train the model.
    
    :param lyrics: A string with all the lyrics together.
    :type  lyrics: str
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :param epochs: number of training iterations
    :type  epochs: int
    :returns: (trained keras model,
               dictionary mapping characters to digit representations)
    :rtype:   (keras.engine.sequential.Sequential,
               dict)
    """
    
    x, y, char_indices = vectorizing_seq(lyrics, maxlen, step)
    chars = list (char_indices.keys())
    model = create_model(x, y, maxlen, epochs, chars)
    
    return model, char_indices

maxlen = 60
model, char_indices = train_model_from_lyrics(bob_dylan_lyrics)

Number of sequences: 38129
Unique characters: 55
Vectorization...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_3 (GRU)                  (None, 60, 32)            8448      
_________________________________________________________________
gru_4 (GRU)                  (None, 64)                18624     
_________________________________________________________________
dense_5 (Dense)              (None, 55)                3575      
Total params: 30,647
Trainable params: 30,647
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [48]:
def sample(preds, temperature=1.0):
    """
    Compute new probability distribution based on the temperature
    Higher temperature creates more randomness.
    
    :param preds: numpy array of shape (unique chars,), and elements sum to 1
    :type  preds: numpy.ndarray
    :param temperature: characterizes the entropy of probability distribution
    :type  temperature: float
    :returns: a number 0 to the length of preds - 1
    :rtype:   int
    """
    
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [51]:
def text_generate(model, text, char_indices, maxlen=60, temperature=1.0):
    """
    Generate text based on a model.
    
    :param model: trained keras model
    :type  model: keras.engine.sequential.Sequential
    :param text: lyrics
    :type  text: str
    :param char_indices: dictionary mapping a character to its integer placeholder
    :type  char_indices: dict
    :param maxlen: maximum length of the sequences
    :type  maxlen: int
    """
    
    import random
    import sys

    start_index = random.randint(0, len(text) - maxlen - 1) 
    generated_text = text[start_index: start_index + maxlen] 
    print('--- Generating with seed: "' + generated_text + '"')
    
    chars = list (char_indices.keys())
    
    print('------ temperature:', temperature)
    sys.stdout.write(generated_text)
    for i in range(400):
        sampled = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indices[char]] = 1
        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = chars[next_index]
        generated_text += next_char
        generated_text = generated_text[1:]
        sys.stdout.write(next_char)


In [53]:
text_generate(model, bob_dylan_lyrics, char_indices, maxlen=maxlen)

--- Generating with seed: " might say i'm unlearned but there's one thing i know though"
------ temperature: 1.0
 might say i'm unlearned but there's one thing i know thoughtin' it'me downs we can that senking wills we i any now beent ov bleathen i arouls you ever dody i well be's can leave or plawy out of lut well all you are the black out steat the firful need stecgormebs they well veet a facpllinu comper wents to ling fam uf, ley brond alled cset i was see him stied are a prind of is gase his flang tell you he's he's do the reay is to sel of firling the may that h