### SetUp

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import string
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Get the dataset

In [3]:
data_path = '/home/login/Documents/songdata.csv'
song_dataset = pd.read_csv(data_path, dtype = str)[:10]
song_dataset.sample()

Unnamed: 0,artist,song,link,text
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...


### First 10 songs

#### Preprocessing

In [7]:
def tokenize_corpus(corpus, num_words = -1):
    if num_words > -1:
        tokenizer = Tokenizer(num_words = num_words)
    else:
        tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    return tokenizer

def create_lyrics_corpus(dataset, field):
    dataset[field] = dataset[field].str.replace('[{}]'.format(string.punctuation), '')
    dataset[field] = dataset[field].str.lower()
    # make a long string to split by line
    lyrics = dataset[field].str.cat()
#     print(lyrics)
    corpus = lyrics.split("\n")
#     print(corpus)
    # remove any trailing white spaces
    for _ in range(len(corpus)):
        corpus[_] = corpus[_].rstrip()
    # remove empty lines
    corpus = [_ for _ in corpus if _ != ""]
    return corpus
print(create_lyrics_corpus(song_dataset[:1], 'text'))

["look at her face, it's a wonderful face", 'and it means something special to me', 'look at the way that she smiles when she sees me', 'how lucky can one fellow be?', "she's just my kind of girl, she makes me feel fine", 'who could ever believe that she could be mine?', "she's just my kind of girl, without her i'm blue", 'and if she ever leaves me what could i do, what could i do?', 'and when we go for a walk in the park', 'and she holds me and squeezes my hand', "we'll go on walking for hours and talking", 'about all the things that we plan', "she's just my kind of girl, she makes me feel fine", 'who could ever believe that she could be mine?', "she's just my kind of girl, without her i'm blue", 'and if she ever leaves me what could i do, what could i do?']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[field] = dataset[field].str.replace('[{}]'.format(string.punctuation), '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[field] = dataset[field].str.lower()


In [8]:
corpus = create_lyrics_corpus(song_dataset, 'text')
tokenizer = tokenize_corpus(corpus)

total_words = len(tokenizer.word_index) + 1
print(tokenizer.word_index)
print(total_words)

495


### Create sequences and labels

In [14]:
sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        sequences.append(n_gram_sequence)
        
max_sequence_len = max([len(seq) for seq in sequences])
sequences = np.array(pad_sequences(sequences, maxlen = max_sequence_len, padding = 'pre'))

input_sequences, labels = sequences[:,:-1], sequences[:,-1]
one_hot_labels = tf.keras.utils.to_categorical(labels, num_classes = total_words)
    
print(tokenizer.word_index['know'])   
print(tokenizer.word_index['feeling'])

print(input_sequences[5])
print(input_sequences[6])

print(one_hot_labels[5])
print(one_hot_labels[6])

35
101
[  0   0   0   0   0   0   0   0   0   0   0   0   0  85  86 146 197  33
   2]
[  0   0   0   0   0   0   0   0   0   0   0   0  85  86 146 197  33   2
 285]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0