In [22]:
import pandas as pd
import numpy as np
import pickle as pkl
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS


# Index assigned to the '<unk>' token in words2id; token2index_dataset falls
# back to it for any word that is missing from the vocabulary.
UNK_IDX = 0

In [8]:
def load_emb_matrix(emb_path='wiki-news-300d-1M-subword.vec', words_to_load=50000, emb_dim=300):
    """Load fastText word vectors and build the vocabulary lookup tables.

    The file is read as text: the first line is a header and is skipped, and
    every following line is a word followed by its vector components,
    separated by single spaces. Only the first ``words_to_load`` entries are
    read.

    Index layout:
      * ``UNK_IDX`` is reserved for ``'<unk>'``. Its embedding row is never
        written, so it stays all-zero.
      * The word on data line ``i`` (0-based) gets index ``i + 1``.

    ``idx2words`` is the exact inverse of ``words2id``. Previously the
    special-token entries were re-assigned on every loop iteration, which
    overwrote the first word at index 1 with ``'<unk>'`` and labelled index 0
    as ``'<pad>'`` although ``words2id['<unk>']`` is 0.

    Args:
        emb_path: Path of the ``.vec`` text file to read.
        words_to_load: Maximum number of word vectors to read.
        emb_dim: Number of components per vector.

    Returns:
        Tuple ``(words2id, idx2words, loaded_embeddings)`` where
        ``loaded_embeddings`` is a float array of shape
        ``(words_to_load + 2, emb_dim)``.

    Raises:
        FileNotFoundError: If ``emb_path`` does not exist.
        ValueError: If a line does not hold exactly ``emb_dim`` numeric values
            after the word.
    """
    # Two rows beyond words_to_load are kept so the matrix shape (and hence
    # the pickled artifact) matches what earlier runs produced; the last row
    # is unused.
    loaded_embeddings = np.zeros((words_to_load + 2, emb_dim))

    # Special tokens are registered once, before any word is read, so they are
    # present even for an empty file and are never re-assigned in the loop.
    words2id = {'<unk>': UNK_IDX}
    idx2words = {UNK_IDX: '<unk>'}

    # Explicit UTF-8 and newline handling follow the loader shown in the
    # fastText documentation; the platform default encoding is not reliable.
    with open(emb_path, encoding='utf-8', newline='\n', errors='ignore') as f:
        f.readline()  # skip the header line
        for i, line in enumerate(f):
            if i >= words_to_load:
                break
            # Split on single spaces only so that words containing other
            # whitespace characters are not broken apart.
            fields = line.rstrip().split(' ')
            word, idx = fields[0], i + 1
            loaded_embeddings[idx, :] = np.asarray(fields[1:], dtype=float)
            words2id[word] = idx
            idx2words[idx] = word

    return words2id,idx2words,loaded_embeddings

In [12]:
words2id,idx2words,loaded_embeddings = load_emb_matrix()

# Persist the vocabulary maps and the embedding matrix. Each file is opened in
# a `with` block so the handle is flushed and closed as soon as the dump ends.
with open('words2id.pkl', 'wb') as fout:
    pkl.dump(words2id, fout)
with open('idx2words.pkl', 'wb') as fout:
    pkl.dump(idx2words, fout)
with open('embedding_matrix.pkl', 'wb') as fout:
    pkl.dump(loaded_embeddings, fout)

In [16]:
# Load spaCy's 'en_core_web_sm' English pipeline. tokenize() below calls it on
# a sentence and only reads token.text from the result.
tokenizer = spacy.load('en_core_web_sm')
# ASCII punctuation characters as a single string; tokenize() uses it to drop
# punctuation tokens.
punctuations = string.punctuation

# lowercase, remove punctuation and stop words
def tokenize(sent):
    """Split ``sent`` into lowercased tokens without punctuation or stop words.

    The stop-word test is done on the lowercased text. STOP_WORDS holds
    lowercase entries, so testing the original casing let capitalized stop
    words (e.g. "The") through, and they were then emitted as "the".

    Args:
        sent: Text to tokenize with the module-level spaCy ``tokenizer``.

    Returns:
        List of lowercased token strings, in their original order.
    """
    kept = []
    for token in tokenizer(sent):
        text = token.text
        lowered = text.lower()
        # `and` (not bitwise `&`) short-circuits: the stop-word lookup is
        # skipped for tokens already rejected as punctuation.
        if text not in punctuations and lowered not in STOP_WORDS:
            kept.append(lowered)
    return kept

In [17]:
def tokenize_dataset(dataset):
    """Tokenize every sample in ``dataset``.

    Args:
        dataset: Iterable of text samples; each one is passed to ``tokenize``.

    Returns:
        List with one token list per sample, in iteration order.
    """
    return [tokenize(text) for text in dataset]

In [26]:
# map every token of every sample to its vocabulary id
def token2index_dataset(tokens_data,words2id):
    """Convert token lists into lists of vocabulary indices.

    Args:
        tokens_data: Iterable of token sequences.
        words2id: Mapping from token string to integer index.

    Returns:
        List of index lists, parallel to ``tokens_data``. Tokens that are not
        keys of ``words2id`` are replaced by ``UNK_IDX``.
    """
    def to_index(token):
        # Membership test first, so a missing token never triggers a lookup.
        if token in words2id:
            return words2id[token]
        return UNK_IDX

    return [list(map(to_index, sample)) for sample in tokens_data]

In [18]:
# Read final_matrix.csv into a DataFrame; its 'lyrics' column is the one this
# notebook references for tokenization.
df = pd.read_csv(r'final_matrix.csv')

In [25]:
# lyrics_tokens has to be computed in this cell: no other visible cell defines
# it, so with the call commented out a fresh Restart & Run All stops here with
# a NameError.
lyrics_tokens = tokenize_dataset(df['lyrics'])
# `with` closes the file as soon as the dump is complete.
with open("lyrics_tokens.p", "wb") as fout:
    pkl.dump(lyrics_tokens, fout)

In [27]:
# Replace every token with its index from words2id; tokens that are not in
# the vocabulary become UNK_IDX.
lyrics_data_indices = token2index_dataset(lyrics_tokens,words2id)

In [28]:
# Persist the index lists; `with` closes the file as soon as the dump is done.
with open("lyrics_indices.p", "wb") as fout:
    pkl.dump(lyrics_data_indices, fout)