In [21]:
%load_ext autoreload
%autoreload 2

In [64]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import nltk # just for tokenization
from nltk.corpus import stopwords

# Milestones
1. Get the word2vec and glove vectors
2. Tokenize the lyrics corpus
3. Get the unique tokens and index them (may not be needed)
4. Get the word embedding of each token in a lyric, and linearly superpose (sum) the word vectors to create a single vector for that lyric. Do this for all lyrics.
5. Use clustering techniques 
    - Start with K-means to set a baseline, then try other clustering techniques such as DBSCAN.
6. Calculate the silhouette score and WCSS (within-cluster sum of squares) of each model for comparison

# Loading word2vec vectors

In [50]:
def load_embedding_model(model_name: str = "word2vec-google-news-300"):
    """ Load pretrained word vectors through the gensim downloader.

        The default model is word2vec trained on Google News (not GloVe);
        with it, the recorded run reports a vocabulary of 3000000 words and
        the rest of the notebook works with 300-dimensional vectors.

        Params:
            model_name: Model identifier passed to ``gensim.downloader.load``.
                Defaults to "word2vec-google-news-300", the model this
                notebook was originally run with.
        Return:
            wv_from_bin: The object returned by ``gensim.downloader.load``;
                it must expose ``index_to_key`` (the vocabulary list).
    """
    # Imported lazily so gensim is only needed when vectors are actually loaded.
    import gensim.downloader as api
    wv_from_bin = api.load(model_name)
    # index_to_key is already a sequence; no need to copy it just to count it.
    print("Loaded vocab size %i" % len(wv_from_bin.index_to_key))
    return wv_from_bin
# Load the vectors only once per kernel: reuse them if this cell already ran,
# otherwise load them. Looking the name up through globals() keeps the cell
# working on a fresh kernel, where `wv_from_bin` is not defined yet.
# To force a reload, run `del wv_from_bin` and execute this cell again.
wv_from_bin = globals().get("wv_from_bin")
if wv_from_bin is None:
    wv_from_bin = load_embedding_model()

Loaded vocab size 3000000


# Vectorize the lyrics 

In [10]:
# Read the songs CSV into a DataFrame. The path is relative, so it resolves
# against the notebook's current working directory.
raw_df = pd.read_csv("../../Spotify Song Dataset.csv/million_songs.csv")

In [18]:
# Preview the first rows and report how many rows (songs) were loaded.
print(raw_df.head())
print(f"Number of songs {len(raw_df)}")

  artist                   song                                        link  \
0   ABBA  Ahe's My Kind Of Girl  /a/abba/ahes+my+kind+of+girl_20598417.html   
1   ABBA       Andante, Andante       /a/abba/andante+andante_20002708.html   
2   ABBA         As Good As New        /a/abba/as+good+as+new_20003033.html   
3   ABBA                   Bang                  /a/abba/bang_20598415.html   
4   ABBA       Bang-A-Boomerang      /a/abba/bang+a+boomerang_20002668.html   

                                                text  
0  Look at her face, it's a wonderful face  \nAnd...  
1  Take it easy with me, please  \nTouch me gentl...  
2  I'll never know why I had to go  \nWhy I had t...  
3  Making somebody happy is a question of give an...  
4  Making somebody happy is a question of give an...  
Number of songs 57650


In [86]:
# Number of rows in the raw dataset.
len(raw_df)

57650

In [17]:
# Count distinct song titles. Titles are not unique across the dataset,
# so this is smaller than the number of rows.
raw_df["song"].nunique(dropna=False)

44824

In [73]:
# Stop-word sets keyed by language, filled on first use so the corpus file is
# read once instead of once per lyric.
_STOP_WORDS_CACHE: dict[str, frozenset[str]] = {}


def _stop_words_for(language: str) -> frozenset[str]:
    """Return NLTK's stop words for ``language`` as a set, loading them at most once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def tokenize(lyric: str) -> list[str]:
    """Split a lyric into lowercase, purely alphabetic, non-stop-word tokens.

    Params:
        lyric: Raw lyric text.
    Return:
        Tokens in their original order. The text is lowercased before
        tokenization; tokens that are not purely alphabetic (punctuation,
        numbers, contraction pieces such as "n't") and English stop words
        are dropped.
    """
    tokens = nltk.tokenize.word_tokenize(lyric.lower())
    # A set gives O(1) membership checks; the original list was scanned for every token.
    stop_words = _stop_words_for("english")
    return [token for token in tokens if token.isalpha() and token not in stop_words]

In [84]:
# Build one embedding per lyric: tokenize the lyric, look up the word2vec vector
# of every token, and sum those vectors (a linear superposition, not an average)
# into a single row of `lyrics_embeddings`.
lyrics = raw_df["text"]
# Take the dimensionality from the loaded model instead of hard-coding it
# (300 for word2vec-google-news-300).
embedding_dim = wv_from_bin.vector_size
# Preallocate the result so the name always refers to a 2-D array,
# one row per lyric, rather than a list that is stacked afterwards.
lyrics_embeddings = np.zeros((len(lyrics), embedding_dim))
unsupported_tokens = set()
for row, lyric in enumerate(tqdm(lyrics)):
    # tokenize() already lowercases the text, so tokens are used as-is.
    for token in tokenize(lyric):
        try:
            lyrics_embeddings[row] += wv_from_bin.get_vector(token)
        except KeyError:
            # The token is not in the embedding vocabulary: remember it and move on.
            unsupported_tokens.add(token)


100%|██████████| 57650/57650 [01:46<00:00, 541.51it/s]


In [106]:
print(f"Embeddings shape {lyrics_embeddings.shape}, number of unsupported tokens {len(unsupported_tokens)}")
# L2-normalize each lyric vector (row) to unit length. This fixes the length of
# each vector, not its variance.
lyric_norms = np.linalg.norm(lyrics_embeddings, axis=1, keepdims=True)
# A lyric with no supported tokens has an all-zero vector; dividing by its zero
# norm would produce a row of NaNs, so such rows are divided by 1 and stay zero.
scaled_lyrics_embeddings = lyrics_embeddings / np.where(lyric_norms == 0, 1.0, lyric_norms)

Embeddings shape (57650, 300), number of unsupported tokens 32513
