In [1]:
%load_ext autoreload
%autoreload 2

In [434]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import nltk # just for tokenization
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import random
import gradio 
import string

random.seed(42)

In [3]:
def load_embedding_model():
    """ Load GloVe Vectors
        Return:
            wv_from_bin: All 400000 embeddings, each lengh 200
    """
    import gensim.downloader as api
    wv_from_bin = api.load("word2vec-google-news-300")
    print("Loaded vocab size %i" % len(list(wv_from_bin.index_to_key)))
    return wv_from_bin
wv_from_bin = load_embedding_model()
# wv_from_bin = load_embedding_model()

Loaded vocab size 3000000


In [1054]:
raw_df = pd.read_csv("MultiLabel.csv")
print(raw_df.head())

           artist    genre                  title                    album  \
0         Nirvana     Rock  You Know You’re Right                  Nirvana   
1   Damian Marley   Reggae             Here We Go               Stony Hill   
2  The Mission UK     Rock                   Jade  Another Fall from Grace   
3            UB40   Reggae       Food For Thought              Signing Off   
4     Johnny Cash  Country   I’ve Been Everywhere   American II: Unchained   

     year                                             lyrics  \
0  2002.0  I will never bother you\nI will never promise ...   
1  2017.0  Here we go\nMy big ego is gonna get me in trou...   
2  2016.0  She came as Lolita dressed as Venus\nAnd adorn...   
3  1980.0  Ivory Madonna, dying in the dust\nWaiting for ...   
4  1996.0  I was totin' my pack along the dusty Winnemucc...   

                                       labels  
0                           Calmness, Sadness  
1                              Power, Tension  
2 

In [256]:
len(raw_df)

1160

In [257]:
unique_emotion_labels = set()

In [None]:
[[unique_emotion_labels.add(label.strip()) for label in labels.split(",") ]for labels in raw_df["labels"]]
# raw_df["labels"].unique()

In [1036]:
unique_emotion_labels

{'Amazement',
 'Calmness',
 'Joyful activation',
 'Nostalgia',
 'Power',
 'Sadness',
 'Solemnity',
 'Tenderness',
 'Tension'}

In [801]:
def tokenize(lyric: str) -> list[str]:
    # lowercase the text, remove stop words, punctuation and keep only the words
    tokens = nltk.tokenize.word_tokenize(lyric.lower())
    stop_words = stopwords.words("english") + list(string.punctuation)
    lemmatizer = WordNetLemmatizer()
    alpha_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]

    return alpha_tokens

In [803]:
# go through each lyrics, tokenize it, vectorize each word, then combine all of them into single average vector and store it in the list
lyrics = raw_df["lyrics"]
lyrics_embeddings = []
unsupported_tokens = set()
label_embedding_map = {} # dict{str: np.array([])}
for lyric in tqdm(lyrics):
    lyric_vector = np.zeros(300)
    for token in tokenize(lyric):
        try:
            lyric_vector += wv_from_bin.get_vector(token.lower())
        except KeyError as e:
            # if the word is not present in the glove then key error is raised, so handle the exception and move on
            unsupported_tokens.add(token)
            continue
    lyrics_embeddings.append(lyric_vector)


lyrics_embeddings = np.stack(lyrics_embeddings)
scaled_lyrics_embeddings = lyrics_embeddings / np.linalg.norm(lyrics_embeddings, axis=1, keepdims=True)

100%|██████████| 1160/1160 [00:06<00:00, 179.82it/s]


# Prediction

In [804]:
def vectorise(lyrics: str) -> np.array:
    tokens = tokenize(lyrics)
    lyric_vector = np.zeros(300)
    for token in tokens:
        try:
            lyric_vector += wv_from_bin.get_vector(token.lower())
        except:
            continue
    return lyric_vector / np.linalg.norm(lyric_vector)
    

In [1037]:
test_lyric = """
It might seem crazy what I am 'bout to say
Sunshine, she's here, you can take a break
I'm a hot air balloon that could go to space
With the air, like I don't care, baby by the way
Huh (Because I'm happy)
Clap along if you feel like a room without a roof
(Because I'm happy)
Clap along if you feel like happiness is the truth
(Because I'm happy)
Clap along if you know what happiness is to you
(Because I'm happy)
Clap along if you feel like that's what you wanna do
Here come bad news talking this and that (Yeah)
Well give me all you got, don't hold back (Yeah)
Well I should probably warn you I'll be just fine (Yeah)
No offense to you don't waste your time
Here's why
Clap along if you feel like a room without a roof
(Because I'm happy)
Clap along if you feel like happiness is the truth
(Because I'm happy)
Clap along if you know what happiness is to you
(Because I'm happy)
Clap along if you feel like that's what you wanna do
Uh, bring me down
Can't nothing, bring me down
My level's too high to bring me down
Can't nothing, bring me down, I said
Bring me down, can't nothing
Bring me down
My level's too high to bring me down
Can't nothing, bring me down, I said
Clap along if you feel like a room without a roof
(Because I'm happy)
Clap along if you feel like happiness is the truth
(Because I'm happy)
Clap along if you know what happiness is to you
(Because I'm happy)
Clap along if you feel like that's what you wanna do
Clap along if you feel like a room without a roof
(Because I'm happy)
Clap along if you feel like happiness is the truth
(Because I'm happy)
Clap along if you know what happiness is to you
(Because I'm happy)
Clap along if you feel like that's what you wanna do
Uh, bring me down (Happy, happy, happy, happy)
Can't nothing (Happy, happy, happy, happy)
Bring me down, my level's too high
To bring me down (Happy, happy, happy, happy)
Can't nothing (Happy, happy, happy, happy)
Bring me down, I said
Clap along if you feel like a room without a roof
(Because I'm happy)
Clap along if you feel like happiness is the truth
(Because I'm happy)
Clap along if you know what happiness is to you (ayy, ayy, ayy)
(Because I'm happy)
Clap along if you feel like that's what you wanna do
Clap along if you feel like a room without a roof
(Because I'm happy)
Clap along if you feel like happiness is the truth
(Because I'm happy)
Clap along if you know what happiness is to you (hey)
(Because I'm happy)
Clap along if you feel like that's what you wanna do
Come on
"""

In [1038]:
test1_vector = vectorise(test_lyric)

In [1039]:
scaled_lyrics_embeddings.shape

(1160, 300)

In [1040]:
test1_vector.shape

(300,)

In [1041]:
def distance(metric: str, embedding_matrix: np.ndarray, test_vector: np.ndarray) -> np.ndarray:
    if metric == "cosine":
            
        dot_product = np.dot(embedding_matrix, test_vector)

        # Compute magnitudes
        embedding_magnitudes = np.linalg.norm(embedding_matrix, axis=1)
        test_vector_magnitude = np.linalg.norm(test_vector)

        # Compute cosine similarity
        cosine_similarity = dot_product / (embedding_magnitudes * test_vector_magnitude)
        return cosine_similarity
    elif metric == "euclidean":
        distances = np.linalg.norm(scaled_lyrics_embeddings - test_vector, axis=1)
        return distances
    else:
        raise Exception(f"Invalid parameter value {metric}")

In [1042]:
unique_emotion_labels

{'Amazement',
 'Calmness',
 'Joyful activation',
 'Nostalgia',
 'Power',
 'Sadness',
 'Solemnity',
 'Tenderness',
 'Tension'}

In [1043]:
test_vector = np.array([0.5, 0.6, 0.7])
test_vector.shape

(3,)

In [1044]:
similarity = distance(metric="euclidean", embedding_matrix=scaled_lyrics_embeddings, test_vector= test1_vector.reshape(1,-1))

In [1045]:
cs = distance(metric="cosine", embedding_matrix=scaled_lyrics_embeddings, test_vector= test1_vector)

In [1046]:
real = cosine_similarity(scaled_lyrics_embeddings, test1_vector.reshape(1,-1))[:,0]
raw_df.loc[(-real).argsort()[:3]]["labels"]

722                         Tenderness, Tension
242                 Amazement, Nostalgia, Power
302    Amazement, Joyful activation, Tenderness
Name: labels, dtype: object

In [1047]:
real.max(), real.argmax()

(0.8573437826814737, 722)

In [1048]:
cs.max(), cs.argmax()

(0.8573437826814735, 722)

In [1049]:
raw_df.loc[(-cs).argsort()[:3]]["labels"]

722                         Tenderness, Tension
242                 Amazement, Nostalgia, Power
302    Amazement, Joyful activation, Tenderness
Name: labels, dtype: object

In [1050]:
similarity.max(), similarity.argmax()

(1.1363610055106679, 159)

In [1051]:
(-similarity).argsort()

array([159, 970, 112, ..., 302, 242, 722], dtype=int64)

In [1052]:
raw_df.loc[(-similarity).argsort()[:10]]["labels"]

159                                      Sadness, Tension
970                Nostalgia, Sadness, Solemnity, Tension
112     Calmness, Joyful activation, Power, Sadness, T...
407                                   Amazement, Calmness
628       Amazement, Nostalgia, Power, Solemnity, Tension
115     Nostalgia, Sadness, Solemnity, Tenderness, Ten...
509          Joyful activation, Power, Solemnity, Tension
745                               Power, Sadness, Tension
52      Amazement, Joyful activation, Nostalgia, Sadne...
1098                              Power, Sadness, Tension
Name: labels, dtype: object

# Combining embeddings for each emotion

In [881]:
# go through each lyrics, tokenize it, vectorize each word, then combine all of them into single average vector and store it in the list
lyrics = raw_df["lyrics"]
lyrics_embeddings = []
unsupported_tokens = set()
label_embedding_map = {} # dict{str: np.array([])}
for i, lyric in tqdm(enumerate(lyrics)):
    labels = raw_df.loc[i]["labels"]
    lyric_vector = np.zeros(300)
    for token in tokenize(lyric):
        try:
            lyric_vector += wv_from_bin.get_vector(token.lower())
        except KeyError as e:
            # if the word is not present in the glove then key error is raised, so handle the exception and move on
            unsupported_tokens.add(token)
            continue

    for label in [label.strip() for label in labels.split(",")]:
        if label in label_embedding_map:
            label_embedding_map[label] += lyric_vector
        else:
            label_embedding_map[label] = lyric_vector

1160it [00:03, 313.33it/s]


In [882]:
emotions_embeddings = []
for k,v in label_embedding_map.items():
    v = v / np.linalg.norm(v, axis=0)
    label_embedding_map[k] = v
    emotions_embeddings.append(v)

emotions_embeddings = np.stack(emotions_embeddings)

In [883]:
emotions_embeddings.shape

(9, 300)

In [1033]:
test_lyric = """
It might seem crazy what I am 'bout to say
Sunshine, she's here, you can take a break
I'm a hot air balloon that could go to space
With the air, like I don't care, baby by the way
Huh (Because I'm happy)
Clap along if you feel like a room without a roof
(Because I'm happy)
Clap along if you feel like happiness is the truth
(Because I'm happy)
Clap along if you know what happiness is to you
(Because I'm happy)
Clap along if you feel like that's what you wanna do
Here come bad news talking this and that (Yeah)
Well give me all you got, don't hold back (Yeah)
Well I should probably warn you I'll be just fine (Yeah)
No offense to you don't waste your time
Here's why
Clap along if you feel like a room without a roof
(Because I'm happy)
Clap along if you feel like happiness is the truth
(Because I'm happy)
Clap along if you know what happiness is to you
(Because I'm happy)
Clap along if you feel like that's what you wanna do
Uh, bring me down
Can't nothing, bring me down
My level's too high to bring me down
Can't nothing, bring me down, I said
Bring me down, can't nothing
Bring me down
My level's too high to bring me down
Can't nothing, bring me down, I said
Clap along if you feel like a room without a roof
(Because I'm happy)
Clap along if you feel like happiness is the truth
(Because I'm happy)
Clap along if you know what happiness is to you
(Because I'm happy)
Clap along if you feel like that's what you wanna do
Clap along if you feel like a room without a roof
(Because I'm happy)
Clap along if you feel like happiness is the truth
(Because I'm happy)
Clap along if you know what happiness is to you
(Because I'm happy)
Clap along if you feel like that's what you wanna do
Uh, bring me down (Happy, happy, happy, happy)
Can't nothing (Happy, happy, happy, happy)
Bring me down, my level's too high
To bring me down (Happy, happy, happy, happy)
Can't nothing (Happy, happy, happy, happy)
Bring me down, I said
Clap along if you feel like a room without a roof
(Because I'm happy)
Clap along if you feel like happiness is the truth
(Because I'm happy)
Clap along if you know what happiness is to you (ayy, ayy, ayy)
(Because I'm happy)
Clap along if you feel like that's what you wanna do
Clap along if you feel like a room without a roof
(Because I'm happy)
Clap along if you feel like happiness is the truth
(Because I'm happy)
Clap along if you know what happiness is to you (hey)
(Because I'm happy)
Clap along if you feel like that's what you wanna do
Come on
"""

In [1055]:
test1_vector = vectorise(raw_df.loc[0]["lyrics"])
cs = distance(metric="cosine", embedding_matrix=emotions_embeddings, test_vector=test1_vector)

In [1035]:
np.array(list(label_embedding_map.keys()))[(-cs).argsort()[:3]]

array(['Amazement', 'Solemnity', 'Tenderness'], dtype='<U17')

In [1056]:
np.save("./embeddings/emotions_embeddings_v1.npy", emotions_embeddings)

In [1057]:
label_embedding_map.keys()

dict_keys(['Calmness', 'Sadness', 'Power', 'Tension', 'Amazement', 'Solemnity', 'Tenderness', 'Joyful activation', 'Nostalgia'])