# **Spotify Song Recommendation System with Static Embedding**

In [1]:
!pip install -q nltk

In [2]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import nltk
from google.colab import drive
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [3]:
drive.mount('/content/drive', force_remount=True)

%cd 'drive/My Drive/PORTFOLIOS/NLP-PROJECTS/SONG-RECOMMENDATION-SYSTEM'

Mounted at /content/drive
/content/drive/My Drive/PORTFOLIOS/NLP-PROJECTS/SONG-RECOMMENDATION-SYSTEM


In [4]:
%ls data

songdata.csv


## **1 Prepare Dataset**

**Step-1**: Import dataset

In [5]:
df = pd.read_csv('data/songdata.csv',
                 usecols=['artist', 'song', 'text']).rename(columns={'text':'lyric'})
df.head()

Unnamed: 0,artist,song,lyric
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


The dataset was downloaded from [Kaggle](https://www.kaggle.com/datasets/noorsaeed/songs-recommendation-dataset)

**Step 2**: Inspect dataset

In [6]:
# function to inspect dataframe
def inspect_dataframe(df):
    summary = {
        'ColumnName': df.columns.values.tolist(),
        'Nrow': df.shape[0],
        'DataType': df.dtypes.values.tolist(),
        'NAPct': (df.isna().mean()*100).round(2).tolist(),
        'DuplicatePct': (df.duplicated().sum()/len(df)*100).round(2),
        'UniqueValues': df.nunique().tolist(),
        'Sample': [df[col].unique() for col in df.columns]
    }
    return pd.DataFrame(summary)

In [7]:
# apply dataset inspection
inspect_dataframe(df)

Unnamed: 0,ColumnName,Nrow,DataType,NAPct,DuplicatePct,UniqueValues,Sample
0,artist,57650,object,0.0,0.0,643,"[ABBA, Ace Of Base, Adam Sandler, Adele, Aeros..."
1,song,57650,object,0.0,0.0,44824,"[Ahe's My Kind Of Girl, Andante, Andante, As G..."
2,lyric,57650,object,0.0,0.0,57494,"[Look at her face, it's a wonderful face \nAn..."


In [8]:
lyric_duplicates_df = (
    df[df.duplicated(subset='lyric', keep=False)].sort_values(by='lyric')
)

print(f'Total duplicated lyrics: {lyric_duplicates_df.shape[0]}')
with pd.option_context('display.max_colwidth', None):
  display(lyric_duplicates_df.head(2))

Total duplicated lyrics: 290


Unnamed: 0,artist,song,lyric
21829,XTC,All Along The Watchtower,"""There must be some way out of here,"" said the joker to the thief, \n""There's too much confusion, I can't get no relief. \nBusinessmen, they drink my wine, plowmen dig my earth, \nNone of them along the line know what any of it is worth."" \n \n""No reason to get excited,"" the thief, he kindly spoke, \n""There are many here among us who feel that life is but a joke. \nBut you and I, we've been through that, and this is not our fate, \nSo let us not talk falsely now, the hour is getting late."" \n \nAll along the watchtower, princes kept the view \nWhile all the women came and went, barefoot servants, too. \n \nOutside in the distance a wildcat did growl, \nTwo riders were approaching, the wind began to howl.\n\n"
25758,Bob Dylan,All Along The Watchtower,"""There must be some way out of here,"" said the joker to the thief, \n""There's too much confusion, I can't get no relief. \nBusinessmen, they drink my wine, plowmen dig my earth, \nNone of them along the line know what any of it is worth."" \n \n""No reason to get excited,"" the thief, he kindly spoke, \n""There are many here among us who feel that life is but a joke. \nBut you and I, we've been through that, and this is not our fate, \nSo let us not talk falsely now, the hour is getting late."" \n \nAll along the watchtower, princes kept the view \nWhile all the women came and went, barefoot servants, too. \n \nOutside in the distance a wildcat did growl, \nTwo riders were approaching, the wind began to howl.\n\n"


In [9]:
df.drop_duplicates(subset='lyric', keep='first', inplace=True)
print(f'Total rows after dropping duplicates: {df.shape[0]}')

Total rows after dropping duplicates: 57494


**Step-3**: Preprocess column lyric

In [10]:
# download English stopwords & wordnet
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt_tab', quiet=True)

True

In [12]:
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_lyrics(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return [word for word in tokens if word not in STOPWORDS]

In [13]:
%%time

# apply preprocessing function
df['tokenized_lyrics'] = df['lyric'].apply(preprocess_lyrics)

CPU times: user 2min 5s, sys: 1.64 s, total: 2min 7s
Wall time: 2min 43s


In [14]:
# view output
with pd.option_context('display.max_colwidth', None):
  display(df.iloc[0:1, 0:4])

Unnamed: 0,artist,song,lyric,tokenized_lyrics
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd it means something special to me \nLook at the way that she smiles when she sees me \nHow lucky can one fellow be? \n \nShe's just my kind of girl, she makes me feel fine \nWho could ever believe that she could be mine? \nShe's just my kind of girl, without her I'm blue \nAnd if she ever leaves me what could I do, what could I do? \n \nAnd when we go for a walk in the park \nAnd she holds me and squeezes my hand \nWe'll go on walking for hours and talking \nAbout all the things that we plan \n \nShe's just my kind of girl, she makes me feel fine \nWho could ever believe that she could be mine? \nShe's just my kind of girl, without her I'm blue \nAnd if she ever leaves me what could I do, what could I do?\n\n","[look, face, wonderful, face, mean, something, special, look, way, smile, see, lucky, one, fellow, shes, kind, girl, make, feel, fine, could, ever, believe, could, mine, shes, kind, girl, without, im, blue, ever, leaf, could, could, go, walk, park, hold, squeeze, hand, well, go, walking, hour, talking, thing, plan, shes, kind, girl, make, feel, fine, could, ever, believe, could, mine, shes, kind, girl, without, im, blue, ever, leaf, could, could]"


## **2 Modeling**

**Step-1**: Train Word2Vec model

In [15]:
%%time

# train model on lyrics
word2vec_model = Word2Vec(sentences=df['tokenized_lyrics'],
                          vector_size=100,
                          window=5,
                          min_count=5,
                          workers=4)

CPU times: user 1min 46s, sys: 599 ms, total: 1min 47s
Wall time: 1min 5s


**Note**: The parameter `min_counts=5` is used to filter out infrequent words which can introduce noise to the model.

In [16]:
# print model
print(word2vec_model)

Word2Vec<vocab=26239, vector_size=100, alpha=0.025>


**Step-2**: Generate song vectors for song similarity search

In [17]:
# generate song vector by averaging word song vectors
def get_song_vector(lyrics):
    vectors = [word2vec_model.wv[word] for word in lyrics if word in word2vec_model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        # if no words are found, return zero vector
        return np.zeros(100)

In [18]:
%%time

# apply song vector generation
df['song_vector'] = df['tokenized_lyrics'].apply(get_song_vector)

CPU times: user 14.6 s, sys: 86.9 ms, total: 14.7 s
Wall time: 16.9 s


**Step-4**: Get recommendations by song lyric similarity

In [19]:
# function to recommend songs based on cosine similarity of song vectors
def get_recommendations(song_title, top_n=5):
    # normalize the song title to lowercase
    song_title = song_title.lower()

    # find index of song
    idx = df[df['song'].str.lower() == song_title].index[0]
    target_vector = df['song_vector'].iloc[idx].reshape(1, -1)

    # compute cosine similarity between target song and other songs
    similarity_scores = cosine_similarity(
        target_vector, np.vstack(df['song_vector'].values)
    )

    # get top similar songs
    sim_scores = list(enumerate(similarity_scores[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # exclude target song itself
    sim_scores = sim_scores[1:top_n+1]
    song_indices = [i[0] for i in sim_scores]
    top_similarity_scores = [x[1] for x in sim_scores]

    # create df with song details and similarity scores
    recommendations = df[['artist', 'song']].iloc[song_indices].copy()
    recommendations['similarity_score'] = top_similarity_scores

    return recommendations

In [20]:
# specify song to search
to_search = df.sample(n=1, random_state=42)['song'].to_string(index=False)
print(f'Song to search: {to_search}')

Song to search: Shirley Do You Own A Ferrari?


In [21]:
# get recommendation
recommendations = get_recommendations(to_search, top_n=5)
display(recommendations)

Unnamed: 0,artist,song,similarity_score
4155,Devo,Dawghaus,0.89895
48904,Poison,Your Mama Don't Dance,0.894072
9923,Judds,Blue Nun Cafe,0.892535
54492,Unwritten Law,Slow Dance,0.889466
20436,Van Halen,Dance The Night Away,0.881183


In [22]:
with pd.option_context('display.max_colwidth', None):
  display(
      df[
          (df['song'] == to_search) |
          (df.index == recommendations.head(1).index.values[0])
       ].loc[:, ['artist', 'song', 'lyric']]
)

Unnamed: 0,artist,song,lyric
2855,Chris Rea,Shirley Do You Own A Ferrari?,When the grey skies turn to blue \nAnd the dark clouds blow away \nIn the morning of a new life \nWhen the sun comes shining through \nWhen the grey skies turn to blue \nIn the morning of a new life \nWhen the sun comes shining through \nWhen the grey skies turn to blue \nMeet me on a bright and windy day \nWhen the breeze has blown \nThe gray skies far away \nHigh upon a hillside \nWhen the sun comes shining through \nAnd the grey skies turn to blue \nWhen the grey skies turn to blue \nWhen the grey skies turn to blue \n\n
4155,Devo,Dawghaus,When I woke up this morning \nI wasn't in my bed \nYou know I had an itchy itch feeling \nRunning round my head \n \nI looked at my watch \nAnd this is what it said \nYou got them dawghouse blues \nGot them in your shoes \n \nBoy you know you're bound to lose \nWell I got up \nAnd header home right away \nYou know I was wondering \n \nWhy my baby would say \nShe said wait! \nShe said wait! \nWell you been out all night \n \nGot into a fight \nAnd now you got them dawghouse blues \nI said baby baby baby \nBaby please don't go \n \nI said baby baby baby \nThere's something you should know \nWell that trick you do \nIt's not so new \n \nSomeday them dawghouse blues are gonna \nGet to you \nYou know my woman was waitin' \nStick in hand \n \nShe beat me good (whew) \nNow I don't understand \nI thought there was freedom \nIn this here land \n \nI got them dawghouse blues \nGot 'em in my shoes \nMan you know I'm gonna lose \nI said \nI said\n\n


## **3 Predictions with Explanation**

In [27]:
def get_recommendations_with_explanation(song_title, top_n=5, num_keywords=5):
    song_title = song_title.lower()
    # find index of input song
    input_idx = df[df['song'].str.lower() == song_title].index[0]
    target_vector = df['song_vector'].iloc[input_idx]
    # compute cosine similarity
    all_vectors = np.vstack(df['song_vector'].values)
    similarity_scores = cosine_similarity([target_vector], all_vectors)[0]
    # get top recommendations
    sim_scores = sorted(enumerate(similarity_scores), key=lambda x: -x[1])[1:top_n+1]
    song_indices = [i[0] for i in sim_scores]
    top_similarity_scores = [x[1] for x in sim_scores]
    # generate explanations
    explanations = []
    input_words = df['tokenized_lyrics'].iloc[input_idx]
    for rec_idx in song_indices:
        # get recommended song vector and words
        rec_vector = df['song_vector'].iloc[rec_idx]
        rec_words = df['tokenized_lyrics'].iloc[rec_idx]

        # find top contributing words from INPUT song
        input_contributions = []
        for word in set(input_words):
            if word in word2vec_model.wv:
                word_vec = word2vec_model.wv[word]
                contribution = np.dot(word_vec, rec_vector)
                input_contributions.append((word, contribution))

        # find top contributing words from RECOMMENDED song
        rec_contributions = []
        for word in set(rec_words):
            if word in word2vec_model.wv:
                word_vec = word2vec_model.wv[word]
                contribution = np.dot(word_vec, target_vector)
                rec_contributions.append((word, contribution))

        # sort and select top keywords
        input_contributions.sort(key=lambda x: -x[1])
        rec_contributions.sort(key=lambda x: -x[1])
        top_input = [word for word, _ in input_contributions[:num_keywords]]
        top_rec = [word for word, _ in rec_contributions[:num_keywords]]
        explanations.append((top_input, top_rec))

    return df[['artist', 'song']].iloc[song_indices], explanations, top_similarity_scores

In [146]:
# get recommendations
recommendations, explanations, similarity_scores = get_recommendations_with_explanation(to_search, top_n=5)
for (idx, row), (input_words, rec_words), score in zip(recommendations.iterrows(), explanations, similarity_scores):
    print(f"Recommended Song: {row['song']} by {row['artist']}")
    print(f"  Similarity Score: {score:.4f}")
    print(f"  Key words in YOUR song: {', '.join(input_words)}")
    print(f"  Key words in THIS song: {', '.join(rec_words)}\n")

Recommended Song: Dawghaus by Devo
  Similarity Score: 0.8880
  Key words in YOUR song: gonna, know, dont, said, tell
  Key words in THIS song: baby, gonna, right, know, said

Recommended Song: Blue Nun Cafe by Judds
  Similarity Score: 0.8867
  Key words in YOUR song: gonna, know, dont, away, tell
  Key words in THIS song: baby, gonna, love, know, mama

Recommended Song: Slow Dance by Unwritten Law
  Similarity Score: 0.8851
  Key words in YOUR song: gonna, let, go, dance, yeah
  Key words in THIS song: baby, gonna, dance, love, alright

Recommended Song: Your Mama Don't Dance by Poison
  Similarity Score: 0.8778
  Key words in YOUR song: dont, gonna, let, know, go
  Key words in THIS song: gonna, dance, let, know, said

Recommended Song: Long Tall Glasses by Leo Sayer
  Similarity Score: 0.8767
  Key words in YOUR song: dance, gonna, know, dont, yeah
  Key words in THIS song: dance, right, know, said, yeah

