## Import libraries

In [1]:
import pandas as pd 
import numpy as np 
from warnings import filterwarnings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import minmax_scale
from sklearn.metrics.pairwise import cosine_similarity
from cleantext import clean

In [2]:
# Install a global filter that ignores every warning raised from here on.
filterwarnings(action='ignore')

## Import data

In [3]:
# Read the songs CSV into a DataFrame and preview its first three rows.
data = pd.read_csv(filepath_or_buffer='./song_data/spotify_songs.csv')
data.head(n=3)

Unnamed: 0,Track URI,Track Name,Artist URI(s),Artist Name(s),Album URI,Album Name,Album Artist URI(s),Album Artist Name(s),Album Release Date,Album Image URL,...,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Album Genres,Label,Copyrights
0,spotify:track:1XAZlnVtthcDZt2NI1Dtxo,Justified & Ancient - Stand by the Jams,spotify:artist:6dYrdRlNZSKaVxYg5IrvCH,The KLF,spotify:album:4MC0ZjNtVP1nDD5lsLxFjc,Songs Collection,spotify:artist:6dYrdRlNZSKaVxYg5IrvCH,The KLF,1992-08-03,https://i.scdn.co/image/ab67616d0000b27355346b...,...,0.048,0.0158,0.112,0.408,0.504,111.458,4.0,,Jams Communications,"C 1992 Copyright Control, P 1992 Jams Communic..."
1,spotify:track:6a8GbQIlV8HBUW3c6Uk9PH,I Know You Want Me (Calle Ocho),spotify:artist:0TnOYISbd1XYRBk9myaseg,Pitbull,spotify:album:5xLAcbvbSAlRtPXnKkggXA,Pitbull Starring In Rebelution,spotify:artist:0TnOYISbd1XYRBk9myaseg,Pitbull,2009-10-23,https://i.scdn.co/image/ab67616d0000b27326d73a...,...,0.149,0.0142,2.1e-05,0.237,0.8,127.045,4.0,,Mr.305/Polo Grounds Music/J Records,"P (P) 2009 RCA/JIVE Label Group, a unit of Son..."
2,spotify:track:70XtWbcVZcpaOddJftMcVi,From the Bottom of My Broken Heart,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,spotify:album:3WNxdumkSMGMJRhEgK80qx,...Baby One More Time (Digital Deluxe Version),spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,1999-01-12,https://i.scdn.co/image/ab67616d0000b2738e4986...,...,0.0305,0.56,1e-06,0.338,0.706,74.981,4.0,,Jive,P (P) 1999 Zomba Recording LLC


## Data preprocessing

- Selecting relevant columns

In [4]:
# Names of the columns that the next cell removes from `data`.
cols = [
    "Track URI",
    "Artist URI(s)",
    "Album URI",
    "Album Artist URI(s)",
    "Album Artist Name(s)",
    "Disc Number",
    "Track Number",
    "Track Preview URL",
    "ISRC",
    "Added By",
    "Added At",
    "Time Signature",
    "Album Genres",
    "Copyrights",
]

In [5]:
# Remove the columns listed in `cols`, then display two randomly sampled rows.
data = data.drop(labels=cols, axis='columns')
data.sample(n=2)

Unnamed: 0,Track Name,Artist Name(s),Album Name,Album Release Date,Album Image URL,Track Duration (ms),Explicit,Popularity,Artist Genres,Danceability,...,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Label
1067,Words Of Love,The Mamas & The Papas,The Mamas & The Papas,1966-08-30,https://i.scdn.co/image/ab67616d0000b27319ab57...,135893,False,0,"classic rock,folk,folk rock,mellow gold,psyche...",0.439,...,8.0,-6.626,0.0,0.045,0.529,0.0,0.215,0.627,115.904,Universal Music Group
2744,Back To You - From 13 Reasons Why – Season 2 S...,Selena Gomez,Back To You (From 13 Reasons Why – Season 2 So...,2018-05-10,https://i.scdn.co/image/ab67616d0000b27330885d...,207904,False,77,"pop,post-teen pop",0.601,...,6.0,-4.856,1.0,0.0486,0.0945,2e-06,0.12,0.508,102.061,UMGRI Interscope


- Removing NA values

In [6]:
# Per-column count of missing values (isna is the same check as isnull).
data.isna().sum()

Track Name               1
Artist Name(s)           1
Album Name               1
Album Release Date       2
Album Image URL          4
Track Duration (ms)      0
Explicit                 0
Popularity               0
Artist Genres          550
Danceability             2
Energy                   2
Key                      2
Loudness                 2
Mode                     2
Speechiness              2
Acousticness             2
Instrumentalness         2
Liveness                 2
Valence                  2
Tempo                    2
Label                    6
dtype: int64

In [7]:
# Drop, in place, every row that has at least one missing value.
data.dropna(axis=0, how='any', inplace=True)

- Removing duplicate values

In [8]:
# Count the rows that are exact repeats of an earlier row.
duplicate_count = data.duplicated().sum()
print(f"Total number of duplicated items: {duplicate_count}")

Total number of duplicated items: 47


In [9]:
# Remove repeated rows in place, comparing all columns and keeping the first occurrence.
data.drop_duplicates(subset=None, keep='first', inplace=True)

- Keeping the album image URLs aside for later use

In [10]:
# pop() removes the column from `data` and returns it, so the URLs are kept
# as a plain array (in current row order) while the frame loses the column.
album_img_urls = data.pop('Album Image URL').values

- Selecting only the year from the album release date

In [11]:
# Parse each release date into a Timestamp and keep only its year.
release_year = data['Album Release Date'].apply(pd.Timestamp).dt.year
data['Album Release Year'] = release_year
# The full date is no longer needed once the year has been extracted.
del data['Album Release Date']

- Processing the text columns

In [12]:
# Display the column labels that remain at this point.
data.columns

Index(['Track Name', 'Artist Name(s)', 'Album Name', 'Track Duration (ms)',
       'Explicit', 'Popularity', 'Artist Genres', 'Danceability', 'Energy',
       'Key', 'Loudness', 'Mode', 'Speechiness', 'Acousticness',
       'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Label',
       'Album Release Year'],
      dtype='object')

In [13]:
# Normalize every artist string with cleantext's `clean`.
data["Artist Name(s)"] = data["Artist Name(s)"].map(clean)

In [14]:
# Turn each comma-separated artist string into a list of names.
data["Artist Name(s)"] = data["Artist Name(s)"].str.split(pat=",")

In [15]:
# Clean each title, then wrap it in a one-element list so it can be
# concatenated with the other list-valued text columns.
cleaned_titles = data["Track Name"].apply(clean)
data["Track Name"] = cleaned_titles.apply(lambda title: [title])

In [16]:
# Clean each album name, then wrap it in a one-element list so it can be
# concatenated with the other list-valued text columns.
cleaned_albums = data["Album Name"].apply(clean)
data["Album Name"] = cleaned_albums.apply(lambda album: [album])

In [17]:
# Split the comma-separated genres and strip the spaces inside each genre,
# so a multi-word genre becomes a single token.
genre_lists = data["Artist Genres"].str.split(",")
data["Artist Genres"] = genre_lists.apply(
    lambda genres: [genre.replace(" ", "") for genre in genres]
)

In [18]:
# Clean the label text, split it on "/" and strip the spaces inside each part,
# so a multi-word label becomes a single token.
label_parts = data["Label"].apply(clean).str.split("/")
data["Label"] = label_parts.apply(
    lambda parts: [part.replace(" ", "") for part in parts]
)

- Merging all the text columns (list)

In [19]:
# Row by row, concatenate the list-valued text columns into one list per song.
text_cols = data["Track Name"] + data["Album Name"]
text_cols = text_cols + data["Artist Name(s)"]
text_cols = text_cols + data["Artist Genres"]
text_cols = text_cols + data["Label"]

- Joining each merged list into a single text string

In [20]:
# Collapse each song's list of strings into one space-separated string.
text_cols = text_cols.apply(" ".join)

- Removing numbers from the texts

In [21]:
# Run each text through `clean` again, this time replacing numbers with "".
text_cols = text_cols.apply(
    lambda text: clean(text, no_numbers=True, replace_with_number="")
)

In [22]:
# Bag-of-words counts over the combined texts. English stop words are removed,
# as are terms found in fewer than 3 documents (min_df) or in more than 80%
# of documents (max_df).
count_vect = CountVectorizer(
    stop_words='english',
    min_df=3,
    max_df=0.8,
)
text_vect = count_vect.fit_transform(text_cols)

In [23]:
# Convert the sparse term-count matrix to a dense array and min-max scale each
# column. `.toarray()` is the documented conversion and is available on every
# SciPy sparse container, whereas `.A` is a legacy shorthand.
text_vect_array = minmax_scale(text_vect.toarray())

- Converting Explicit column from boolean to int

In [24]:
# Encode the boolean flag as 0/1 so it is picked up with the numeric columns.
explicit_codes = {False: 0, True: 1}
data["Explicit"] = data["Explicit"].map(explicit_codes)

- Selecting the numerical song attributes

In [25]:
# Take every non-object column as a numeric song attribute.
numeric_columns = data.select_dtypes(exclude='object')
# Min-max scale each column so that all attributes share the same range;
# otherwise columns with a larger scale would dominate the similarity.
song_attrs = minmax_scale(numeric_columns.values)

In [26]:
# (number of songs, number of numeric attributes)
song_attrs.shape

(9398, 15)

- Finally merging the text vectors and song attributes to get the complete vector representations of the songs

In [27]:
# Place the scaled text features and the scaled numeric attributes side by
# side, giving one combined feature row per song.
song_vect = np.hstack((text_vect_array, song_attrs))

In [28]:
# (number of songs, text features + numeric attributes)
song_vect.shape

(9398, 4508)

- Finding similarities between the songs

In [29]:
# Pairwise cosine similarity between all song vectors; entry [i, j] compares
# song i with song j.
similarities = cosine_similarity(X=song_vect)

- Making a list of songs

In [30]:
# Unwrap the one-element title lists back into plain strings, as an array
# in the same row order as `data`.
songs = data["Track Name"].apply(" ".join).values
songs

array(['justified & ancient - stand by the jams',
       'i know you want me (calle ocho)',
       'from the bottom of my broken heart', ...,
       "groovejet (if this ain't love) [feat. sophie ellis-bextor]",
       'lay low', 'padam padam'], dtype=object)

- Creating needed dictionaries by zipping song indices with song names

In [31]:
# Positional indices 0..n-1, one per entry of `songs`.
song_indices = np.arange(songs.shape[0])
# song : index (if a title occurs more than once, the last occurrence wins)
song_dict1 = dict(zip(songs, song_indices))
# index : song
song_dict2 = dict(zip(song_indices, songs))

- Recommendation function

In [47]:
def recommend(song):
    """Return the five songs most similar to `song`, with their album image URLs.

    Parameters
    ----------
    song : str
        Track title. It is lower-cased before being looked up in `song_dict1`.

    Returns
    -------
    tuple[list, list]
        `(titles, urls)`: the matched titles from `song_dict2` and the
        corresponding entries of `album_img_urls`, ordered from the least to
        the most similar of the five matches.

    Raises
    ------
    KeyError
        If the lower-cased title is not a key of `song_dict1`.
    """
    song_ind = song_dict1[song.lower()]
    # argsort is ascending, so the strongest matches sit at the end.
    ranked = np.argsort(similarities[song_ind])
    # Remove the query song by index rather than assuming it sorts last:
    # another row can tie with it at the top similarity, and tie order is
    # not guaranteed.
    ranked = ranked[ranked != song_ind]
    top_5_matches = ranked[-5:]
    matched_songs = [song_dict2[i] for i in top_5_matches]
    urls = [album_img_urls[i] for i in top_5_matches]
    return matched_songs, urls

In [48]:
# Example query: titles and album image URLs of the songs closest to "lay low".
recommend("lay low")

(['god is a dancer (with mabel)', '10:35', 'ritual', '10:35', 'the business'],
 ['https://i.scdn.co/image/ab67616d0000b2731e3f58e2a9d74333ba8f2392',
  'https://i.scdn.co/image/ab67616d0000b273cf8c47967e5c6bbc7dca5abb',
  'https://i.scdn.co/image/ab67616d0000b273211bcd4f50464c15d7c7f111',
  'https://i.scdn.co/image/ab67616d0000b273999565cd8bea3f8f0985bb31',
  'https://i.scdn.co/image/ab67616d0000b273f461bbc21a9bcec43a926973'])

- Exporting all the necessary files

In [51]:
import joblib

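* All the objects are meant to live in the `saved_objects` folder. Note that the cell below writes them with bare file names, i.e. into the current working directory, so run it from that folder or move the files there afterwards.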

In [52]:
# Persist the artefacts that `recommend` relies on. The bare file names mean
# the files are written to the current working directory.
joblib.dump(album_img_urls, "image_urls.pkl")          # album image URLs, by song position
joblib.dump(similarities, "song_similarities.pkl")     # pairwise cosine similarity matrix
joblib.dump(song_dict1, "song_dict1.pkl")              # song : index
joblib.dump(song_dict2, "song_dict2.pkl")              # index : song

['song_dict2.pkl']