Importing all the necessary libraries and set up the Spotify API connection.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import json
import re 
import sys
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display, Image, HTML

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")

client_id = '787e2003dd06462e9e3faff275a8f4a0'
client_secret = 'e70e212396e345648fe6555e05562a24'
auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
spotify_client = spotipy.Spotify(auth_manager=auth_manager)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Reading the datasets from csv files and understanding the variables (as columns).

In [2]:
artist_data_df = pd.read_csv("data_by_artist.csv")
genre_data_df = pd.read_csv("data_by_genres.csv")
artist_genre_df = pd.read_csv("data_w_genres.csv")
year_data_df = pd.read_csv("data_by_year.csv")
song_data_df = pd.read_csv("data.csv")

In [3]:
print(song_data_df.columns)
print(artist_data_df.columns)
print(year_data_df.columns)
print(artist_genre_df.columns)
print(genre_data_df.columns)

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo'],
      dtype='object')
Index(['mode', 'count', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'popularity', 'key'],
      dtype='object')
Index(['mode', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'popularity', 'key'],
      dtype='object')
Index(['genres', 'artists', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
       'tempo', 'valence', 'popularity', 'key', 'mode', 'count'],
      dtype='object')
Index(['mode', 'genres', 'acousticness

In [4]:
genre_data_df.shape

(2973, 14)

In [5]:
genre_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2973 entries, 0 to 2972
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              2973 non-null   int64  
 1   genres            2973 non-null   object 
 2   acousticness      2973 non-null   float64
 3   danceability      2973 non-null   float64
 4   duration_ms       2973 non-null   float64
 5   energy            2973 non-null   float64
 6   instrumentalness  2973 non-null   float64
 7   liveness          2973 non-null   float64
 8   loudness          2973 non-null   float64
 9   speechiness       2973 non-null   float64
 10  tempo             2973 non-null   float64
 11  valence           2973 non-null   float64
 12  popularity        2973 non-null   float64
 13  key               2973 non-null   int64  
dtypes: float64(11), int64(2), object(1)
memory usage: 325.3+ KB


In [6]:
artist_genre_df['genres'].values[0]

"['show tunes']"

As we can see, the values are string values that appears to be in the form of a list. We create a Regex Statement that will turn the string (that looks like a list) into the datatype of a list.

In [7]:
artist_genre_df['genres_upd'] = artist_genre_df['genres'].apply(lambda x: [re.sub(' ','_',i) for i in re.findall(r"'([^']*)'", x)])
artist_genre_df['genres_upd'].values[0][0]

'show_tunes'

We do the same for artists.

In [8]:
song_data_df['artists_upd_1'] = song_data_df['artists'].apply(lambda x: re.findall(r"'([^']*)'", x))
song_data_df['artists_upd_1'].values[0][0]

'Sergei Rachmaninoff'

However, we have to keep in mind some artists have an apostrophe in their title. We write another Regex Statement to deal with this.

In [9]:
song_data_df['artists_upd_2'] = song_data_df['artists'].apply(lambda x: re.findall('\"(.*?)\"',x))

We then combine both the Regex Statements, i.e. *artists_upd_1* and *artists_upd_2* to *artists_upd*.

In [10]:
song_data_df['artists_upd'] = np.where(song_data_df['artists_upd_1'].apply(lambda x: not x), song_data_df['artists_upd_2'], song_data_df['artists_upd_1'] )

Now, we need to create our own song identifier because there are duplicates with different ids. 

In [11]:
song_data_df['artists_song'] = song_data_df.apply(lambda row: row['artists_upd'][0]+row['name'],axis = 1)

In [12]:
song_data_df.sort_values(['artists_song','release_date'], ascending = False, inplace = True)

In [13]:
song_data_df[song_data_df['name']=='Adore You']

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,mode,name,popularity,release_date,speechiness,tempo,artists_upd_1,artists_upd_2,artists_upd,artists_song
19425,0.569,2019,0.0237,['Harry Styles'],0.676,207133,0.771,0,3jjujdWJ72nww5eGnfs2E7,7e-06,...,1,Adore You,88,2019-12-13,0.0483,99.048,[Harry Styles],[],[Harry Styles],Harry StylesAdore You
38319,0.569,2019,0.0237,['Harry Styles'],0.676,207133,0.771,0,1M4qEo4HE3PRaCOM7EXNJq,7e-06,...,1,Adore You,77,2019-12-06,0.0483,99.048,[Harry Styles],[],[Harry Styles],Harry StylesAdore You


We drop duplicates.

In [14]:
song_data_df.drop_duplicates('artists_song',inplace = True)
song_data_df[song_data_df['name']=='Adore You']

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,mode,name,popularity,release_date,speechiness,tempo,artists_upd_1,artists_upd_2,artists_upd,artists_song
19425,0.569,2019,0.0237,['Harry Styles'],0.676,207133,0.771,0,3jjujdWJ72nww5eGnfs2E7,7e-06,...,1,Adore You,88,2019-12-13,0.0483,99.048,[Harry Styles],[],[Harry Styles],Harry StylesAdore You


We explode this column so each artist of a song will have a unique row and merge *data_w_genre* (*artist_genre_df*).

In [15]:
artists_exp = song_data_df[['artists_upd','id']].explode('artists_upd')
artists_exp_enriched = artists_exp.merge(artist_genre_df, how = 'left', left_on = 'artists_upd',right_on = 'artists')
artists_exp_enriched_nonnull = artists_exp_enriched[~artists_exp_enriched.genres_upd.isnull()]

Next, we Group by the song id and essentially create "lists" lists and then, sonsilidate these lists to output the unique values.

In [16]:
artists_genres = artists_exp_enriched_nonnull.groupby('id')['genres_upd'].apply(list).reset_index()
artists_genres['genre_lists'] = artists_genres['genres_upd'].apply(lambda x: list(set(list(itertools.chain.from_iterable(x)))))
artists_genres.head()

Unnamed: 0,id,genres_upd,genre_lists
0,000G1xMMuwxNHmwVsBdtj1,"[[candy_pop, dance_rock, new_wave, new_wave_po...","[dance_rock, new_wave_pop, permanent_wave, new..."
1,000GyYHG4uWmlXieKLij8u,"[[alternative_hip_hop, conscious_hip_hop, minn...","[conscious_hip_hop, alternative_hip_hop, pop_r..."
2,000Npgk5e2SgwGaIsN3ztv,"[[classic_bollywood, classic_pakistani_pop, fi...","[filmi, classic_pakistani_pop, classic_bollywo..."
3,000ZxLGm7jDlWCHtcXSeBe,"[[boogie-woogie, piano_blues, ragtime, stride]]","[boogie-woogie, piano_blues, ragtime, stride]"
4,000jBcNljWTnyjB4YO7ojf,[[]],[]


In [17]:
song_data_df = song_data_df.merge(artists_genres[['id','genre_lists']], on = 'id',how = 'left')

Now, we perform some Feature Engineering by:
- Normalizing float variables
- OHE of Year and Popularity Variables
- Create TF-IDF features from artist genres

In [18]:
song_data_df.tail(3)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,name,popularity,release_date,speechiness,tempo,artists_upd_1,artists_upd_2,artists_upd,artists_song,genre_lists
156604,0.697,1999,0.0516,"[""Ol' Dirty Bastard"", 'Kelis', 'Rich Travali']",0.934,239547,0.459,1,6YYd5MLpu45J0uLrMdivF7,0.0,...,Got Your Money (feat. Kelis),66,1999,0.189,103.04,"[ Dirty Bastard"", , , ]",[Ol' Dirty Bastard],"[ Dirty Bastard"", , , ]","Dirty Bastard"", Got Your Money (feat. Kelis)",
156605,0.429,1994,0.0249,"[""World Class Wreckin' Cru"", ""Michel 'Le""]",0.715,351040,0.49,0,3hoiinUc5VA9xUEJID7R8V,0.00017,...,Turn Off The Lights - Rap,36,1994-04-06,0.0479,129.309,"[ Cru"", ""Michel ]","[World Class Wreckin' Cru, Michel 'Le]","[ Cru"", ""Michel ]","Cru"", ""Michel Turn Off The Lights - Rap",
156606,0.273,1996,0.0113,"[""Rappin' 4-Tay"", 'MC Breed', 'Too $hort']",0.897,337973,0.414,1,78859Af0fmA9VTlgnOHTAP,0.00011,...,Never Talk Down,35,1996,0.246,96.039,"[ 4-Tay"", , , ]",[Rappin' 4-Tay],"[ 4-Tay"", , , ]","4-Tay"", Never Talk Down",


In [19]:
song_data_df['year'] = song_data_df['release_date'].apply(lambda x: x.split('-')[0])
float_cols = song_data_df.dtypes[song_data_df.dtypes == 'float64'].index.values
ohe_cols = 'popularity'
song_data_df['popularity'].describe()

count    156607.000000
mean         31.307215
std          21.712234
min           0.000000
25%          11.000000
50%          33.000000
75%          48.000000
max         100.000000
Name: popularity, dtype: float64

In [20]:
song_data_df['popularity_red'] = song_data_df['popularity'].apply(lambda x: int(x/5))
# tfidf can't handle nulls so fill any null values with an empty list
song_data_df['genre_lists'] = song_data_df['genre_lists'].apply(lambda d: d if isinstance(d, list) else [])
song_data_df.tail()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,popularity,release_date,speechiness,tempo,artists_upd_1,artists_upd_2,artists_upd,artists_song,genre_lists,popularity_red
156602,0.768,1997,0.282,"[""Lil' Kim"", ""Lil' Cease""]",0.748,275947,0.693,0,2LP2uDQQ7eLMcUVE4aOpAV,0.0,...,56,1997-06-30,0.278,88.802,"[ Kim"", ""Lil]","[Lil' Kim, Lil' Cease]","[ Kim"", ""Lil]","Kim"", ""LilCrush on You (feat. Lil' Cease) - R...",[],11
156603,0.792,2004,0.0248,"[""Lil' Flip"", 'Lea']",0.814,225173,0.387,1,4s0o8TJHfX9LLHa0umnOzT,0.0,...,62,2004-03-30,0.0945,93.961,"[ Flip"", ]",[Lil' Flip],"[ Flip"", ]","Flip"", Sunshine (feat. Lea)",[],12
156604,0.697,1999,0.0516,"[""Ol' Dirty Bastard"", 'Kelis', 'Rich Travali']",0.934,239547,0.459,1,6YYd5MLpu45J0uLrMdivF7,0.0,...,66,1999,0.189,103.04,"[ Dirty Bastard"", , , ]",[Ol' Dirty Bastard],"[ Dirty Bastard"", , , ]","Dirty Bastard"", Got Your Money (feat. Kelis)",[],13
156605,0.429,1994,0.0249,"[""World Class Wreckin' Cru"", ""Michel 'Le""]",0.715,351040,0.49,0,3hoiinUc5VA9xUEJID7R8V,0.00017,...,36,1994-04-06,0.0479,129.309,"[ Cru"", ""Michel ]","[World Class Wreckin' Cru, Michel 'Le]","[ Cru"", ""Michel ]","Cru"", ""Michel Turn Off The Lights - Rap",[],7
156606,0.273,1996,0.0113,"[""Rappin' 4-Tay"", 'MC Breed', 'Too $hort']",0.897,337973,0.414,1,78859Af0fmA9VTlgnOHTAP,0.00011,...,35,1996,0.246,96.039,"[ 4-Tay"", , , ]",[Rappin' 4-Tay],"[ 4-Tay"", , , ]","4-Tay"", Never Talk Down",[],7


With the function below, we create One Hot Encoded features of a specific column.


In [21]:

def ohe_prep(df, column, new_name): 
    
    tf_df = pd.get_dummies(df[column])
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)    
    return tf_df

We process song_data_df to create a final set of features used to generate recommendations. This is the **Feature Set**

In [22]:
# Updated create_feature_set function with ohe_prep integration
def create_feature_set(df, float_cols):
    tfidf = TfidfVectorizer()
    # The lambda function assumes each entry in 'genre_lists' is a list; adjust if necessary
    tfidf_matrix = tfidf.fit_transform(df['genre_lists'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    try:
        feature_names = tfidf.get_feature_names_out()
    except AttributeError:
        feature_names = tfidf.get_feature_names()
    genre_df.columns = ['genre' + "|" + i for i in feature_names]
    genre_df.reset_index(drop=True, inplace=True)

    # Applying ohe_prep for 'year' and 'popularity_red'
    year_ohe = ohe_prep(df, 'year', 'year') * 0.25
    popularity_ohe = ohe_prep(df, 'popularity_red', 'pop') * 0.1

    # Scale float columns
    floats = df[float_cols].reset_index(drop=True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns=floats.columns) * 0.2


    final = pd.concat([genre_df, floats_scaled, popularity_ohe, year_ohe], axis=1)
    
    final['id'] = df['id'].values
    
    return final


In [23]:
tfidf = TfidfVectorizer()  # Adjust max_features as needed
tfidf_matrix = tfidf.fit_transform(song_data_df['genre_lists'].apply(lambda x: " ".join(x)))


In [24]:
complete_feature_set = create_feature_set(song_data_df, float_cols=float_cols)

In [25]:
complete_feature_set.head(5)

Unnamed: 0,genre|21st_century_classical,genre|432hz,genre|_hip_hop,genre|a_cappella,genre|abstract,genre|abstract_beats,genre|abstract_hip_hop,genre|accordeon,genre|accordion,genre|acid_house,...,year|2012,year|2013,year|2014,year|2015,year|2016,year|2017,year|2018,year|2019,year|2020,id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2ghebdwe2pNXT4eL34T7pW
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3KIuCzckjdeeVuswPo20mC
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4prhqrLXYMjHJ6vpRAlasx
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5xFXTvnEe03SyvFpo6pEaE
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6Pqs2suXEqCGx7Lxg5dlrB


In here we ask for their year wrapped playlist for profiling.

In [41]:
#https://open.spotify.com/playlist/3Qtj7yKpmowElQMBNdKA6o?si=90JlHMozTteez6lwUFGeZQ example link
playlist_url = input("Enter the Playlist Link: \n")
playlist = spotify_client.playlist(playlist_url)

total_songs = playlist["tracks"]["total"]
song_details = []

tracks = playlist["tracks"]
items = tracks["items"]
offset = 0

for index in range(total_songs):
    current_item = items[index - offset]
    track_id = current_item["track"]["id"]
    song_title = current_item["track"]["name"]
    artists = ', '.join([artist["name"] for artist in current_item["track"]["artists"]])
    date_added = current_item["added_at"]
    
    song_details.append({"artists": artists, "name": song_title, "id": track_id, "date_added": date_added})
    
    # Check if we need to fetch the next set of tracks
    if (index + 1) % 100 == 0 and (index + 1) < total_songs:
        tracks = spotify_client.next(tracks)
        items = tracks["items"]
        offset = index + 1


playlist = pd.DataFrame(song_details)


playlist


Unnamed: 0,artists,name,id,date_added
0,"Arijit Singh, Chinmayi",Mast Magan,3uL1IBFhg52VcQqOwAG01E,2019-07-27T18:37:11Z
1,Taylor Swift,Wildest Dreams,3fVnlF4pGqWI9flVENcT28,2019-07-27T18:37:19Z
2,,,4JQxGJ8jsKW8pOUVoAgOZr,2019-07-27T18:37:26Z
3,Prateek Kuhad,cold/mess,4Psh3fEnAMftNPOTsAHPgG,2019-07-27T18:37:39Z
4,Lauv,I Like Me Better,1wjzFQodRWrPcQ0AnYnvQ9,2019-07-27T18:37:51Z
...,...,...,...,...
265,"Rovalio, Abdul Hannan",Iraaday,6qrifdo7QINdPQr80IelGi,2023-10-21T06:16:16Z
266,The Local Train,Choo Lo,2qgXrzJsry4KgYoJCpuaul,2023-10-21T06:16:21Z
267,Mohit Chauhan,Yun Hi,4EHa33GDXoVWGXYOO3cEGm,2023-10-28T18:47:27Z
268,"Rahul Ram, Amit Kilam, Himanshu Joshi",Mann Kasturi,1azBh5Mmv7p01LNM3iZhn9,2024-03-11T18:23:39Z



We summarise the give spotify playlist into a vector.


In [42]:
def generate_playlist_feature(complete_feature_set, playlist_df, weight_factor):
    
    complete_feature_set_playlist = complete_feature_set[complete_feature_set['id'].isin(playlist_df['id'].values)]
    complete_feature_set_playlist = complete_feature_set_playlist.merge(playlist_df[['id','date_added']], on = 'id', how = 'inner')
    complete_feature_set_nonplaylist = complete_feature_set[~complete_feature_set['id'].isin(playlist_df['id'].values)]
    
    playlist_feature_set = complete_feature_set_playlist.sort_values('date_added',ascending=False)

    most_recent_date = playlist_feature_set.iloc[0,-1]
    
    for ix, row in playlist_feature_set.iterrows():
        playlist_feature_set.loc[ix,'months_from_recent'] = int((most_recent_date.to_pydatetime() - row.iloc[-1].to_pydatetime()).days / 30)
        
    playlist_feature_set['weight'] = playlist_feature_set['months_from_recent'].apply(lambda x: weight_factor ** (-x))
    
    playlist_feature_set_weighted = playlist_feature_set.copy()

    playlist_feature_set_weighted.update(playlist_feature_set_weighted.iloc[:,:-4].mul(playlist_feature_set_weighted.weight,0))
    playlist_feature_set_weighted_final = playlist_feature_set_weighted.iloc[:, :-4]
    
    return playlist_feature_set_weighted_final.sum(axis = 0), complete_feature_set_nonplaylist

In [43]:
playlist['date_added'] = pd.to_datetime(playlist['date_added'])
complete_feature_set_playlist_vector, complete_feature_set_nonplaylist = generate_playlist_feature(complete_feature_set, playlist, 1.09)
complete_feature_set_playlist_vector.head(10)

genre|21st_century_classical    0.00000
genre|432hz                     0.00000
genre|_hip_hop                  0.51074
genre|a_cappella                0.00000
genre|abstract                  0.00000
genre|abstract_beats            0.00000
genre|abstract_hip_hop          0.00000
genre|accordeon                 0.00000
genre|accordion                 0.00000
genre|acid_house                0.00000
dtype: float64

We create a main genre list to categorize the subgenres as we should be searching for songs as per their main genre.

In [44]:
genre_list = [
    "Pop", "Rock", "Indie", "Metal", "Classical", "Hip Hop", "Rap", "Folk", "Jazz", "Punk", "House", "Blues",
    "Soul", "Country", "Funk", "Reggae", "Techno", "Latin", "R&B", "Trance", "Ambient", "EDM",
    "Ska", "Gospel", "Disco", "Opera", "Grime", "Bass", "Drum"
]

def get_main_genres(genre_sublist):
    main_genres = []
    for subgenre in genre_sublist:
        for main_genre in genre_list:
            if main_genre.lower() in subgenre.lower():
                # Add the main genre to the list if not already included
                if main_genre not in main_genres:
                    main_genres.append(main_genre)
    return main_genres if main_genres else ["Other"]

song_data_df['Main_Genre'] = song_data_df['genre_lists'].apply(get_main_genres)

song_data_df.tail(3)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,release_date,speechiness,tempo,artists_upd_1,artists_upd_2,artists_upd,artists_song,genre_lists,popularity_red,Main_Genre
156604,0.697,1999,0.0516,"[""Ol' Dirty Bastard"", 'Kelis', 'Rich Travali']",0.934,239547,0.459,1,6YYd5MLpu45J0uLrMdivF7,0.0,...,1999,0.189,103.04,"[ Dirty Bastard"", , , ]",[Ol' Dirty Bastard],"[ Dirty Bastard"", , , ]","Dirty Bastard"", Got Your Money (feat. Kelis)",[],13,[Other]
156605,0.429,1994,0.0249,"[""World Class Wreckin' Cru"", ""Michel 'Le""]",0.715,351040,0.49,0,3hoiinUc5VA9xUEJID7R8V,0.00017,...,1994-04-06,0.0479,129.309,"[ Cru"", ""Michel ]","[World Class Wreckin' Cru, Michel 'Le]","[ Cru"", ""Michel ]","Cru"", ""Michel Turn Off The Lights - Rap",[],7,[Other]
156606,0.273,1996,0.0113,"[""Rappin' 4-Tay"", 'MC Breed', 'Too $hort']",0.897,337973,0.414,1,78859Af0fmA9VTlgnOHTAP,0.00011,...,1996,0.246,96.039,"[ 4-Tay"", , , ]",[Rappin' 4-Tay],"[ 4-Tay"", , , ]","4-Tay"", Never Talk Down",[],7,[Other]


Get songs from the playlists.

In [45]:
def generate_playlist_recos_batched(df, features, nonplaylist_features, batch_size=100): #remove mach_size
    recommendations = pd.DataFrame()
    
    total_rows = nonplaylist_features.shape[0]
    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        batch = nonplaylist_features.iloc[start:end]
        sim = cosine_similarity(batch.drop('id', axis=1).values, features.values.reshape(1, -1))[:, 0]
        
        batch_df_ids = df[df['id'].isin(batch['id'].values)]
        if not batch_df_ids.empty:
            
            valid_ids = batch_df_ids['id'].values
            valid_indices = [i for i, id_val in enumerate(batch['id'].values) if id_val in valid_ids]
            
            valid_sim = sim[valid_indices]
            batch_df_ids['sim'] = valid_sim
            
            recommendations = pd.concat([recommendations, batch_df_ids], ignore_index=True)
    
    if not recommendations.empty:
        top_recs = recommendations.sort_values('sim', ascending=False).head(3)
        return top_recs
    else:
        return pd.DataFrame()  


Filter our df according to the genre (main), the user wants to start exploring i.e. want recommendations for.

In [46]:
genre = input("What genre do you want to get into?")
df = song_data_df[song_data_df['Main_Genre'].apply(lambda x:genre in x)]  #IMP.

In [47]:
top3 = generate_playlist_recos_batched(df, complete_feature_set_playlist_vector, complete_feature_set_nonplaylist)

We now have the top3 genres that compliments the entered playlist. Therefore, the recommedation, based on a particular genre, have been successfully generated.

In [48]:
top3

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,speechiness,tempo,artists_upd_1,artists_upd_2,artists_upd,artists_song,genre_lists,popularity_red,Main_Genre,sim
6017,0.648,2017,0.0135,['Carly Rae Jepsen'],0.71,207960,0.909,0,6EJiVf7U0p1BBfs0qqeb1f,0.00123,...,0.0639,115.001,[Carly Rae Jepsen],[],[Carly Rae Jepsen],Carly Rae JepsenCut To The Feeling,"[electropop, pop, canadian_pop, indie_poptimis...",13,"[Pop, Indie]",0.615428
6018,0.66,2012,0.0114,['Carly Rae Jepsen'],0.783,193400,0.58,0,3TGRqZ0a2l1LRblBkJoaDx,2e-06,...,0.0408,120.021,[Carly Rae Jepsen],[],[Carly Rae Jepsen],Carly Rae JepsenCall Me Maybe,"[electropop, pop, canadian_pop, indie_poptimis...",13,"[Pop, Indie]",0.597108
6014,0.135,2012,0.00549,['Carly Rae Jepsen'],0.72,219213,0.776,0,471JtpRQ0oox9OoZcbB8OO,0.000298,...,0.043,125.96,[Carly Rae Jepsen],[],[Carly Rae Jepsen],Carly Rae JepsenTonight I’m Getting Over You,"[electropop, pop, canadian_pop, indie_poptimis...",8,"[Pop, Indie]",0.59402


Here are the recommendations visualised as a 'pseudo' html interface to depict how they would be ideally communicated to the users.

In [49]:

def display_album_details(track_id):
    track_details = spotify_client.track(track_id)
    album_cover_url = track_details['album']['images'][0]['url']  
    song_name = track_details['name']
    artist_name = track_details['artists'][0]['name']  
    
    html = f"""
    <div style="float: left; padding: 20px;">
        <img src="{album_cover_url}" style="width: 150px; height: 150px; border-radius: 5%;"><br>
        <b>{song_name}</b><br>
        <i>{artist_name}</i>
    </div>
    """
    display(HTML(html))

print(f"These are your recommendations for {genre.capitalize()}:")
for track_id in top3['id']:
    display_album_details(track_id)



These are your recommendations for Indie:
