Spotify Recommendation Algorithm 
Steps:
1. Install required packages and dependencies and read csv file containing song data
2. Clean CSV data and convert dataframe into item-feature matrix 
3. Read spotify song playlist URL and gather playlist data from the Spotify API
4. Make sure playlist dataframe and song database dataframe have the same corresponding features (columns) 
5. Compute the cosine similarity between the playlist and database of songs to recommend songs 
6. Show recommendations to user! 

In [21]:
# Install packages and dependencies
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spotipy
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from spotipy.oauth2 import SpotifyOAuth

In [23]:
# Read the CSV files: the song database (item-feature matrix) and the genre list.
# Forward slashes are used on purpose: the backslash-separated literals contained
# the invalid escape sequences "\c" and "\g" (SyntaxWarning) and only resolved on
# Windows, whereas "/" is accepted as a path separator on every platform.
feat_vec = pd.read_csv('data/cleaned_spotify_data.csv')
genre_list_df = pd.read_csv('data/genres_data.csv')

# Convert the 'Genre' column into a plain Python list of genre names
genre_list = genre_list_df['Genre'].tolist()

# Show every column when a dataframe is displayed
pd.set_option('display.max_columns', None)

genre_list

  feat_vec = pd.read_csv('data\cleaned_spotify_data.csv')
  genre_list_df = pd.read_csv('data\genres_data.csv')


['acoustic',
 'afrobeat',
 'alt-rock',
 'ambient',
 'black-metal',
 'blues',
 'breakbeat',
 'cantopop',
 'chicago-house',
 'chill',
 'classical',
 'club',
 'comedy',
 'country',
 'dance',
 'dancehall',
 'death-metal',
 'deep-house',
 'detroit-techno',
 'disco',
 'drum-and-bass',
 'dub',
 'dubstep',
 'edm',
 'electro',
 'electronic',
 'emo',
 'folk',
 'forro',
 'french',
 'funk',
 'garage',
 'german',
 'gospel',
 'goth',
 'grindcore',
 'groove',
 'guitar',
 'hard-rock',
 'hardcore',
 'hardstyle',
 'heavy-metal',
 'hip-hop',
 'house',
 'indian',
 'indie',
 'industrial',
 'jazz',
 'k-pop',
 'metal',
 'metalcore',
 'minimal-techno',
 'new-age',
 'opera',
 'party',
 'piano',
 'pop',
 'pop-film',
 'power-pop',
 'progressive-house',
 'psych-rock',
 'punk',
 'punk-rock',
 'rock',
 'rock-n-roll',
 'romance',
 'sad',
 'salsa',
 'samba',
 'sertanejo',
 'show-tunes',
 'singer-songwriter',
 'ska',
 'sleep',
 'songwriter',
 'soul',
 'spanish',
 'swedish',
 'tango',
 'techno',
 'trance',
 'trip-hop']

In [None]:
# Connect to the Spotify API.
# Credentials are read from the environment instead of being hard-coded, so the
# client secret is never stored in (or shared with) the notebook. Set
# SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before running this cell;
# SPOTIPY_REDIRECT_URI is optional and falls back to the local redirect below.
# NOTE(review): any secret that was previously committed in this cell should be
# rotated in the Spotify developer dashboard.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']
redirect_uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:3000')

# Initialize the Spotipy client with authentication
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id, client_secret, redirect_uri))

# Retrieve the playlist id from the playlist link: it is the last path segment,
# with any query string (and a trailing slash, if present) removed.
playlist_link = 'https://open.spotify.com/playlist/6rAOaZfR3WZy1hDhxCm8HW?si=79756cdaee73455a'
playlist_id = playlist_link.split('?')[0].rstrip('/').split('/')[-1]

# Get the first 50 songs of the playlist
playlist_tracks = sp.playlist_tracks(playlist_id, limit=50)

# Lists holding the title, artist credit and uri of each song
titles, artists, uri = [], [], []

# Collect title, artist credit and uri from each playlist entry
for item in playlist_tracks['items']:
    track = item.get('track')
    # Entries whose track is missing (e.g. removed from the catalogue) or that
    # are local files cannot be looked up later, so they are skipped instead of
    # crashing the cell.
    if not track or item.get('is_local'):
        continue
    titles.append(track['name'])
    # Several artists on one track are joined into a single credit string
    artists.append(', '.join(artist['name'] for artist in track['artists']))
    uri.append(track['uri'])

# Create a DataFrame with one row per song
data = {'Title': titles, 'Artist': artists, 'uri': uri}
playlist = pd.DataFrame(data)

# Audio features copied from the Spotify API into the playlist dataframe
new_feat = ['danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

# Fetch the audio features in batches (at most 100 uris per request) instead of
# issuing one API call per song.
track_uris = playlist['uri'].tolist()
audio_features = []
for start in range(0, len(track_uris), 100):
    audio_features.extend(sp.audio_features(track_uris[start:start + 100]))

# Fill one float column per feature. A song for which the API returned no
# features keeps the placeholder value 0.0 rather than aborting the cell.
for feature in new_feat:
    playlist[feature] = [
        float(features[feature]) if features else 0.0
        for features in audio_features
    ]

playlist

**Finding Genres of Songs in Playlist**

The Spotify API does not provide genres for individual songs; however, it does provide genres for each artist. So, for each song, I use the song's artist to look up the associated genres.

**Task Parallelization**

To do this, I have to iterate through each song and query the Spotify API for the genres of that song's artist, which takes a long time when the requests are made one after another. To reduce the run time, I used task parallelization. By default, a Python script runs on a single CPU, operating on a single processing unit. To take advantage of the other available CPUs, I integrated the Python package Joblib for parallel processing. Distributing the function calls across multiple CPUs significantly improves efficiency, resulting in an average 30.7% reduction in processing time.

In [None]:
# Create a list of artist credit strings, one per song in the playlist.
# (The empty `genres` list that used to be created here was a dead store: it is
# assigned from the lookup results further down before anything reads it.)
artist_names = playlist['Artist'].tolist()


# Worker function for the parallel genre lookup
def process_artist(artist_name, sp):
    """Return the genres Spotify lists for an artist as a comma-separated string.

    The artist is searched by name and only a result whose name matches exactly
    is accepted. When ``artist_name`` is a multi-artist credit such as
    ``"Artist A, Artist B"`` and the full string has no exact match, each
    comma-separated artist is tried in order and the first one with genres wins.
    The full string is always tried first so that artist names which themselves
    contain a comma still resolve.

    Args:
        artist_name: Artist name, or several names joined with ``', '``.
        sp: Client object exposing ``search(q=..., type='artist')``.

    Returns:
        The genres joined with ``', '``, or ``'No Genre Found'`` when no exact
        match exists or the matched artist has no genres.
    """
    def _lookup(name):
        # Genres of the first exact name match; None when nothing matches.
        search_results = sp.search(q=name, type='artist')
        for artist in search_results.get('artists', {}).get('items', []):
            if artist['name'] == name:
                return artist.get('genres', [])
        return None

    genres_info = _lookup(artist_name)

    # Fall back to the individual artists of a joined credit string.
    if genres_info is None and ', ' in artist_name:
        for single_name in artist_name.split(', '):
            genres_info = _lookup(single_name)
            if genres_info:
                break

    return ', '.join(genres_info) if genres_info else 'No Genre Found'

# Fill in the genre of each song using parallelization.
# Each distinct artist credit is looked up only once and the result is mapped
# back onto the songs, so an artist that appears on several songs does not
# trigger duplicate API searches.
unique_artists = list(dict.fromkeys(artist_names))
unique_genres = Parallel(n_jobs=-1)(
    delayed(process_artist)(artist_name, sp) for artist_name in unique_artists
)
genre_by_artist = dict(zip(unique_artists, unique_genres))
genres = [genre_by_artist[artist_name] for artist_name in artist_names]

playlist['Genre'] = genres

playlist

**Spotify's Unique Genre Names**

The genres that Spotify provides for each artist are highly specific. To successfully run the cosine similarity algorithm, the genre columns of both item-feature matrices need to match. To achieve this, I took the list of genres from the database's item-feature matrix and ran a substring search for each of those genres against each song's genre string, assigning a binary value in the playlist item-feature matrix.

In [None]:
# For every genre of the song database, add a binary 'genre_<name>' column that
# is 1 when the genre name occurs as a substring of the song's 'Genre' string,
# and count how many playlist songs match each genre.
# NOTE(review): Spotify artist genres appear to be written with spaces
# (e.g. "hip hop"); confirm that hyphenated names such as 'hip-hop' in
# genre_list match as intended.
genre_count = {}

for genre in genre_list:
    # regex=False: the genre is matched as literal text, not as a pattern.
    # na=False: a missing genre string counts as "no match" instead of failing
    # the integer conversion.
    genre_flags = playlist['Genre'].str.contains(genre, regex=False, na=False).astype(int)
    playlist['genre_' + genre] = genre_flags

    # Record only the genres that occur at least once in the playlist
    matches = int(genre_flags.sum())
    if matches > 0:
        genre_count[genre] = matches

# The raw genre string is no longer needed once the flags exist
playlist = playlist.drop(columns=['Genre'])

# Get the top 3 genres (by number of matching songs) for the recommendation
top_3_genres = sorted(genre_count, key=genre_count.get, reverse=True)[:3]

playlist



In [None]:
# Find the release year and popularity of each song in the playlist.
# Track details are requested in batches (at most 50 uris per request) instead
# of one API call per song.
track_uris = playlist['uri'].tolist()
years, popularities = [], []

for start in range(0, len(track_uris), 50):
    batch = sp.tracks(track_uris[start:start + 50])['tracks']
    for track_info in batch:
        # A track the API could not resolve keeps the placeholder values 0 / 0
        if not track_info:
            years.append(0)
            popularities.append(0)
            continue

        # The release date starts with the year ("YYYY", "YYYY-MM" or
        # "YYYY-MM-DD"), so the first dash-separated part is the year.
        release_date = track_info['album'].get('release_date') or '0'
        years.append(int(release_date.split('-')[0]))
        popularities.append(int(track_info['popularity']))

playlist['year'] = years
playlist['popularity'] = popularities

playlist

In [None]:
# Turn the release year into binary time-period columns so the playlist matches
# the item-feature matrix of the 1M+ song database.

# (column name, first year included, first year excluded).
# The last column is labelled 2020-2023 but its condition accepts years up to
# and including 2024.
year_buckets = [
    ('year_2000-2004', 2000, 2005),
    ('year_2005-2009', 2005, 2010),
    ('year_2010-2014', 2010, 2015),
    ('year_2015-2019', 2015, 2020),
    ('year_2020-2023', 2020, 2025),
]

release_years = playlist['year']
for bucket_column, first_year, end_year in year_buckets:
    in_bucket = (release_years >= first_year) & (release_years < end_year)
    playlist[bucket_column] = in_bucket.astype('int64')

# The raw year is no longer needed once it has been bucketed
playlist = playlist.drop(columns=['year'])

playlist

In [None]:
# Normalize popularity, loudness and tempo of the playlist to the 0-1 range,
# using the fixed value range of each feature.
# The features are scaled directly with (value - min) / (max - min). The previous
# approach appended two sentinel rows of *strings* to the dataframe, which
# changed the dtype of every other column, and it silently changed the scale
# whenever a song fell outside the nominal range. Such values are clipped here.
feature_ranges = {'popularity': (0, 100), 'loudness': (-60, 0), 'tempo': (0, 250)}
scale = list(feature_ranges)

for feature, (lowest, highest) in feature_ranges.items():
    values = playlist[feature].astype('float64')
    playlist[feature] = ((values - lowest) / (highest - lowest)).clip(0, 1)

playlist

In [None]:
# Order the columns of both dataframes alphabetically so that the same position
# refers to the same feature in the cosine similarity computation.
# NOTE(review): this only lines the features up if both frames end up with the
# same set of feature columns - confirm against the database CSV.
playlist, feat_vec = (frame.sort_index(axis=1) for frame in (playlist, feat_vec))

# Cosine similarity needs numerical values only: the database copy loses its
# track_id column ...
feat_vec_cosine_sim = feat_vec.drop(columns=['track_id'])

# ... and the playlist copy loses its non-numerical Artist, Title and uri columns
columns_dropped = ['Artist', 'Title', 'uri']
playlist_cosine_sim = playlist.drop(columns=columns_dropped)



In [None]:
# Average every feature column of the playlist: the result is the single
# feature vector that represents the whole playlist.
column_averages = playlist_cosine_sim.mean()

# Lay the averages out as a one-row DataFrame labelled 'Average'
averages_cosine_sim = column_averages.to_frame(name='Average').T

averages_cosine_sim

In [None]:
# Generate similarity scores: the cosine similarity of every database song
# against the playlist's average feature vector.
similarity_scores = cosine_similarity(feat_vec_cosine_sim, averages_cosine_sim)

# One score per song; ravel() flattens the single-column result
feat_vec['similarity_score'] = similarity_scores.ravel()

# Sort from highest to lowest similarity score
top_similarities = feat_vec.sort_values(by='similarity_score', ascending=False)

# Make sure no recommendation is already in the user's playlist. The playlist
# stores the 'uri' value returned by the API, while the database has a
# 'track_id' column; both the uri itself and the part after its last ':' are
# excluded so the filter works whichever form track_id uses.
# NOTE(review): confirm which form track_id has in the database CSV.
playlist_uris = playlist['uri'].astype(str)
playlist_track_keys = set(playlist_uris) | set(playlist_uris.str.rsplit(':', n=1).str[-1])
top_similarities = top_similarities[~top_similarities['track_id'].isin(playlist_track_keys)]

# Number of candidate songs taken per top genre, keyed by how many top genres
# the playlist has (100 candidates in total in every case).
genre_quotas = {1: (100,), 2: (60, 40), 3: (45, 30, 15)}
quotas = genre_quotas.get(min(len(top_3_genres), 3))

if quotas is None:
    # No genre was found in the playlist: take the top 100 songs overall
    top_similarities = top_similarities.head(100).reset_index(drop=True)
else:
    selections = []
    remaining = top_similarities
    for genre, quota in zip(top_3_genres, quotas):
        chosen = remaining[remaining['genre_' + genre] == 1].head(quota)
        selections.append(chosen)
        # A song belonging to several top genres must be picked only once
        remaining = remaining.drop(chosen.index)
    top_similarities = pd.concat(selections, ignore_index=True)

top_similarities

In [None]:
# Find the track name, first listed artist and 30s audio preview url of each
# recommended song using its track_id.
# Track details are requested in batches (at most 50 ids per request) instead
# of one API call per song.
track_ids = top_similarities['track_id'].tolist()
rec_tracks, rec_artists, rec_previews = [], [], []

for start in range(0, len(track_ids), 50):
    batch = sp.tracks(track_ids[start:start + 50])['tracks']
    for track_info in batch:
        # A track the API could not resolve keeps None in all three columns
        if not track_info:
            rec_tracks.append(None)
            rec_artists.append(None)
            rec_previews.append(None)
            continue

        credited = track_info.get('artists') or [{}]
        rec_tracks.append(track_info['name'])
        rec_artists.append(credited[0].get('name'))
        rec_previews.append(track_info['preview_url'])

# Update the DataFrame with the track information
top_similarities['track'] = rec_tracks
top_similarities['artist'] = rec_artists
top_similarities['preview'] = rec_previews




In [None]:
# Print every recommendation that has a track name, an artist and a preview url
if top_similarities.empty:
    print("Error: top_similarities DataFrame is empty.")
else:
    # Rows missing any of the three values are removed from the DataFrame itself
    top_similarities.dropna(subset=['track', 'artist', 'preview'], inplace=True)

    # One line per remaining song: name | artist | preview url
    for track_name, artist_name, preview_url in zip(
        top_similarities['track'],
        top_similarities['artist'],
        top_similarities['preview'],
    ):
        print(track_name, "| " + artist_name, "| ", preview_url)



In [None]:
# Look up the genres of the artist of each recommended track
artist_names = top_similarities['artist'].tolist()

# n_jobs=1: the lookups run one after another here, not in parallel
genres = Parallel(n_jobs=1)(delayed(process_artist)(artist_name, sp) for artist_name in artist_names)

# Add the genres to the dataframe
top_similarities['genre'] = genres

# Genre keywords (country, language or region names) used to filter recommendations
ethnic_genres = ['colombia', 'latin', 'mexican', 'puerto rican', 'dominican', 'italian', 'spanish', 'brasil', 'argentine', 'anime', 'japanese', 'indonesian', 'vietnamese', 'korean', 'chinese', 'taiwan']

# Remove every song whose genre string contains one of the keywords
# (case-insensitive)
mask = top_similarities['genre'].str.contains('|'.join(ethnic_genres), case=False)
top_similarities = top_similarities[~mask]

# Number of final recommendations per top genre, keyed by how many top genres
# the playlist has (10 recommendations in total in every case).
genre_quotas = {1: (10,), 2: (6, 4), 3: (5, 3, 2)}
quotas = genre_quotas.get(min(len(top_3_genres), 3))

if quotas is None:
    # No genre was found in the playlist: keep the first 10 remaining songs so
    # this case is capped like the others.
    top_similarities = top_similarities.head(10).reset_index(drop=True)
else:
    selections = []
    remaining = top_similarities
    for genre, quota in zip(top_3_genres, quotas):
        chosen = remaining[remaining['genre_' + genre] == 1].head(quota)
        selections.append(chosen)
        # A song belonging to several top genres must be recommended only once
        remaining = remaining.drop(chosen.index)
    top_similarities = pd.concat(selections, ignore_index=True)

top_similarities

In [None]:
# Show only the columns that are useful to the user
display_features = ['track', 'artist', 'similarity_score', 'genre', 'preview']

# .copy() makes playlist_recs an independent dataframe, so the assignment below
# does not write into a slice of top_similarities (SettingWithCopyWarning).
playlist_recs = top_similarities[display_features].copy()

# Express the similarity score as a percentage with two decimals
playlist_recs['similarity_score'] = (playlist_recs['similarity_score'] * 100).round(2)

playlist_recs