# Spotify Recommendation System

#### This file contains the actual recommemdation algorithm

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import os
import json
from dotenv import load_dotenv
load_dotenv()

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

## Loading and Visualizing the Data

### Populate Tracks DataFrame from CSV

In [2]:
# Read 'tracks_transformed.csv' into a DataFrame: tracks_df
tracks_df = pd.read_csv('data/tracks_transformed.csv', na_filter=False)
tracks_df.head()

Unnamed: 0,id,name,artists,id_artists,genres,release_year,duration_s,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,35iwgR4jXetI318WEWsa1Q,Carve,Uli,['45tIt06XoI0Iio4LBEVpls'],,1922,126.903,6,0.645,0.445,0,46.662,1,0.451,0.674,0.744,0.151,0.127,104.851
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,Fernando Pessoa,['14jtPCOoNZwquk5wd9DxrY'],,1922,98.2,0,0.695,0.263,0,37.864,1,0.957,0.797,0.0,0.148,0.655,102.009
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,Ignacio Corsini,['5LiOoJbxVSAMkBS2fUm3X2'],tango vintage tango,1922,181.64,0,0.434,0.177,1,38.82,1,0.0512,0.994,0.0218,0.212,0.457,130.418
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,Ignacio Corsini,['5LiOoJbxVSAMkBS2fUm3X2'],tango vintage tango,1922,176.907,0,0.321,0.0946,7,32.039,1,0.0504,0.995,0.918,0.104,0.397,169.98
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,Dick Haymes,['3BiJGZsyX9sJchTqcSA7Su'],adult standards big band easy listening lounge...,1922,163.08,0,0.402,0.158,3,43.1,0,0.039,0.989,0.13,0.311,0.196,103.22


In [3]:
tracks_df.describe()

Unnamed: 0,release_year,duration_s,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0
mean,1988.566168,230.051167,27.570053,0.563594,0.542036,5.221603,49.793933,0.658797,0.104864,0.449863,0.113451,0.213935,0.552292,118.464857
std,22.831283,126.526087,18.370642,0.166103,0.251923,3.519423,5.089328,0.474114,0.179893,0.348837,0.266868,0.184326,0.257671,29.764108
min,1900.0,3.344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1974.0,175.093,13.0,0.453,0.343,2.0,47.109,0.0,0.034,0.0969,0.0,0.0983,0.346,95.6
50%,1992.0,214.893,27.0,0.577,0.549,5.0,50.757,1.0,0.0443,0.422,2.4e-05,0.139,0.564,117.384
75%,2007.0,263.867,41.0,0.686,0.748,8.0,53.518,1.0,0.0763,0.785,0.00955,0.278,0.769,136.321
max,2021.0,5621.218,100.0,0.991,1.0,11.0,65.376,1.0,0.971,0.996,1.0,1.0,1.0,246.381


### Populate Artists DataFrame from CSV

In [4]:
# Read 'artists_transformed.csv' into a DataFrame: artists_df
artists_df = pd.read_csv('data/artists_transformed.csv')
artists_df.dropna(inplace=True)
artists_df['followers'] = pd.to_numeric(artists_df['followers'])
artists_df.head()

Unnamed: 0,id,name,genres,followers,popularity
45,0VLMVnVbJyJ4oyZs2L3Yl2,Las Viudas De Los Bisabuelos,carnaval cadiz,71.0,6
46,0dt23bs4w8zx154C5xdVyl,Los De Capuchinos,carnaval cadiz,63.0,5
47,0pGhoB99qpEJEsBQxgaskQ,Los “Pofesionales”,carnaval cadiz,64.0,7
48,3HDrX2OtSuXLW5dLR85uN3,Los Que No Paran De Rajar,carnaval cadiz,53.0,6
136,22mLrN5fkppmuUPsHx6i2G,Vera Dulova,classical harp harp,59.0,3


In [5]:
artists_df.describe()

Unnamed: 0,followers,popularity
count,305589.0,305589.0
mean,37640.15,19.221013
std,494997.7,17.49296
min,0.0,0.0
25%,151.0,3.0
50%,847.0,15.0
75%,4971.0,31.0
max,78900230.0,100.0


## Creating the Models

### Song Recommender

In [6]:
# Creating song library and sorting tracks based on popularity (only using top 10,000 songs for speed purposes)
song_library = tracks_df.sort_values(by='popularity', ascending=False).head(10000)
song_library.drop(['id_artists'], axis=1, inplace=True)
song_library.reset_index(inplace=True, drop=True)
song_library.head()

Unnamed: 0,id,name,artists,genres,release_year,duration_s,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,4iJyoBOLtHqaGxP12qzhQI,Peaches (feat. Daniel Caesar & Giveon),"Justin Bieber, Daniel Caesar, Giveon",pop rb canadian contemporary rb pop rb canadia...,2021,198.082,100,0.677,0.696,0,53.819,1,0.119,0.321,0.0,0.42,0.464,90.03
1,7lPN2DXiMsVn7XUKtOW1CS,drivers license,Olivia Rodrigo,pop postteen pop,2021,242.014,99,0.585,0.436,10,51.239,1,0.0601,0.721,1.3e-05,0.105,0.132,143.874
2,3Ofmpyhv5UAQ70mENzB277,Astronaut In The Ocean,Masked Wolf,australian hip hop,2021,132.78,98,0.778,0.695,4,53.135,0,0.0913,0.175,0.0,0.15,0.472,149.996
3,5QO79kh1waicV47BqGRL3g,Save Your Tears,The Weeknd,canadian contemporary rb canadian pop pop,2020,215.627,97,0.68,0.826,0,54.513,1,0.0309,0.0212,1.2e-05,0.543,0.644,118.051
4,6tDDoYIxWvMLTdKpjFkc1B,telepatía,Kali Uchis,colombian pop pop,2020,160.191,97,0.653,0.524,11,50.984,0,0.0502,0.112,0.0,0.203,0.553,83.97


In [18]:
# Train a CountVectorizer on the song_library genres
song_vectorizer = CountVectorizer()
song_vectorizer.fit(song_library['genres'])

In [19]:
# Main Song Recommendation Function
def recommend_songs(genres, song_data, count, song_library=song_library, song_vectorizer=song_vectorizer):
    # Numeric columns (audio features) in song_library DataFrame
    num_cols = ['release_year', 'duration_s', 'popularity', 'danceability', 'energy', 'key', 'loudness',
                'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
        
    # Create vector from "genres" data (text data)
    text_vec1 = song_vectorizer.transform(genres).toarray()

    # Create vector from numerical columns for given song
    num_vec1 = song_data

    # Initialise empty list to store similarity scores
    sim_scores=  []

    # For every song/track in song library, determine cosine similarity with given song
    for index, row in song_library.iterrows():
        name = row['name']

        # Create vector from "genres" field for other songs
        text_vec2 = song_vectorizer.transform(song_library[song_library['name']==name]['genres']).toarray()

        # Create vector from numerical columns for other songs
        num_vec2 = song_library[song_library['name']==name][num_cols].to_numpy()

        # Calculate cosine similarity using text vectors
        text_sim = cosine_similarity(text_vec1, text_vec2)[0][0]

        # Calculate cosine similarity using numerical vectors
        num_sim = cosine_similarity(num_vec1, num_vec2)[0][0]

        # Take average of both similarity scores and add to list of similarity scores
        sim = (text_sim + num_sim)/2
        sim_scores.append(sim)

    # Add new column containing similarity scores to song_library DataFrame
    song_library['similarity'] = sim_scores

    # Sort DataFrame based on "similarity" column
    song_library.sort_values(by=['similarity', 'popularity', 'release_year'], ascending=[False, False, False], inplace=True)

    # Create DataFrame "recommended_songs" containing 5 songs that are most similar to the given song and return this DataFrame
    recommended_songs = song_library[['name', 'artists', 'release_year']][2:(2+count)]
    return recommended_songs

# Function to recommend songs based on given song name in song_library
def recommend_songs_test(song_name):
    try:
        num_cols = ['release_year', 'duration_s', 'popularity', 'danceability', 'energy', 'key', 'loudness',
                    'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
        
        # Get genres and numerical data for given song in song_library
        genres = song_library[song_library['name']==str(song_name)]['genres']
        song_data = song_library[song_library['name']==str(song_name)][num_cols].to_numpy()
        return recommend_songs(genres, song_data, 5)
    except:
        # If given song is not found in song library then display message
        print(song_name, "not found in song library")



In [20]:
# Testing the song recommender
# Finding songs like Eye of the Tiger by Survivor
recommend_songs_test('Eye of the Tiger')

[[0.99989262]
 [0.99989619]]
[[0.99984614 0.9998507 ]
 [0.9998465  0.99985244]]
[[0.99984614 0.9998507 ]
 [0.9998465  0.99985244]]
[[0.99974964 0.99974447]
 [0.99975614 0.99975556]]
[[0.99974964 0.99974447]
 [0.99975614 0.99975556]]
[[0.99980718]
 [0.9998173 ]]
[[0.99993725]
 [0.99994637]]
[[0.99993151 0.99993639]
 [0.99992139 0.99993315]]
[[0.99993151 0.99993639]
 [0.99992139 0.99993315]]
[[0.99997048]
 [0.99997136]]
[[0.99995017 0.99959725]
 [0.99994977 0.9996022 ]]
[[0.99995017 0.99959725]
 [0.99994977 0.9996022 ]]
[[0.9998466 ]
 [0.99985405]]
[[0.99994164]
 [0.99994316]]
[[0.99996043]
 [0.99996622]]
[[0.99990985 0.99909139]
 [0.99990999 0.99911907]]
[[0.99990985 0.99909139]
 [0.99990999 0.99911907]]
[[0.99997989]
 [0.99997942]]
[[0.9995668  0.99952049]
 [0.99958182 0.99954208]]
[[0.9995668  0.99952049]
 [0.99958182 0.99954208]]
[[0.99947734 0.99942844 0.99989823]
 [0.99948858 0.99943184 0.99990789]]
[[0.99947734 0.99942844 0.99989823]
 [0.99948858 0.99943184 0.99990789]]
[[0.999477

Unnamed: 0,name,artists,release_year
9286,"Burning Heart - From ""Rocky IV"" Soundtrack",Survivor,1993
8053,Burnin' for You,Blue Öyster Cult,1981
1841,(Don't Fear) The Reaper,Blue Öyster Cult,1976
495,Don't Stop Believin',Journey,1981
2495,Don't Stop Believin',Journey,2001


### Artist Recommender

In [10]:
# Creating artist library and sorting artists based on popularity and followers (only using top 10000 artists for speed purposes)
artist_library = artists_df.sort_values(by=['popularity', 'followers'], ascending=False).head(10000)
artist_library.reset_index(inplace=True, drop=True)
artist_library.head()

Unnamed: 0,id,name,genres,followers,popularity
0,1uNFoZAHBGtllmzznpCI3s,Justin Bieber,canadian pop pop postteen pop,44606973.0,100
1,3TVXtAsR1Inumwj472S9r4,Drake,canadian hip hop canadian pop hip hop pop rap ...,54416812.0,98
2,06HL4z0CvFAxyc27GXpf02,Taylor Swift,pop postteen pop,38869193.0,98
3,4q3ewBCX7sLwd24euuV69X,Bad Bunny,latin reggaeton trap latino,32244734.0,98
4,3Nrfpe0tUJi4K4DXYWgMUX,BTS,kpop kpop boy group,31623813.0,96


In [11]:
# Train a CountVectorizer on the artist_library genres
artist_vectorizer = CountVectorizer()
artist_vectorizer.fit(artist_library['genres'])

In [12]:
# Main Artist Recommendation Function
def recommend_artists(genres, artist_data, count, artist_library=artist_library, artist_vectorizer=artist_vectorizer):
    # Numeric columns (audio features) in artist_library DataFrame
    num_cols = ['followers', 'popularity']

    # Create vector from "genres" field (text data) for given artist
    text_vec1 = artist_vectorizer.transform(genres).toarray()

    # Create vector from numerical columns for given song
    num_vec1 = artist_data

    # Initialise empty list to store similarity scores
    sim_scores = []

    # For every artist in artist library, determine cosine similarity with given artist
    for index, row in artist_library.iterrows():
        name = row['name']

        # Create vector from "genres" field for other artists
        text_vec2 = artist_vectorizer.transform(artist_library[artist_library['name']==name]['genres']).toarray()

        # Create vector from numerical columns for other songs
        num_vec2 = artist_library[artist_library['name']==name][num_cols].to_numpy()

        # Calculate cosine similarity using text vectors
        text_sim = cosine_similarity(text_vec1, text_vec2)[0][0]

        # Calculate cosine similarity using numerical vectors
        num_sim = cosine_similarity(num_vec1, num_vec2)[0][0]

        # Take average of both similarity scores and add to list of similarity scores
        sim = (text_sim + num_sim)/2
        sim_scores.append(sim)

    # Add new column containing similarity scores to artist_library DataFrame
    artist_library['similarity'] = sim_scores

    # Sort DataFrame based on "similarity" column
    artist_library.sort_values(by=['similarity', 'popularity', 'followers'], ascending=[False, False, False], inplace=True)

    # Create DataFrame "recommended_artists" containing 5 artists that are most similar to the given artist, sort and return this DataFrame
    recommended_artists = artist_library[['name', 'genres', 'followers', 'popularity']][2:(2+count)]
    recommended_artists.sort_values(by=['popularity', 'followers'], ascending=[False, False], inplace=True)
    return recommended_artists

# Function to recommend artists based on given artist name in artist_library
def recommend_artists_test(artist_name):
    try:
        # Numeric columns (audio features) in artist_library DataFrame
        num_cols = ['followers', 'popularity']

        # Get genres and numerical data for given artist in artist_library
        genres = artist_library[artist_library['name']==str(artist_name)]['genres']
        artist_data = artist_library[artist_library['name']==str(artist_name)][num_cols].to_numpy()

        return recommend_artists(genres, artist_data, 5)
    except:
        # If given artist is not found in artist library then display message
        print(artist_name, "not found in artist library")


In [13]:
# Testing the artist recommender
# Finding artists like Post Malone
recommend_artists_test('Post Malone')

Unnamed: 0,name,genres,followers,popularity
6,Juice WRLD,chicago rap melodic rap,16996777.0,96
30,Lil Uzi Vert,melodic rap philly rap rap trap,11209483.0,91
119,A Boogie Wit da Hoodie,melodic rap pop rap rap trap,5634088.0,86
368,Lil Skies,melodic rap pop rap rap trap,4679716.0,80
7730,916frosty,emo rap melodic rap sad rap,64900.0,59


## Connect to Spotify API (Using Spotipy)

### Connect With No Authentication

In [14]:
# Using dotenv to load environment variables (create a .env file in the same directory as this script)
# Then, add the following lines to the .env file:
# CLIENT_ID=your_client_id
# CLIENT_SECRET=your_client_secret
# REDIRECT_URI=your_redirect_uri
client_id = os.environ.get('CLIENT_ID')
client_secret = os.environ.get('CLIENT_SECRET')
redirect_uri = os.environ.get('REDIRECT_URI')

# Connect to Spotify API without User Authentication
sp_noauth = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

results = sp_noauth.search(q='weezer', limit=10)
for idx, track in enumerate(results['tracks']['items']):
    print(idx, track['name'])

0 Buddy Holly
1 Island In The Sun
2 Say It Ain't So
3 Undone - The Sweater Song
4 Beverly Hills
5 I Just Threw Out The Love Of My Dreams
6 Hash Pipe
7 Africa
8 My Name Is Jonas
9 Pork And Beans


### Connect With Authentication (this is needed to access listening history)

In [15]:
# This scope allows the application to read the user's recently played tracks
scope = 'user-read-recently-played'

def authenticate_user(username):
    token = util.prompt_for_user_token(username,
                                        scope,
                                        client_id=client_id,
                                        client_secret=client_secret,
                                        redirect_uri=redirect_uri)
    return token

### Get User Listening History Function

In [16]:
def get_user_listening_history(token, count):
    sp = spotipy.Spotify(auth=token)
    results = sp.current_user_recently_played(limit=count)
    return results

### Transform the Artist Data We Received from the API

In [17]:
def get_artist_info(artist_id):
    try:
        artist_info = sp_noauth.artist(artist_id)
        return {
            'id': artist_id,
            'name': artist_info['name'],
            'genres': artist_info['genres'],
            'followers': artist_info['followers']['total'],
            'popularity': artist_info['popularity']
        }
    except:
        return None

def transform_user_artists(df):
    new_data = [get_artist_info(artist_id) for artist_id in df['id']]
    new_df = pd.DataFrame(new_data)
    new_df['genres'] = new_df['genres'].apply(lambda genres: ''.join([genre.lower() for genre in genres]).strip())
    return new_df

### Transform the Track Data We Received from the API

In [18]:
def get_track_info(track_id, track_name):
    try:
        track_info = sp_noauth.track(track_id)
        track_features = sp_noauth.audio_features(track_id)
        track_features_list = [track_features[0][key] for key in track_features[0].keys()]
        artist_names = [artist['name'] for artist in track_info['artists']]
        artist_ids = [artist['id'] for artist in track_info['artists']]

        return {
            'id': track_id,
            'name': track_name,
            'artists': artist_names,
            'id_artists': artist_ids,
            'release_year': track_info['album']['release_date'][:4],
            'duration_s': track_features_list[16]/1000,
            'popularity': track_info['popularity'],
            'danceability': track_features_list[0],
            'energy': track_features_list[1],
            'key': track_features_list[2],
            'loudness': track_features_list[3] + 60,
            'mode': track_features_list[4],
            'speechiness': track_features_list[5],
            'acousticness': track_features_list[6],
            'instrumentalness': track_features_list[7],
            'liveness': track_features_list[8],
            'valence': track_features_list[9],
            'tempo': track_features_list[10]
        }
    except:
        return None

def transform_user_tracks(df):
    new_data = [get_track_info(track_id, track_name) for track_id, track_name in df[['track.id', 'track.name']].values]
    new_df = pd.DataFrame(new_data)
    new_df['artists'] = new_df['artists'].apply(lambda artists: ', '.join(artists).strip())
    return new_df

## Final Recommendation System

In [19]:

def final_recommendation_algorithm(username, song_library, artist_library):
    # Authenticate user
    token = authenticate_user(username)

    # Get user listening history (50 most recent tracks)
    results = get_user_listening_history(token, 50)

    # Create DataFrames for user tracks and artists (still need to be transformed)
    user_tracks_df = pd.json_normalize(results['items'])
    user_artists_df = pd.concat([pd.DataFrame(pd.json_normalize(y)) for x in user_tracks_df['track.artists'] for y in x], ignore_index=True)

    # Transform user tracks DataFrame
    user_artists_df = transform_user_artists(user_artists_df)

    # Transform user artists DataFrame
    user_tracks_df = transform_user_tracks(user_tracks_df)
    user_tracks_df.insert(4, 'genres', user_tracks_df.apply(lambda x: str(user_artists_df.loc[user_artists_df['id'].isin(x['id_artists'])]['genres'].tolist()).translate(str.maketrans('', '', string.punctuation)), axis=1))
    user_tracks_df['release_year'] = pd.to_numeric(user_tracks_df['release_year'])

    # Average numerical features of user tracks_df
    user_tracks_genres = user_tracks_df['genres'].str.cat(sep=' ').split()
    avg_user_tracks_data = user_tracks_df.drop(['id', 'name', 'artists', 'id_artists', 'genres'], axis=1).mean().to_numpy().reshape(1, -1)
    # Average numerical features of user artists_df
    user_artists_genres = user_artists_df['genres'].str.cat(sep=' ').split()
    avg_user_artists_data = user_artists_df.drop(['id', 'name', 'genres'], axis=1).mean().to_numpy().reshape(1, -1)

    # Add user tracks to song_library, overwrite old entry if the song is already in the database
    song_library = pd.concat([song_library, user_tracks_df]).drop_duplicates(subset=['id'], keep='last')
    # Add user artists to artist_library, overwrite old entry if the artist is already in the database
    artist_library = pd.concat([artist_library, user_artists_df]).drop_duplicates(subset=['id'], keep='last')

    # Train a CountVectorizer on the updated song_library genres
    song_vectorizer = CountVectorizer()
    song_vectorizer.fit(song_library['genres'])
    # Train a CountVectorizer on the updated artist_library genres
    artist_vectorizer = CountVectorizer()
    artist_vectorizer.fit(artist_library['genres'])

    # Get recommendations based off the last songs in the user's listening history (10 recommendations)
    recommended_songs = recommend_songs(user_tracks_genres, avg_user_tracks_data, 10, song_library, song_vectorizer)
    # Get recommendations based off the last artists in the user's listening history (10 recommendations)
    recommended_artists = recommend_artists(user_artists_genres, avg_user_artists_data, 10, artist_library, artist_vectorizer)

    return recommended_songs, recommended_artists

### Run The Recommendation Algorithm

In [20]:
print('Welcome to the Spotify Recommendation System!')
user_input = input('Enter your Spotify username: ')
print('Please wait while we generate your recommendations...')
recommended_songs, recommended_artists = final_recommendation_algorithm(user_input, song_library, artist_library)

print('\nRecommended Songs:')
print(recommended_songs)
print('\n\nRecommended Artists:')
print(recommended_artists)

Welcome to the Spotify Recommendation System!
Please wait while we generate your recommendations...

Recommended Songs:
                                                   name  \
4680                                       Ice Ice Baby   
8591                                   Bad Boy for Life   
9142  Get Money (feat. Junior M.A.F.I.A.) - 2007 Rem...   
9444                                           All Caps   
8166  Downtown (feat. Melle Mel, Grandmaster Caz, Ko...   
6895  If I Ruled the World (Imagine That) (feat. Lau...   
2509                                        It's Tricky   
5936                                      Ms. Fat Booty   
7215     Still Not a Player (feat. Joe) - Radio Version   
2               Rapp Snitch Knishes feat. Mr. Fantastik   

                                                artists  release_year  
4680                                        Vanilla Ice          2008  
8591                       Diddy, Black Rob, Mark Curry          2001  
9142          