In [None]:
import difflib
import itertools
import json
import os
import re
import sys

import numpy as np
import pandas as pd
import requests

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import euclidean_distances
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from scipy.spatial.distance import cdist

from collections import defaultdict

# spotify web API
#pip install spotipy 
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")

In [None]:
%matplotlib inline

In [None]:
#Spotify for developers credentials
client_id = '********' 
client_secret = '**********'
username = '07maria***'
redirect_uri = 'https://developer.spotify.com/dashboard/applications/*******8'

#authorization scope based on what the user wants to do
#Listening History: user-read-recently-played, user-top-read, user-read-playback-position
#Playlists: playlist-modify-public, playlist-modify-private, playlist-read-private, playlist-read-colllaborative
scope = 'playlist-modify-public user-top-read user-library-read'

In [None]:
def authenticate(redirect_uri, client_cred_manager, username, scope, client_id, client_secret):
    
    #authenticates to use spotify API
    
    #spotify api auth

    sp = spotipy.Spotify(client_credentials_manager = client_cred_manager)
    token = util.prompt_for_user_token(username, scope, client_id, client_secret, redirect_uri)
    if token:
        sp = spotipy.Spotify(auth = token)
    else:
        print("Can't get token for", username)
    return sp

In [None]:
# gets authorization token from spotify
sp = authenticate(redirect_uri, SpotifyClientCredentials(client_id, client_secret), username, scope, client_id, client_secret)

In [None]:
# load the data.csv into a pandas dataframe
spotify_df = pd.read_csv('https://www.dropbox.com/s/egnlgbg9nllfi4r/data_o.csv?dl=1')

In [None]:
# to print pandas dataframes
pd.set_option('display.max_columns', None)
pd.set_option("max_rows", None)

In [None]:
# change column names of the spotify_df to match the columns names of the data from spotify
spotify_df1 = spotify_df.rename(columns = {'name': 'track_name'})


In [None]:
# DATA VISUALIZATION

# K-means clustering algorithm to divide songs into clusters
song_cluster_pipeline = Pipeline ([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters = 20, verbose = 2, n_jobs = 4))], verbose = True)

X = spotify_df1.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
spotify_df1['cluster_label'] = song_cluster_labels

In [None]:
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components = 2))])
song_embedding = pca_pipeline.fit_transform(X)

projection = pd.DataFrame(columns = ['x', 'y'], data = song_embedding)
projection['title'] = spotify_df1['track_name']
projection['cluster'] = spotify_df1['cluster_label']

In [None]:
fig = px.scatter(projection, x = 'x', y = 'y', color = 'cluster', hover_data = ['x', 'y', 'title'])
fig.show()

In [None]:
def append_audio_features(df, spotify_auth, return_feat_df = False):
    # fetches audio features for all songs in a dataframe and appends as rows to the dataframe
    
    audio_features = spotify_auth.audio_features(df["track_id"])
    
    #catch and delete songs with no audio features
    if None in audio_features:
        NA_indx = [i for i, v in enumarate(audio_features) if v == None]
        df.drop(NA_idx, inplace = True)
        for i in NA_idx:
            audio_features.op(i)
        assert len(audio_features) == len(df["track_id"][:])
        feature_cols = list(audio_features[0].keys())[:-7]
        features_list = []
        
        for features in audio_features:
            try:
                song_features = [features[col] for col in feature_cols]
                features_list.append(song_features)
            except TypeError:
                pass
        df_features = pd.DataFrame(features_list, columns = feature_cols)
        
        df = pd.concat([df, df_features], axis = 1)
        
    return df

In [None]:
def saved_songs_df(api_results):
    
    #returns a dataframe with the user's saved songs
    
    #create lists for df columns
    track_name = []
    track_id = []
    artist = []
    album = []
    duration = []
    popularity = []
    
    #loop through api_results
    for i in api_results["items"]:
        try:
            track_name.append(i["track"]['name'])
            track_id.append(i["track"]['id'])
            artist.append(i["track"]["artists"][0]["name"])
            album.append(i["track"]["album"]["name"])
            duration.append(i["track"]["duration_ms"])
            popularity.append(i["track"]["popularity"])
        except TypeError:
            pass
        
        #create final df
        df = pd.DataFrame({"track_name": track_name, 
                          "track_id": track_id,
                          "artists": artist,
                            "album": album,
                          "duration": duration,
                          "popularity": popularity})
    return df

In [None]:
def playlist_df(api_results, sp = None, append_audio = True):
    #reads spotipy query results for a playlist and returns a dataframe
    # .recommendations

    dataf = saved_songs_df(api_results["tracks"])

    if append_audio == True:
        assert sp != None, "sp needs to be specified for appending audio features"
        df = append_audio_features(dataf, sp)
    return df

In [None]:
# playlist uri from spotify
playlist_uri = 'spotify:playlist:4NJ80DuQgEz9NCdsQbPC40' 
playlist = sp.playlist(playlist_uri)
play_df = playlist_df(playlist, sp = sp)
play_df

In [None]:
# get seed tracks for recommendations
seed_tracks = play_df["track_id"].tolist()

In [None]:
# Spotify's recommendations based on playlist

def spotify_recommendations(api_results):
    #returns a dataframe with Spotify's recommended songs
    track_name = []
    track_id = []
    artist = []
    album = []
    duration = []
    popularity = []

    for items in api_results['tracks']:
        try:
            track_name.append(items['name'])
            track_id.append(items['id'])
            artist.append(items["artists"][0]["name"])
            album.append(items["album"]["name"])
            duration.append(items["duration_ms"])
            popularity.append(items["popularity"])
        except TypeError:
            pass
        
    df = pd.DataFrame({"track_name": track_name,
                           "track_id": track_id,
                           "artists": artist,
                           "album": album,
                           "duration": duration,
                           "popularity": popularity})
    return df

In [None]:
#create recommendation df from multiple recommendations
recomm_dfs = []

for i in range(5, len(seed_tracks) +1, 5):
    recs = sp.recommendations(seed_tracks = seed_tracks[i-5:i], limit = 10)
    recs_df = append_audio_features(spotify_recommendations(recs), sp)
    recomm_dfs.append(recs_df)

In [None]:
recs_df = pd.concat(recomm_dfs)
recs_df.reset_index(drop = True, inplace = True)

In [None]:
# Program's recommendations based on playlist

# columns for the final recommendations dataframe
num_cols = ['valence','acousticness', 'danceability', 'duration_ms', 'energy', 'explicit', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo', 'tempo']

In [None]:
# recommend songs based on the user's playlist play_df
# convert df to dictionary
play_list = play_df.to_dict('record')

In [None]:
def find_song(name, artists):
    #finds songs not in spotify_df using spotify with the track's name and artists
    
    song_data = defaultdict()
    r = sp.search(q= 'artist: {} track: {}'.format(artists, name), limit=1)
    
    
    results = r['tracks']['items'][0]
    track_id = results['id']
    
    audio_features = sp.audio_features(track_id)[0]
    
    song_data['track_name'] = [name]
    song_data['artists'] = [artists]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]
    
    for key, value in audio_features.items():
        song_data[key] = value
    
    return pd.DataFrame(song_data)

In [None]:
def get_song_data(song, spotify_df):
    
    try:
        song_data = spotify_df[(spotify_df['track_name'] == song['track_name']) 
                                & (spotify_df['artists'] == song['artists'])].iloc[0]
        return song_data
    except IndexError:
        return find_song(song['track_name'], song['artists'])

In [None]:
def get_mean_vector(play_list, spotify_df):
    
    song_vects = []
    
    #only the first 45 songs in playlist as it has over 100 songs
    for song in play_list[:45]:
        song_data = get_song_data(song, spotify_df)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or database'.format(song['track_name']))
            continue
        song_vect = song_data[num_cols].values
        song_vects.append(song_vect)
        
    song_matrix = np.array(list(song_vects))
    
    return np.mean(song_matrix, axis = 0)

In [None]:
def flatten_dict_list(dict_list):
    flattened_dict = defaultdict()
    
    for key in dict_list[0].keys():
        flattened_dict[key] = []
        
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
    return flattened_dict

In [None]:
def recommend_songs(song_list, spotify_df, n_songs = 25):
    
    metadata_cols = ['track_name', 'artists', 'popularity']
    song_dict = flatten_dict_list(song_list)
    
    song_center = get_mean_vector(song_list, spotify_df)
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_df[num_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])
    
    rec_songs = spotify_df.iloc[index]
    rec_songs = rec_songs[~rec_songs['track_name'].isin(song_dict['track_name'])]
    
    return rec_songs[metadata_cols].to_dict(orient = 'records')

In [None]:
# recommended songs based on playlist
fin_recs = recommend_songs(play_list, spotify_df1)

In [None]:
final_df = pd.DataFrame(fin_recs, columns = ["track_name", "track_id",
                                          "artists",
                                            "album",
                                          "duration",
                                          "popularity"])
final_df.reset_index(drop = True, inplace = True)

In [None]:
# program's recommendations
final_df

In [None]:
# spotify recommendations
recs_df