# In [9]:
import pandas as pd
import numpy as np
import json
import re
import sys
import itertools

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")


# # SUMMARY
#

# # 1. Data Preparation


# Load the track-level table and the artist-level table that carries genres.
spotify_df = pd.read_csv("./datasets/data.csv")

data_w_genre = pd.read_csv("./datasets/data_w_genres.csv")


# 'genres' is handled as text: every single-quoted item is extracted and its
# spaces are replaced by underscores (presumably so a multi-word genre is not
# split on whitespace when the lists are later joined with " " for TF-IDF).
data_w_genre['genres_upd'] = data_w_genre['genres'].apply(
    lambda x: [re.sub(' ', '_', i) for i in re.findall(r"'([^']*)'", x)])


# First pass over 'artists': names wrapped in single quotes.
spotify_df['artists_upd_v1'] = spotify_df['artists'].apply(
    lambda x: re.findall(r"'([^']*)'", x))


# Second pass: names wrapped in double quotes.
spotify_df['artists_upd_v2'] = spotify_df['artists'].apply(
    lambda x: re.findall('\"(.*?)\"', x))
# Use the single-quote result unless it is empty, in which case fall back to
# the double-quote result.
# NOTE(review): a row that mixes both quoting styles keeps only what the
# single-quote pattern matched.
spotify_df['artists_upd'] = np.where(spotify_df['artists_upd_v1'].apply(
    lambda x: not x), spotify_df['artists_upd_v2'], spotify_df['artists_upd_v1'])

# Key used for de-duplication: first parsed artist + " " + track name.
# NOTE(review): raises IndexError for a row where neither pattern found an
# artist, because 'artists_upd' is then an empty list.
spotify_df['artists_song'] = spotify_df.apply(
    lambda row: row['artists_upd'][0]+" " + row['name'], axis=1)


# Descending sort on (artists_song, release_date) so that, within one
# artists_song key, the row with the greatest release_date comes first.
spotify_df.sort_values(['artists_song', 'release_date'],
                       ascending=False, inplace=True)


# Keep a single row per artists_song key (the first one after the sort above).
spotify_df.drop_duplicates('artists_song', inplace=True)

# One row per (artist, track id) pair.
artists_exploded = spotify_df[['artists_upd', 'id']].explode('artists_upd')


# Attach each artist's genre list by artist name; a left join keeps pairs
# whose artist has no match in data_w_genre.
artists_exploded_enriched = artists_exploded.merge(
    data_w_genre, how='left', left_on='artists_upd', right_on='artists')
# Drop the pairs for which no genre list was found.
artists_exploded_enriched_nonnull = artists_exploded_enriched[~artists_exploded_enriched.genres_upd.isnull(
)]


# Collect, per track id, the genre lists of all of its artists.
artists_genres_consolidated = artists_exploded_enriched_nonnull.groupby(
    'id')['genres_upd'].apply(list).reset_index()

# Flatten the per-artist lists into one list of distinct genres per track
# (built through a set, so the order inside the list is not defined).
artists_genres_consolidated['consolidates_genre_lists'] = artists_genres_consolidated['genres_upd'].apply(
    lambda x: list(set(list(itertools.chain.from_iterable(x)))))

# Bring the consolidated genres back onto the track table; tracks without any
# genre get a missing value here (replaced by [] further down).
spotify_df = spotify_df.merge(artists_genres_consolidated[[
                              'id', 'consolidates_genre_lists']], on='id', how='left')

# 'year' becomes the text before the first '-' in release_date (a string).
spotify_df['year'] = spotify_df['release_date'].apply(
    lambda x: x.split('-')[0])

# Names of all float64 columns; passed to create_feature_set for scaling.
float_cols = spotify_df.dtypes[spotify_df.dtypes == 'float64'].index.values


# NOTE(review): not referenced anywhere else in the visible part of the file.
ohe_cols = 'popularity'


# create 5 point buckets for popularity
spotify_df['popularity_red'] = spotify_df['popularity'].apply(
    lambda x: int(x/5))


# tfidf can't handle nulls so fill any null values with an empty list
spotify_df['consolidates_genre_lists'] = spotify_df['consolidates_genre_lists'].apply(
    lambda d: d if isinstance(d, list) else [])


def ohe_prep(df, column, new_name):
    """
    Build one-hot (dummy) columns for a single column of a dataframe.

    Parameters:
        df (pandas dataframe): Spotify Dataframe
        column (str): name of the column to encode
        new_name (str): prefix for the generated columns; each column is
            named "<new_name>|<value>"

    Returns:
        pandas dataframe with one indicator column per distinct value of
        `column` and a fresh 0..n-1 index
    """
    # Positional index so the result can be concatenated column-wise with
    # other feature frames regardless of df's own index.
    encoded = pd.get_dummies(df[column]).reset_index(drop=True)
    encoded.columns = ["|".join((new_name, str(value)))
                       for value in encoded.columns]
    return encoded

# function to build entire feature set


def create_feature_set(df, float_cols):
    """
    Process spotify df to create a final set of features that will be used to generate recommendations

    The result is the column-wise concatenation of:
      * TF-IDF weights of the genre tokens ("genre|<token>" columns),
      * min-max scaled float columns, multiplied by 0.2,
      * one-hot popularity buckets ("pop|<bucket>"), multiplied by 0.15,
      * one-hot release years ("year|<year>"), multiplied by 0.5,
    followed by the song 'id' column.

    Parameters:
        df (pandas dataframe): Spotify Dataframe; must contain the columns
            'consolidates_genre_lists' (list of str per row), 'year',
            'popularity_red', 'id' and every column named in float_cols
        float_cols (list(str)): List of float columns that will be scaled

    Returns:
        final: final set of features, one row per row of df
    """

    # tfidf genre lists
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(
        df['consolidates_genre_lists'].apply(lambda x: " ".join(x)))

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name only on releases
    # that do not have the new one yet.
    if hasattr(tfidf, 'get_feature_names_out'):
        genre_terms = tfidf.get_feature_names_out()
    else:
        genre_terms = tfidf.get_feature_names()

    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + "|" + str(term) for term in genre_terms]
    genre_df.reset_index(drop=True, inplace=True)

    # One-hot blocks are down-weighted so they do not dominate the genre part.
    year_ohe = ohe_prep(df, 'year', 'year') * 0.5
    popularity_ohe = ohe_prep(df, 'popularity_red', 'pop') * 0.15

    # scale float columns
    floats = df[float_cols].reset_index(drop=True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(
        floats), columns=floats.columns) * 0.2

    # concatenate all features (every part has a fresh 0..n-1 index, so rows
    # line up by position)
    final = pd.concat(
        [genre_df, floats_scaled, popularity_ohe, year_ohe], axis=1)

    # add song id
    final['id'] = df['id'].values

    return final


# Feature matrix for every song of the dataset (one row per song, plus 'id').
complete_feature_set = create_feature_set(
    spotify_df, float_cols=float_cols)  # .mean(axis = 0)


# # 3. Connect To Spotify API

import os

# Credentials come from the environment (the variable names spotipy itself
# recognises) instead of being hard-coded in the source file. Any secret that
# was ever committed together with this script should be rotated.
client_id = os.environ.get('SPOTIPY_CLIENT_ID')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET')
if not client_id or not client_secret:
    sys.exit("Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before running.")


scope = 'user-library-read'

if len(sys.argv) > 1:
    username = sys.argv[1]
else:
    print("Usage: %s username" % (sys.argv[0],))
    # Non-zero status: the script could not do its job.
    sys.exit(1)


# Kept as a module-level name for code that may still refer to it; the client
# used below authenticates with the user token instead.
auth_manager = SpotifyClientCredentials(
    client_id=client_id, client_secret=client_secret)

# prompt_for_user_token takes the username first and the scope second.
# Passing the scope positionally as the only positional argument put it in
# the username slot and requested no scope at all, so name both explicitly.
token = util.prompt_for_user_token(
    username=username, scope=scope,
    client_id=client_id, client_secret=client_secret,
    redirect_uri='http://localhost:8881/')
if not token:
    sys.exit("Could not obtain a Spotify access token for %s" % username)


sp = spotipy.Spotify(auth=token)

# id_name:    playlist name -> playlist id
# list_photo: playlist id   -> url of the first cover image (None if the
#             playlist has no cover)
id_name = {}
list_photo = {}
playlists_page = sp.current_user_playlists()
while playlists_page:
    for i in playlists_page['items']:
        playlist_id = i['uri'].split(':')[2]
        id_name[i['name']] = playlist_id
        cover_images = i.get('images') or []
        list_photo[playlist_id] = cover_images[0]['url'] if cover_images else None
    # The API returns playlists in pages; follow the 'next' link until the
    # last page so playlists beyond the first page are not silently dropped.
    playlists_page = sp.next(playlists_page) if playlists_page.get('next') else None


id_name


def create_necessary_outputs(playlist_name, id_dic, df):
    """
    Pull songs from a specific playlist.

    Uses the module-level spotipy client `sp`.

    Parameters:
        playlist_name (str): name of the playlist you'd like to pull from the spotify API
        id_dic (dic): dictionary that maps playlist_name to playlist_id
        df (pandas dataframe): spotify dataframe; only its 'id' column is read

    Returns:
        playlist: all songs in the playlist THAT ARE AVAILABLE IN THE KAGGLE DATASET,
            with columns 'artist', 'name', 'id', 'url', 'date_added',
            sorted by 'date_added' (most recent first). Empty (but with
            those columns) when nothing matches.

    Raises:
        KeyError: if playlist_name is not a key of id_dic
    """
    columns = ['artist', 'name', 'id', 'url', 'date_added']

    # sp.playlist() only carries the first page of tracks; keep following the
    # 'next' link so long playlists are read completely.
    page = sp.playlist(id_dic[playlist_name])['tracks']
    rows = []
    while page:
        for item in page['items']:
            track = item.get('track')
            # Entries without a track object or without a Spotify id cannot
            # match the dataset, so skip them instead of failing on them.
            if not track or not track.get('id'):
                continue

            artists = track.get('artists') or [{}]
            images = (track.get('album') or {}).get('images') or []
            # The second image is the one originally used; fall back to the
            # first one, or to None, when fewer images are available.
            if len(images) > 1:
                cover_url = images[1]['url']
            elif images:
                cover_url = images[0]['url']
            else:
                cover_url = None

            rows.append({
                'artist': artists[0].get('name'),
                'name': track.get('name'),
                'id': track['id'],
                'url': cover_url,
                'date_added': item.get('added_at'),
            })
        page = sp.next(page) if page.get('next') else None

    # Building the frame once from a list of rows (with explicit columns)
    # avoids growing it cell by cell and keeps the columns present even for
    # an empty playlist.
    playlist = pd.DataFrame(rows, columns=columns)
    playlist['date_added'] = pd.to_datetime(playlist['date_added'])

    playlist = playlist[playlist['id'].isin(df['id'].values)].sort_values(
        'date_added', ascending=False)

    return playlist


# Notebook-style echo of the playlist-name -> playlist-id mapping; as a plain
# script statement this expression has no effect.
id_name


# Fetch one playlist by name, keeping only the songs whose id is in spotify_df.
playlist_ = create_necessary_outputs('ambient shit ig?', id_name,spotify_df)


# Runs a shell command through IPython. get_ipython is not defined anywhere in
# this file, so this line only works when executed under IPython/Jupyter.
get_ipython().system('pip install scikit-image')


from skimage import io
import matplotlib.pyplot as plt

def visualize_songs(df):
    """
    Visualize cover art of the songs in the inputted dataframe

    Images are downloaded from the 'url' column and drawn in a grid that is
    5 columns wide, each labelled with the matching 'name' value. Nothing is
    drawn for an empty dataframe.

    Parameters:
        df (pandas dataframe): Playlist Dataframe with 'url' and 'name' columns
    """

    temp = df['url'].values
    names = df['name'].values
    if len(temp) == 0:
        return

    columns = 5
    # plt.subplot needs an integer row count; true division would hand it a
    # float, which matplotlib rejects.
    rows = len(temp) // columns + 1

    # Keep the height at 1 inch or more: int(0.625 * n) is 0 for n < 2.
    plt.figure(figsize=(15, max(1, int(0.625 * len(temp)))))

    for i, url in enumerate(temp):
        plt.subplot(rows, columns, i + 1)

        image = io.imread(url)
        plt.imshow(image)
        # Near-invisible ticks: white and tiny, so only the label shows.
        plt.xticks(color='w', fontsize=0.1)
        plt.yticks(color='w', fontsize=0.1)
        plt.xlabel(names[i], fontsize=12)

    # Layout only needs to be computed once, after every subplot exists.
    plt.tight_layout(h_pad=0.4, w_pad=0)
    plt.subplots_adjust(wspace=None, hspace=None)

    plt.show()


# Notebook-style echo of the playlist dataframe; no effect in a plain script.
playlist_


def generate_playlist_feature(complete_feature_set, playlist_df, weight_factor):
    """
    Summarize a user's playlist into a single vector

    Every playlist song that exists in the feature set gets the weight
    weight_factor ** (-m), where m is the number of whole 30-day periods
    between its 'date_added' and the most recent 'date_added' in the
    playlist. The summary is the weighted sum of the songs' feature rows.

    Parameters:
        complete_feature_set (pandas dataframe): Dataframe which includes all of the features for the spotify songs, plus an 'id' column
        playlist_df (pandas dataframe): playlist dataframe with 'id' and 'date_added' columns
        weight_factor (float): float value that represents the recency bias. The larger the recency bias, the more priority recent songs get. Value should be close to 1.

    Returns:
        playlist_feature_set_weighted_final (pandas series): single feature vector that summarizes the playlist, indexed by feature column name
        complete_feature_set_nonplaylist (pandas dataframe): rows of complete_feature_set whose 'id' is not in the playlist ('id' column included)

    Raises:
        ValueError: if none of the playlist's songs is present in complete_feature_set
    """
    in_playlist = complete_feature_set['id'].isin(playlist_df['id'].values)
    complete_feature_set_nonplaylist = complete_feature_set[~in_playlist]

    playlist_feature_set = complete_feature_set[in_playlist].merge(
        playlist_df[['id', 'date_added']], on='id', how='inner')

    # Without this check the failure would be an opaque positional-index
    # error further down.
    if playlist_feature_set.empty:
        raise ValueError(
            "None of the playlist's songs are present in the feature set")

    # Whole 30-day periods between each song and the most recently added one.
    date_added = pd.to_datetime(playlist_feature_set['date_added'])
    days_from_recent = (date_added.max() - date_added).dt.days
    # Cast to float so an integer weight_factor can still be raised to a
    # negative power.
    months_from_recent = (days_from_recent // 30).astype(float)
    weight = weight_factor ** (-months_from_recent)

    # Select the feature columns by name rather than by position so the
    # result does not depend on where 'id' sits in complete_feature_set.
    feature_values = playlist_feature_set.drop(columns=['id', 'date_added'])
    playlist_feature_set_weighted_final = feature_values.mul(weight, axis=0)

    return playlist_feature_set_weighted_final.sum(axis=0), complete_feature_set_nonplaylist


# Summarise the playlist into one feature vector (recency weight factor 1.09)
# and keep the feature rows of all songs that are not in the playlist.
complete_feature_set_playlist_vector_, complete_feature_set_nonplaylist_ = generate_playlist_feature(complete_feature_set, playlist_, 1.09)
#complete_feature_set_playlist_vector_chill, complete_feature_set_nonplaylist_chill = generate_playlist_feature(complete_feature_set, playlist_chill, 



# Notebook-style echo of the vector's shape; no effect in a plain script.
complete_feature_set_playlist_vector_.shape


def generate_playlist_recos(df, features, nonplaylist_features, top_n=40):
    """
    Rank the songs that are not in the playlist by similarity to it.

    Uses the module-level spotipy client `sp` to look up cover art for the
    returned songs (one API call per song).

    Parameters:
        df (pandas dataframe): spotify dataframe
        features (pandas series): summarized playlist feature; its values must be in the same order as the feature columns of nonplaylist_features
        nonplaylist_features (pandas dataframe): feature set of songs that are not in the selected playlist (feature columns plus 'id')
        top_n (int): number of recommendations to return (default 40)

    Returns:
        non_playlist_df_top_40: the top_n rows of df with the highest cosine
            similarity, with added columns 'sim' and 'url'
    """

    def _cover_url(track_id):
        # Prefer the second album image, as before; fall back to the first
        # one, or to None, when the album has fewer images.
        images = sp.track(track_id)['album']['images']
        if len(images) > 1:
            return images[1]['url']
        return images[0]['url'] if images else None

    feature_matrix = nonplaylist_features.drop('id', axis=1).values
    similarity = cosine_similarity(
        feature_matrix, features.values.reshape(1, -1))[:, 0]

    # Attach the scores by song id rather than by row position, so the result
    # stays correct even if df and nonplaylist_features are ordered differently.
    sim_by_id = pd.Series(similarity, index=nonplaylist_features['id'].values)
    sim_by_id = sim_by_id[~sim_by_id.index.duplicated()]

    # Work on a copy: adding a column to a filtered slice of df is ambiguous
    # in pandas (SettingWithCopyWarning).
    non_playlist_df = df[df['id'].isin(sim_by_id.index)].copy()
    non_playlist_df['sim'] = non_playlist_df['id'].map(sim_by_id)

    non_playlist_df_top_40 = non_playlist_df.sort_values(
        'sim', ascending=False).head(top_n).copy()
    non_playlist_df_top_40['url'] = non_playlist_df_top_40['id'].apply(_cover_url)

    return non_playlist_df_top_40


# Recommendations for the playlist: the most similar songs outside of it.
playlist_top40 = generate_playlist_recos(spotify_df, complete_feature_set_playlist_vector_, complete_feature_set_nonplaylist_)


# Notebook-style echo of the recommendations; no effect in a plain script.
playlist_top40



# Captured notebook output (pip notice):
# You should consider upgrading via the 'c:\users\ishku\appdata\local\programs\python\python37\python.exe -m pip install --upgrade pip' command.


# Captured notebook output (first rows of playlist_top40):
# Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,...,valence,year,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists,popularity_red,sim,url
# 11571,0.456,['UMI'],0.84,199227,0.344,0,630Ug0XtmhhFvAKo0PNuEI,3.4e-05,5,0.35,...,0.526,2018,[UMI],[],[UMI],UMI Remember Me,"[alternative_r&b, pop, indie_r&b, bedroom_soul]",14,0.746842,https://i.scdn.co/image/ab67616d00001e022b2fe5...
# 50163,0.964,['Pink Sweat$'],0.661,189000,0.14,1,3Um6KoMmiyZqHC9e4XNCoF,1.6e-05,6,0.106,...,0.306,2018,[Pink Sweat$],[],[Pink Sweat$],Pink Sweat$ Honesty,"[alternative_r&b, pop, indie_r&b, bedroom_soul]",13,0.744817,https://i.scdn.co/image/ab67616d00001e027a4e17...
# 55882,0.454,['Omar Apollo'],0.782,127273,0.494,0,2NCBjlH7FHEG7hXcnvUaWA,6.3e-05,0,0.134,...,0.401,2018,[Omar Apollo],[],[Omar Apollo],Omar Apollo Ugotme,"[indie_pop, indie_r&b, pop, bedroom_soul, bedr...",12,0.74257,https://i.scdn.co/image/ab67616d00001e02819ce0...
# 81053,0.606,['Khalid'],0.551,210667,0.44,0,5kfNriitmkNE8mUbZ7gbq8,3e-05,10,0.11,...,0.341,2018,[Khalid],[],[Khalid],Khalid Saturday Nights,"[alternative_r&b, pop]",14,0.732703,https://i.scdn.co/image/ab67616d00001e0260624c...
# 103495,0.915,['Giveon'],0.478,195406,0.373,0,62d6YXEYxmMWAuLpw1EysL,6e-06,5,0.128,...,0.354,2018,[Giveon],[],[Giveon],Giveon Garden Kisses,"[alternative_r&b, indie_r&b]",12,0.728014,https://i.scdn.co/image/ab67616d00001e0236e776...
# 81063,0.0765,['Khalid'],0.596,229320,0.552,0,6zeeWid2sgw4lap2jV61PZ,0.334,0,0.104,...,0.112,2018,[Khalid],[],[Khalid],Khalid Better,"[alternative_r&b, pop]",15,0.707729,https://i.scdn.co/image/ab67616d00001e0260624c...
# 40395,0.272,['Sabrina Claudio'],0.666,209885,0.39,0,3cQmqM0awej9iAESjhKrI9,9e-06,5,0.111,...,0.325,2017,[Sabrina Claudio],[],[Sabrina Claudio],Sabrina Claudio Confidently Lost,"[alternative_r&b, pop, indie_r&b]",12,0.687478,https://i.scdn.co/image/ab67616d00001e02ca1582...
# 52244,0.617,['Peach Tree Rascals'],0.676,210000,0.525,0,4ja2gzrNh9VNigzoXfmbwD,0.0,9,0.356,...,0.421,2019,[Peach Tree Rascals],[],[Peach Tree Rascals],Peach Tree Rascals Mariposa,"[alternative_r&b, pop, indie_r&b]",16,0.655875,https://i.scdn.co/image/ab67616d00001e02180bec...
# 81049,0.0626,"['Khalid', 'Swae Lee']",0.727,238893,0.72,1,5jyyPsIGM2yqkZN9R3TmvN,1e-06,11,0.176,...,0.589,2018,"[Khalid, Swae Lee]",[],"[Khalid, Swae Lee]",Khalid The Ways (with Swae Lee),"[alternative_r&b, pop, trap]",12,0.64642,https://i.scdn.co/image/ab67616d00001e02c027ad...
# 81048,0.891,['Khalid'],0.707,257960,0.484,0,25YvGDl2zSE0pH8jrMZ6aY,0.00344,9,0.256,...,0.43,2017,[Khalid],[],[Khalid],Khalid Therapy,"[alternative_r&b, pop]",12,0.64294,https://i.scdn.co/image/ab67616d00001e02988ede...
