In [1]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from Secrets import CLIENT_ID, CLIENT_SECRET, REDIRECT_URL
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from dateutil.relativedelta import relativedelta
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Initialize the Spotify client with OAuth for accessing user library
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=CLIENT_ID,
                                               client_secret=CLIENT_SECRET,
                                               redirect_uri=REDIRECT_URL,
                                               scope="user-library-read"))

df = pd.read_csv('./dataset.csv')
df = df[df.columns[1:]]
df = df.drop(columns=['time_signature', 'duration_ms'])

pd.set_option('display.max_rows',500)
pd.set_option('display.max_columns',504)
pd.set_option('display.width',1000)


## Feature Engineering

In [3]:
def feature_engineer(df):
    scaler = MinMaxScaler()
    
    features_to_scale = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence']
    features_to_OHE = ['key', 'explicit', 'popularity']
    
    tfidf = TfidfVectorizer()

    #tfidf cannot handle null values
    df['track_genre'] = df['track_genre'].ffill()
    tfidf_matrix = tfidf.fit_transform(df['track_genre'])
    
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre_' + i for i in tfidf.get_feature_names_out()]
    # Reset index of genre_df to allow for clean concatenation later
    genre_df.reset_index(drop = True, inplace=True)

    #scale numerical features
    df[features_to_scale] = scaler.fit_transform(df[features_to_scale])
    
    def perform_OHE(df, column):
        OHE = pd.get_dummies(df[column], drop_first=True, dtype=int, prefix=column)
        df = df.drop(columns=column, axis='columns')
        df = pd.concat([df, OHE], axis=1)
        return df

    #Seperate popularity into 20 buckets -> OHE the popularity
    df['popularity'] = df['popularity'].apply(lambda x: x//5)

    for feature in features_to_OHE:
        df = perform_OHE(df, feature)

    #Combine the main df and the genre_df from the tfidf
    df = pd.concat([df, genre_df], axis=1)
    df = df.drop(columns=['track_genre'], axis = 'columns')
    return df

In [4]:
#6meRpNHvKC1VcBl5MIbVxo - hehe
#1Ob3QykC4dzf3rQwWxE9LV - study
#2dxVgP67xqsTCTfaVed4E0 - test

playlist_id = '6meRpNHvKC1VcBl5MIbVxo'
playlist_length = sp.playlist_items(playlist_id)['total']

track_ids = []
date_added = {} #Tracks when a song was added to a playlist

#sp.playlist_track has a limit of 100 items per API call
for i in range(playlist_length // 100 + 1):
    playlist = sp.playlist_tracks(playlist_id, market = 'CAN',fields='items', limit=100, offset = i * 100 )
    #playlist[items] has information on all the tracks in the playlist
    for index, track in enumerate(playlist['items']):
        if track['track'] is not None and track['track']['id'] is not None:
            date_added[track['track']['id']] = playlist['items'][index]['added_at']
            track_ids.append(track['track']['id'])

In [5]:
artist_genres = {}

playlist_track_audio_features = []

#
for i in range(0,len(track_ids), 100):
    playlist_track_audio_features.extend(sp.audio_features(track_ids[i : i + 100]))

playlist_track_info = []
for i in range(0, len(track_ids), 50):
    playlist_track_info.extend(sp.tracks(track_ids[i : i + 50])['tracks'])

In [6]:
track_audio_features = [audio_feature for audio_feature in playlist_track_audio_features]
track_info = [info for info in playlist_track_info]
track_artist_ids = [info['artists'][0]['id'] for info in playlist_track_info]

In [7]:

for i in range(0, len(track_artist_ids), 50):
    results = sp.artists(track_artist_ids[i: i + 50])
    for artist in results['artists']:
        #Only get the first (main) genre from an artist as artist's have multiple genres
        artist_genres[artist['name']] = artist_genres.get(artist['name'], artist['genres'][0] if artist['genres'] else None)

In [8]:
for index ,id in enumerate(track_ids):
    track_artists = [artist['name'] for artist in track_info[index]['artists']]
    main_artist_genre = artist_genres[track_artists[0]]
    audio_features_keys = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
    audio_data = {
        'track_id': id,
        'artists': ';'.join(track_artists),
        'album_name': track_info[index]['album']['name'],
        'track_name': track_info[index]['name'],
        'popularity': track_info[index]['popularity'],
        'explicit': track_info[index]['explicit'],
        **{key: track_audio_features[index][key] for key in audio_features_keys},
        'track_genre': main_artist_genre
    }
    df.loc[len(df.index)] = audio_data

In [9]:
df = feature_engineer(df)

In [10]:
track_data = df.tail(playlist_length - 1).copy()
df = df.iloc[:-playlist_length - 1]
track_data['date_added'] = date_added.values()


In [11]:
# Assuming track_data is already defined and feature_engineered
track_data = pd.DataFrame(track_data)
track_data['date_added'] = pd.to_datetime(track_data['date_added']).dt.date
first_date = track_data.iloc[0]['date_added']
last_date = track_data.iloc[-1]['date_added']

# Calculate total months difference for normalization
total_months = relativedelta(last_date, first_date).months + 1

# Calculate weights using linear descent
weights = []
for index, row in track_data.iterrows():
    months_difference = relativedelta(row['date_added'], first_date).months
    # Linear descent weighting
    weights.append(round(1/(months_difference + 1), 3))

track_data['weight'] = weights
track_data = track_data.drop(columns=['date_added'])


In [12]:
start_index = 0
end_index = 1
num_rows = len(track_data.values)
num_cols = len(track_data.columns)
results_array = [0] * (num_cols - 5)
for i in range(num_rows):
    row = track_data.iloc[i]
    row_weight = track_data.iloc[i, -1]
    for index, value in enumerate(row[4:-1]):
        results_array[index] += value * row_weight


In [13]:
vectors = df.iloc[:, 4:].values
input_vector = np.array(results_array)
similarity_matrix = cosine_similarity(input_vector.reshape(1,-1), vectors)

similarity_array = similarity_matrix.flatten()

In [14]:
top_20_songs = set()
counter = 0
while len(top_20_songs) < 20:
    ith_most_similar_index = similarity_array.argsort()[::-1][counter]
    track_name = df.iloc[ith_most_similar_index]['track_name']
    artist = df.iloc[ith_most_similar_index]['artists']
    top_20_songs.add(track_name + " | " + artist)
    counter += 1

In [15]:
for track in top_20_songs:
    print(track)

Se Menea | Don Omar;Nio Garcia
Solid | Burna Boy;Blxst;Kehlani
Usain Boo | Kodak Black
Pursuit Of Happiness - Extended Steve Aoki Remix | Kid Cudi;MGMT;Ratatat;Steve Aoki
Clout | Offset;Cardi B
Your Turn | Ty Dolla $ign;Musiq Soulchild;Tish Hyman;6LACK
Real Life | Burna Boy;Stormzy
WORKIN ME | Quavo
RAP DEVIL | Machine Gun Kelly
Banking on Me | Gunna
Polo Jeans | Mac Miller;Earl Sweatshirt
Save You | Kodak Black
Me and Klik | Kodak Black;John Wicks;Jackboy
Ball If I Want To | DaBaby
bloody valentine | Machine Gun Kelly
Codeine Dreaming | Kodak Black;Lil Wayne
Gave It | Moneybagg Yo;Big Homiie G
Bezerk | Big Sean;Hit-Boy;A$AP Ferg
Perreito | Mariah Angeliq
Grrrls | Lizzo
