In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

FUNCTIONSSS

In [2]:
def ohe_prep(df, column, new_name): 
    ''' 
    Create One Hot Encoded features of a specific column
    ---
    Input: 
    df (pandas dataframe): Spotify Dataframe
    column (str): Column to be processed
    new_name (str): new column name to be used
        
    Output: 
    tf_df: One-hot encoded features 
    '''
    
    tf_df = pd.get_dummies(df[column])
    
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)  
    return tf_df

def get_features_database(databaseDF):
    #Select Features
    databaseDF = databaseDF[["id","songName",
                "danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
                "valence","tempo","type"]]

    #OHE Features
    key_ohe = ohe_prep(databaseDF, 'key','key') * 0.5
    mode_ohe = ohe_prep(databaseDF, 'mode','mode') * 0.5

    ##Normalise/Scale Audio Columns
    float_cols = databaseDF.dtypes[databaseDF.dtypes == 'float64'].index.values
    floats = databaseDF[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) * 0.2

    ##Combine all Features
    final = pd.concat([floats_scaled, key_ohe, mode_ohe, databaseDF["type"]], axis = 1)
    return final

def generate_rec(databaseDF, database_vector, user_vector, genre_top3):
    #Cosine Similarity
    databaseDF["sim"] = cosine_similarity(database_vector,user_vector)
    
    #Drop rows with different genre from top 3 genres
    databaseDF = databaseDF[(databaseDF["genre"] == genre_top3[0]) | (databaseDF["genre"] == genre_top3[1]) | (databaseDF["genre"] == genre_top3[2])]
    #Sort and recommend top 5 with same genres
    rec_top5 = databaseDF.sort_values('sim',ascending = False).head()
    return rec_top5

Changing Genre Names from UserPlaylist(Spotify API Genre Names) to Dataset Genre Names

In [3]:
df = pd.read_csv("spotify_data_cleaned.csv")
df = df.drop(columns=["Unnamed: 0"])
df = df.dropna()
genres_list = []
for i in df["genre"]:
    if i not in genres_list:
        genres_list.append(i)

In [4]:
userDF = pd.read_csv("User_Playlist.csv")
userDF = userDF.drop(columns=["Unnamed: 0"])

song_genres_list = []
for song_genres in userDF["genre"]:
    one_song_genre = []
    for genre in genres_list:
        if genre in song_genres:
            one_song_genre.append(genre)
        elif " " in genre:
            for word in genre.split(" "):
                if word in song_genres:
                    one_song_genre.append(genre)
    song_genres_list.append(one_song_genre)

userDF["genre"] = song_genres_list
userDF

Unnamed: 0,id,songName,albumName,albumID,artist,genre,explicit,releaseDate,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumental,liveness,valence,tempo,duration_ms,time_signature
0,4RGWHfQeJftd5XrP8JUgFj,Metamodernity,Metamodernity,1lgZZELIZZwafQJDycr47q,Vansire,"[indie pop, indie pop, pop, power pop, rock n ...",False,2019-06-11,0.823,0.544,...,-10.077,1,0.0358,0.389,0.165,0.108,0.826,121.968,162663,4


Find Top 3 Genres in User Playlist By Countign Frequency of Genres

In [5]:
exploded_df = userDF.explode('genre')

freq = exploded_df["genre"].value_counts(sort=True)

genres_count = dict(freq.head(3))
genre_top3 = list(genres_count.keys())
print(genre_top3)

['indie pop', 'pop', 'power pop']


Extract Both User and Dataset Csv

In [6]:
#Extract Database CSV
databaseDF = pd.read_csv("spotify_data_cleaned.csv", encoding="utf_8_sig")
databaseDF = databaseDF.drop(columns=['Unnamed: 0'])
#Selects columns that we want
databaseDF = databaseDF[["id","artist","songName", "genre","danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
        "valence","tempo",]]

#Create new feature/column of artist_songName
databaseDF["artist_songName"] = databaseDF["artist"] + "_" + databaseDF["songName"]

#Check for duplicates in database
databaseDF = databaseDF.drop_duplicates(subset=["artist_songName"],ignore_index= True)

In [7]:
#Extract User Playlist CSV
playlistDF = pd.read_csv("User_Playlist.csv", encoding="utf_8_sig")

playlistDF = playlistDF.drop(columns=['Unnamed: 0'])
#Reset Index
playlistDF = playlistDF.reset_index(drop = True)  
#Selects columns that we want
playlistDF = playlistDF[["id","artist","songName","danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
        "valence","tempo",]] 

#Create new feature/column of artist_songName
playlistDF["artist_songName"] = playlistDF["artist"] + "_" + playlistDF["songName"]

#Check for duplicates in user Playlist
playlistDF = playlistDF.drop_duplicates(subset=["artist_songName"],ignore_index= True)

Normalise Features

In [8]:
#Merge user + dataset dataframe to normalise 
#Normalise takes min and max in dataframe as reference and change it to 0 and 1 respectively

#Group the dataframe as we gonna split it again alter
databaseDF["type"] = "Dataset"
playlistDF["type"] = "User"


#Merge the 2 datasets together
combinedDF = pd.concat([databaseDF,playlistDF], ignore_index=True)

#Check for duplicates between user and Database
combinedDF = combinedDF.drop_duplicates(subset=["artist_songName"], keep="last",ignore_index= True)

##Update databaseDF with removed songs from user Playlist
databaseDF = combinedDF[(combinedDF["type"]== "Dataset")]

##Normalise and get Vectors for Dataset + User
normalised_vector = get_features_database(combinedDF)

##Seperate User from databaseDF 
database_vector = normalised_vector[normalised_vector["type"] == "Dataset"]
user_vector = normalised_vector[normalised_vector["type"] == "User"]

#Drop "type" column
database_vector = database_vector.drop(columns="type")
user_vector = user_vector.drop(columns="type")
databaseDF = databaseDF.drop(columns=["type"])


Merge all User Vector to form single vector

In [9]:
##Single Vector Creation
final_user_vector_list = []
for i in user_vector.columns:
    final_user_vector_list.append(user_vector[i].sum()/len(user_vector[i]))

#Putting into a vector dataframe
final_user_vector = pd.DataFrame(columns=user_vector.columns,)
final_user_vector.loc[0] = final_user_vector_list



RECOMMEND TIME!

In [10]:
##Generate Recc Songs
result = generate_rec(databaseDF,database_vector,final_user_vector, genre_top3)
result

Unnamed: 0,id,artist,songName,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumental,liveness,valence,tempo,artist_songName,sim
639443,02tLCrTnAQ5feS5oRoUjmJ,Daniele Silvestri,Testardo,indie pop,0.816,0.547,5,-8.552,1,0.0531,0.455,0.0,0.083,0.823,112.032,Daniele Silvestri_Testardo,0.998851
562020,1gH1h30wkQdd9zhY3j7a8T,Rex Orange County,THE SHADE,pop,0.898,0.509,5,-7.973,1,0.062,0.391,0.0,0.0876,0.727,119.869,Rex Orange County_THE SHADE,0.998469
644615,4KoecuyOpZaNFZ0UqVsllc,Uncle Kracker,Follow Me,pop,0.817,0.585,5,-4.688,1,0.0301,0.439,0.0,0.147,0.916,105.014,Uncle Kracker_Follow Me,0.998305
639371,5DgWRzfucGw1TcZlnIAGdu,Cheer Chen,下午三點,indie pop,0.747,0.616,5,-7.865,1,0.0261,0.282,0.0,0.101,0.873,119.987,Cheer Chen_下午三點,0.998233
502416,78NO8ptHSWTIAmjtZHxC7e,Zeph,miss me,indie pop,0.699,0.57,5,-10.277,1,0.0291,0.534,0.0974,0.0871,0.808,150.973,Zeph_miss me,0.998143
