In [6]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

FUNCTIONSSS

In [3]:
def ohe_prep(df, column, new_name): 
    ''' 
    Create One Hot Encoded features of a specific column
    ---
    Input: 
    df (pandas dataframe): Spotify Dataframe
    column (str): Column to be processed
    new_name (str): new column name to be used
        
    Output: 
    tf_df: One-hot encoded features 
    '''
    
    tf_df = pd.get_dummies(df[column])
    
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)  
    return tf_df


def generate_rec(databaseDF, database_vector, user_vector):
    #Cosine Similarity
    databaseDF["sim"] = cosine_similarity(database_vector,user_vector)
    
    #Remove sim = 1 as it means its the same song
    databaseDF.drop(databaseDF[databaseDF['sim'] >= 1].index, inplace = True)

    rec_top5 = databaseDF.sort_values('sim',ascending = False).head(5)
    return rec_top5

WEIGHTAGE!

In [26]:
def get_features_database(databaseDF, feature_weights):
    #Select Features
    databaseDF = databaseDF[["id","songName",
                "danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
                "valence","tempo","type"]]

    #OHE Features
    key_ohe = ohe_prep(databaseDF, 'key','key') * feature_weights['key']
    mode_ohe = ohe_prep(databaseDF, 'mode','mode') * feature_weights['mode']

    ##Normalise/Scale Audio Columns
    float_cols = databaseDF.dtypes[databaseDF.dtypes == 'float64'].index.values
    floats = databaseDF[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns)

    # Apply weight to each float column
    for col in floats_scaled.columns:
        floats_scaled[col] *= feature_weights.get(col, 1.0)

    ##Combine all Features
    final = pd.concat([floats_scaled, key_ohe, mode_ohe, databaseDF["type"]], axis = 1)
    return final

#RUN THE STUFF BELOW (in indiv cells)
# feature_weight = {
#     "danceability": 0.8,
#     "energy": 0.7,
#     "key": 0.1,
#     "loudness": 0.6,
#     "mode": 0.1,
#     "speechiness": 0.1,
#     "acousticness": 0.1,
#     "instrumental": 0.6,
#     "liveness": 0.1,
#     "valence": 0.7,
#     "tempo": 0.5,
# }

Extract Both User and Dataset Csv

In [None]:
# #Extract Database CSV
# databaseDF = pd.read_csv("Final_RDS.csv", encoding="utf_8_sig")
# databaseDF = databaseDF.drop(columns=['Unnamed: 0'])
# #Selects columns that we want
# databaseDF = databaseDF[["id","songName", "artist",
#         "danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
#         "valence","tempo",]]


In [9]:
#Extract User Playlist CSV
playlistDF = pd.read_csv("User_Playlist.csv", encoding="utf_8_sig")
# playlistDF = playlistDF.drop(columns=['Unnamed: 0'])
#Reset Index
playlistDF = playlistDF.reset_index(drop = True)  
#Selects columns that we want
playlistDF = playlistDF[["id","songName", "albumName", "artist", 
        "danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
        "valence","tempo",]] 



Normalise Features

In [10]:
#Merge user + dataset dataframe to normalise 
# #Normalise takes min and max in dataframe as reference and change it to 0 and 1 respectively

# #Group the dataframe as we gonna split it again alter
# databaseDF["type"] = "Dataset"
# playlistDF["type"] = "User"


# #Check for duplicates in database
# databaseDF = databaseDF.drop_duplicates(ignore_index= True)

# #Merge the 2 datasets together
# combinedDF = pd.concat([databaseDF,playlistDF], ignore_index=True)


# ##Normalise and get Vectors for Dataset + User
# normalised_vector = get_features_database(combinedDF, feature_weight)

# ##Seperate User from databaseDF 
# database_vector = normalised_vector[normalised_vector["type"] == "Dataset"]
# user_vector = normalised_vector[normalised_vector["type"] == "User"]

# #Drop "type" column
# database_vector = database_vector.drop(columns="type")
# user_vector = user_vector.drop(columns="type")
# databaseDF = databaseDF.drop(columns=["type"])

## Merge all User Vector to form single vector

### FOR COMPARING CLUSTER VS NO CLUSTER (IRRELEVANT FOR THIS)

In [11]:
##Single Vector Creation
final_user_vector_list = []
for i in user_vector.columns:
    final_user_vector_list.append(user_vector[i].sum()/len(user_vector[i]))

#Putting into a vector dataframe
final_user_vector = pd.DataFrame(columns=user_vector.columns,)
final_user_vector.loc[0] = final_user_vector_list



### RECOMMEND TIME!

check out the following feature weightage

In [20]:
#Rec for "Red Rum"
#just featyure_weight1 after normalisation
feature_weight1 = {
    "danceability": 0.8,
    "energy": 0.7,
    "key": 0.1,
    "loudness": 0.6,
    "mode": 0.1,
    "speechiness": 0.1,
    "acousticness": 0.1,
    "instrumental": 0.6,
    "liveness": 0.1,
    "valence": 0.7,
    "tempo": 0.5,
}

#Group the dataframe as we gonna split it again alter
databaseDF["type"] = "Dataset"
playlistDF["type"] = "User"
#Check for duplicates in database
databaseDF = databaseDF.drop_duplicates(ignore_index= True)
#Merge the 2 datasets together
combinedDF = pd.concat([databaseDF,playlistDF], ignore_index=True)
##Normalise and get Vectors for Dataset + User
normalised_vector = get_features_database(combinedDF, feature_weight1)
##Seperate User from databaseDF 
database_vector = normalised_vector[normalised_vector["type"] == "Dataset"]
user_vector = normalised_vector[normalised_vector["type"] == "User"]
#Drop "type" column
database_vector = database_vector.drop(columns="type")
user_vector = user_vector.drop(columns="type")
databaseDF = databaseDF.drop(columns=["type"])


fin_feat=["id", "artist","songName", "genre","danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness", "valence","tempo"]
result = generate_rec(databaseDF,database_vector,final_user_vector)
com_res = pd.concat([playlistDF, result], ignore_index=True)[fin_feat]
com_res

Unnamed: 0,id,artist,songName,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumental,liveness,valence,tempo
0,52eIcoLUM25zbQupAZYoFh,21 Savage,redrum,,0.624,0.74,2,-8.445,1,0.0481,0.00529,0.000224,0.5,0.246,172.089
1,6MK98D2xaDhemWLxjDe0n2,The Lacs,American Rebelution,country,0.596,0.708,2,-7.761,1,0.14,0.0138,2e-06,0.487,0.245,156.099
2,51zzRom8YK6scBWRbdyA7f,CHXPO,Absolutely Not,rock,0.631,0.688,2,-9.32,1,0.199,0.0218,0.0,0.334,0.206,161.047
3,4wBJJA2whqsFWJhMSq38f9,Dr. Fresch,Timeless,house,0.638,0.782,2,-5.051,1,0.277,0.0816,0.0,0.114,0.27,180.032
4,0hLnS9I4pa1sNOAFiQvyNW,Indivision,Insommnia - Boxplot Remix,electro,0.657,0.82,2,-5.567,1,0.0489,0.00212,0.0114,0.115,0.221,173.97
5,16IOzE4Kxyoi7Yr5O0PMws,Steven Moses,Pain You Keep,rock,0.63,0.734,2,-5.979,1,0.169,0.188,0.0,0.145,0.26,159.972


In [23]:
#Rec for "Red Rum"
#equal weightage after normalisation
equal_weights = {
    "danceability": 1.0,
    "energy": 1.0,
    "key": 1.0,
    "loudness": 1.0,
    "mode": 1.0,
    "speechiness": 1.0,
    "acousticness": 1.0,
    "instrumental": 1.0,
    "liveness": 1.0,
    "valence": 1.0,
    "tempo": 1.0,
}

#Group the dataframe as we gonna split it again alter
databaseDF["type"] = "Dataset"
playlistDF["type"] = "User"
#Check for duplicates in database
databaseDF = databaseDF.drop_duplicates(ignore_index= True)
#Merge the 2 datasets together
combinedDF = pd.concat([databaseDF,playlistDF], ignore_index=True)
##Normalise and get Vectors for Dataset + User
normalised_vector = get_features_database(combinedDF, equal_weights)
##Seperate User from databaseDF 
database_vector = normalised_vector[normalised_vector["type"] == "Dataset"]
user_vector = normalised_vector[normalised_vector["type"] == "User"]
#Drop "type" column
database_vector = database_vector.drop(columns="type")
user_vector = user_vector.drop(columns="type")
databaseDF = databaseDF.drop(columns=["type"])

fin_feat=["id", "artist","songName", "genre","danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness", "valence","tempo"]
result = generate_rec(databaseDF,database_vector,final_user_vector)
com_res = pd.concat([playlistDF, result], ignore_index=True)[fin_feat]
com_res

# import seaborn as sns
# import matplotlib.pyplot as plt

# # Extract the numerical features for EDA
# numeric_features = ["danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumental", "liveness", "valence", "tempo"]

# # Set up subplots
# fig, axes = plt.subplots(nrows=len(numeric_features), ncols=1, figsize=(8, 4 * len(numeric_features)))
# fig.suptitle('Exploratory Data Analysis for Numerical Features', y=1.02)

# # Plot distribution for each numerical feature
# for i, feature in enumerate(numeric_features):
#     sns.histplot(com_res[feature], kde=True, ax=axes[i])
#     axes[i].set_title(f'Distribution of {feature}')
#     axes[i].set_xlabel(feature)

# plt.tight_layout()
# plt.show()

Unnamed: 0,id,artist,songName,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumental,liveness,valence,tempo
0,52eIcoLUM25zbQupAZYoFh,21 Savage,redrum,,0.624,0.74,2,-8.445,1,0.0481,0.00529,0.000224,0.5,0.246,172.089
1,1HJU78CRk4vxvjE5Cs1BCt,Dj Guuga,Vidrado Em Você,hip hop,0.844,0.925,2,0.303,1,0.1,0.14,8e-06,0.109,0.589,159.929
2,4FqfmTZJdYB40bFHpzlxDg,TJR,Polluted (Taito Remix),rock,0.88,0.996,2,-3.695,1,0.141,0.0702,0.0573,0.125,0.349,128.02
3,6J5yPwIpKrE6KBlpfgf87u,Mc Delux,Ai Ai Ai Calma,hip hop,0.939,0.982,2,1.751,1,0.0753,0.382,6.7e-05,0.15,0.687,130.011
4,31IIUscG9frMHRzlnwFD2z,MC Mazzie,Bate na Minha Bunda,hip hop,0.944,0.871,2,-2.31,1,0.0573,0.0815,0.121,0.242,0.397,130.028
5,71AN3TvZYttyaU3wfcKRYu,Fast Eddie,I Believe - Jez Pereira & Madoc Instrumental R...,house,0.837,0.964,2,-2.668,1,0.0605,0.000494,0.0501,0.111,0.387,125.976


In [24]:
#Rec for "Red Rum"
#feature_weight2 after normalisation
feature_weight2 = {
    "danceability": 0.8,
    "energy": 0.9,
    "key": 0.2,
    "loudness": 0.7,
    "mode": 0.2,
    "speechiness": 0.3,
    "acousticness": 0.4,
    "instrumental": 0.8,
    "liveness": 0.3,
    "valence": 0.9,
    "tempo": 0.6,
}

#Group the dataframe as we gonna split it again alter
databaseDF["type"] = "Dataset"
playlistDF["type"] = "User"
#Check for duplicates in database
databaseDF = databaseDF.drop_duplicates(ignore_index= True)
#Merge the 2 datasets together
combinedDF = pd.concat([databaseDF,playlistDF], ignore_index=True)
##Normalise and get Vectors for Dataset + User
normalised_vector = get_features_database(combinedDF, feature_weight2)
##Seperate User from databaseDF 
database_vector = normalised_vector[normalised_vector["type"] == "Dataset"]
user_vector = normalised_vector[normalised_vector["type"] == "User"]
#Drop "type" column
database_vector = database_vector.drop(columns="type")
user_vector = user_vector.drop(columns="type")
databaseDF = databaseDF.drop(columns=["type"])

fin_feat=["id", "artist","songName", "genre","danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness", "valence","tempo"]
result = generate_rec(databaseDF,database_vector,final_user_vector)
com_res = pd.concat([playlistDF, result], ignore_index=True)[fin_feat]
com_res

Unnamed: 0,id,artist,songName,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumental,liveness,valence,tempo
0,52eIcoLUM25zbQupAZYoFh,21 Savage,redrum,,0.624,0.74,2,-8.445,1,0.0481,0.00529,0.000224,0.5,0.246,172.089
1,6HdAtWhm4hCCZbYYZvGdF1,Bostich,El Vergel,electro,0.789,0.671,2,-7.599,1,0.0477,0.00448,0.00136,0.0951,0.242,160.018
2,1hHbmfNfXIwHyHCYO8ae3I,Nicki Minaj,Hard White,r&b,0.763,0.736,2,-3.403,1,0.212,0.08,0.0272,0.145,0.273,173.952
3,3TtBVJSuxXe0NPha0lhfvx,Young Buck,Taking Hits,techno,0.811,0.788,2,-5.494,1,0.12,0.0587,0.0,0.278,0.318,159.861
4,3ciiXrEhTyuo3wAFHBkHy3,Birdman,Out The Pound,hip hop,0.737,0.707,2,-4.85,1,0.181,0.0134,0.0,0.328,0.27,160.016
5,4QPDhJZOme7hpreUuOSLWE,Mike Posner,Be As You Are - JordanXL Remix,r&b,0.67,0.686,2,-5.486,1,0.0509,0.062,0.0,0.216,0.208,170.037


In [25]:
#Rec for "Red Rum"
#feature_weightage3 after normalisation
feature_weight3 = {
    "danceability": 0.8,
    "energy": 0.7,
    "key": 0.1,
    "loudness": 0.6,
    "mode": 0.1,
    "speechiness": 0.1,
    "acousticness": 0.2,
    "instrumental": 0.6,
    "liveness": 0.1,
    "valence": 0.7,
    "tempo": 0.5,
}

#Group the dataframe as we gonna split it again alter
databaseDF["type"] = "Dataset"
playlistDF["type"] = "User"
#Check for duplicates in database
databaseDF = databaseDF.drop_duplicates(ignore_index= True)
#Merge the 2 datasets together
combinedDF = pd.concat([databaseDF,playlistDF], ignore_index=True)
##Normalise and get Vectors for Dataset + User
normalised_vector = get_features_database(combinedDF, feature_weight3)
##Seperate User from databaseDF 
database_vector = normalised_vector[normalised_vector["type"] == "Dataset"]
user_vector = normalised_vector[normalised_vector["type"] == "User"]
#Drop "type" column
database_vector = database_vector.drop(columns="type")
user_vector = user_vector.drop(columns="type")
databaseDF = databaseDF.drop(columns=["type"])

fin_feat=["id", "artist","songName", "genre","danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness", "valence","tempo"]
result = generate_rec(databaseDF,database_vector,final_user_vector)
com_res = pd.concat([playlistDF, result], ignore_index=True)[fin_feat]
com_res

Unnamed: 0,id,artist,songName,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumental,liveness,valence,tempo
0,52eIcoLUM25zbQupAZYoFh,21 Savage,redrum,,0.624,0.74,2,-8.445,1,0.0481,0.00529,0.000224,0.5,0.246,172.089
1,6MK98D2xaDhemWLxjDe0n2,The Lacs,American Rebelution,country,0.596,0.708,2,-7.761,1,0.14,0.0138,2e-06,0.487,0.245,156.099
2,51zzRom8YK6scBWRbdyA7f,CHXPO,Absolutely Not,rock,0.631,0.688,2,-9.32,1,0.199,0.0218,0.0,0.334,0.206,161.047
3,4wBJJA2whqsFWJhMSq38f9,Dr. Fresch,Timeless,house,0.638,0.782,2,-5.051,1,0.277,0.0816,0.0,0.114,0.27,180.032
4,0hLnS9I4pa1sNOAFiQvyNW,Indivision,Insommnia - Boxplot Remix,electro,0.657,0.82,2,-5.567,1,0.0489,0.00212,0.0114,0.115,0.221,173.97
5,5nqsDE9UBnIgmH7eaqmOWX,Poshlaya Molly,Клеопатри,electro,0.609,0.701,2,-7.492,1,0.039,0.000497,6.1e-05,0.175,0.285,157.12
