### (0) IMPORT

In [1]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from spotipy.oauth2 import SpotifyClientCredentials
from bs4 import BeautifulSoup
from selenium import webdriver
from sklearn.cluster import KMeans
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from kneed import KneeLocator
import time
import spotipy
import pandas as pd

#BORROWING KAISHENGS CLIENT SECRET AND ID THANKS KS BB PLS DONT DISABLE YA
auth_manager = SpotifyClientCredentials(client_id="21ff73a9b5a94ea8b3a969b906baead1", client_secret="3761e7947ef542149467196a07cf2563")
sp = spotipy.Spotify(auth_manager=auth_manager)

### (1) FUNCTIONS


In [2]:
def ohe_prep(df, column, new_name): 
    ''' 
    Create One Hot Encoded features of a specific column
    ---
    Input: 
    df (pandas dataframe): Spotify Dataframe
    column (str): Column to be processed
    new_name (str): new column name to be used
        
    Output: 
    tf_df: One-hot encoded features 
    '''
    
    tf_df = pd.get_dummies(df[column])
    
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)  
    return tf_df

def get_features_database(databaseDF, feature_weights):
    #Select Features
    databaseDF = databaseDF[["id","songName",
                "danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
                "valence","tempo","type"]]

    #OHE Features
    key_ohe = ohe_prep(databaseDF, 'key','key') * feature_weights['key']
    mode_ohe = ohe_prep(databaseDF, 'mode','mode') * feature_weights['mode']

    ##Normalise/Scale Audio Columns
    float_cols = databaseDF.dtypes[databaseDF.dtypes == 'float64'].index.values
    floats = databaseDF[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns)

    # Apply weight to each float column
    for col in floats_scaled.columns:
        floats_scaled[col] *= feature_weights.get(col, 1.0)

    ##Combine all Features
    final = pd.concat([floats_scaled, key_ohe, mode_ohe, databaseDF["type"]], axis = 1)
    return final


def generate_rec(databaseDF, database_vector, user_vector):
    #Cosine Similarity
    databaseDF["sim"] = cosine_similarity(database_vector,user_vector)
    
    #Remove sim = 1 as it means its the same song
    databaseDF.drop(databaseDF[databaseDF['sim'] >= 1].index, inplace = True)

    rec_top5 = databaseDF.sort_values('sim',ascending = False).head(5)
    return rec_top5

### (2) Extract Both User and Dataset Csv

In [4]:
#Extract Database CSV
databaseDF = pd.read_csv("spotify_dataset_eda.csv", encoding="utf_8_sig")
databaseDF = databaseDF.drop(columns=['Unnamed: 0'])
#Selects columns that we want
databaseDF = databaseDF[["id","songName",
        "danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
        "valence","tempo",]]


#Extract User Playlist CSV
playlistDF = pd.read_csv("User_Playlist.csv", encoding="utf_8_sig")
playlistDF = playlistDF.drop(columns=['Unnamed: 0'])
#Reset Index
playlistDF = playlistDF.reset_index(drop = True)  
#Selects columns that we want
playlistDF = playlistDF[["id","songName",
        "danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
        "valence","tempo",]] 



### Run this for weighted

In [None]:
feature_weight = {
    "danceability": 0.6,
    "energy": 0.7,
    "key": 0.1,
    "loudness": 0.8,
    "mode": 0.1,
    "speechiness": 0.5,
    "acousticness": 0.3,
    "instrumentalness": 0.5,
    "liveness": 0.3,
    "valence": 0.7,
    "tempo": 0.6,
    "type": 0.1
}

### Run this for non weighted

In [5]:
feature_weight = {
    "danceability": 1,
    "energy": 1,
    "key": 1,
    "loudness": 1,
    "mode": 1,
    "speechiness": 1,
    "acousticness": 1,
    "instrumentalness": 1,
    "liveness": 1,
    "valence": 1,
    "tempo": 1,
    "type": 1
}

### Rec songs

In [7]:
#Merge user + dataset dataframe to normalise 
#Normalise takes min and max in dataframe as reference and change it to 0 and 1 respectively

#Group the dataframe as we gonna split it again alter
databaseDF["type"] = "Dataset"
playlistDF["type"] = "User"


#Check for duplicates in database
databaseDF = databaseDF.drop_duplicates(ignore_index= True)

#Merge the 2 datasets together
combinedDF = pd.concat([databaseDF,playlistDF], ignore_index=True)


##Normalise and get Vectors for Dataset + User
normalised_vector = get_features_database(combinedDF,feature_weight)

##Seperate User from databaseDF 
database_vector = normalised_vector[normalised_vector["type"] == "Dataset"]
user_vector = normalised_vector[normalised_vector["type"] == "User"]

#Drop "type" column
database_vector = database_vector.drop(columns="type")
user_vector = user_vector.drop(columns="type")
databaseDF = databaseDF.drop(columns=["type"])


##Single Vector Creation
final_user_vector_list = []
for i in user_vector.columns:
    final_user_vector_list.append(user_vector[i].sum()/len(user_vector[i]))

#Putting into a vector dataframe
final_user_vector = pd.DataFrame(columns=user_vector.columns,)
final_user_vector.loc[0] = final_user_vector_list

##Generate Recc Songs
result = generate_rec(databaseDF,database_vector,final_user_vector)
result

result['id'] = result['id'].apply(lambda x: 'https://open.spotify.com/track/' + str(x))
# Convert the 'id' column to HTML hyperlinks
pd.set_option('display.max_colwidth', None)


Unnamed: 0,id,songName,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumental,liveness,valence,tempo,sim
212146,1yPkYnRgZveWLCUSWX16ra,HAPPY HAPPY,0.609,0.932,10,-1.562,1,0.145,0.173,0.000946,0.396,0.755,179.995,0.872782
154217,1GQP3AKkB2KFn4KCeFBPaJ,BOMB (Feat. San E),0.778,0.938,10,-2.959,1,0.225,0.306,0.0,0.363,0.739,145.022,0.871953
171125,5EUhnFBEuuUZJfKCtDnriX,Draw 4 That,0.643,0.976,10,-0.559,1,0.297,0.000791,0.00198,0.595,0.686,174.009,0.87031
509926,3ACzWKB7R56sGDzEa3ZQH8,Die Pure Lust am Leben,0.717,1.0,10,-2.877,1,0.0769,0.0176,0.000123,0.302,0.688,142.019,0.869735
535379,1iYyTTPydzJmvnmgwvO3w5,Die pure Lust am Leben - Sommer Version,0.711,1.0,10,-1.586,1,0.0705,0.0376,1e-06,0.196,0.737,142.007,0.869385
