### (0) IMPORT

In [1]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from spotipy.oauth2 import SpotifyClientCredentials
from bs4 import BeautifulSoup
from selenium import webdriver
from sklearn.cluster import KMeans
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from kneed import KneeLocator
import time
import spotipy
import pandas as pd

#BORROWING KAISHENGS CLIENT SECRET AND ID THANKS KS BB PLS DONT DISABLE YA
auth_manager = SpotifyClientCredentials(client_id="21ff73a9b5a94ea8b3a969b906baead1", client_secret="3761e7947ef542149467196a07cf2563")
sp = spotipy.Spotify(auth_manager=auth_manager)

### (1) FUNCTIONS


In [2]:
def ohe_prep(df, column, new_name): 
    ''' 
    Create One Hot Encoded features of a specific column
    ---
    Input: 
    df (pandas dataframe): Spotify Dataframe
    column (str): Column to be processed
    new_name (str): new column name to be used
        
    Output: 
    tf_df: One-hot encoded features 
    '''
    
    tf_df = pd.get_dummies(df[column])
    
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)  
    return tf_df

def get_features_database(databaseDF, feature_weights):
    #Select Features
    databaseDF = databaseDF[["id","songName",
                "danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
                "valence","tempo","type"]]

    #OHE Features
    key_ohe = ohe_prep(databaseDF, 'key','key') * feature_weights['key']
    mode_ohe = ohe_prep(databaseDF, 'mode','mode') * feature_weights['mode']

    ##Normalise/Scale Audio Columns
    float_cols = databaseDF.dtypes[databaseDF.dtypes == 'float64'].index.values
    floats = databaseDF[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns)

    # Apply weight to each float column
    for col in floats_scaled.columns:
        floats_scaled[col] *= feature_weights.get(col, 1.0)

    ##Combine all Features
    final = pd.concat([floats_scaled, key_ohe, mode_ohe, databaseDF["type"]], axis = 1)
    return final


feature_weight = {
    "danceability": 0.6,
    "energy": 0.7,
    "key": 0.1,
    "loudness": 0.8,
    "mode": 0.1,
    "speechiness": 0.5,
    "acousticness": 0.3,
    "instrumentalness": 0.5,
    "liveness": 0.3,
    "valence": 0.7,
    "tempo": 0.6,
    "type": 0.1
}

def generate_rec(databaseDF, database_vector, user_vector):
    #Cosine Similarity
    databaseDF["sim"] = cosine_similarity(database_vector,user_vector)
    
    #Remove sim = 1 as it means its the same song
    databaseDF.drop(databaseDF[databaseDF['sim'] >= 1].index, inplace = True)

    rec_top5 = databaseDF.sort_values('sim',ascending = False).head(5)
    return rec_top5

### Only weightage

In [3]:
#Extract Database CSV
databaseDF = pd.read_csv("spotify_dataset_eda.csv", encoding="utf_8_sig")
databaseDF = databaseDF.drop(columns=['Unnamed: 0'])
#Selects columns that we want
databaseDF = databaseDF[["id","songName",
        "danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
        "valence","tempo",]]


#Extract User Playlist CSV
playlistDF = pd.read_csv("User_Playlist.csv", encoding="utf_8_sig")
playlistDF = playlistDF.drop(columns=['Unnamed: 0'])
#Reset Index
playlistDF = playlistDF.reset_index(drop = True)  
#Selects columns that we want
playlistDF = playlistDF[["id","songName",
        "danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
        "valence","tempo",]] 




#Merge user + dataset dataframe to normalise 
#Normalise takes min and max in dataframe as reference and change it to 0 and 1 respectively

#Group the dataframe as we gonna split it again alter
databaseDF["type"] = "Dataset"
playlistDF["type"] = "User"


#Check for duplicates in database
databaseDF = databaseDF.drop_duplicates(ignore_index= True)

#Merge the 2 datasets together
combinedDF = pd.concat([databaseDF,playlistDF], ignore_index=True)


##Normalise and get Vectors for Dataset + User
normalised_vector = get_features_database(combinedDF,feature_weight)

##Seperate User from databaseDF 
database_vector = normalised_vector[normalised_vector["type"] == "Dataset"]
user_vector = normalised_vector[normalised_vector["type"] == "User"]

#Drop "type" column
database_vector = database_vector.drop(columns="type")
user_vector = user_vector.drop(columns="type")
databaseDF = databaseDF.drop(columns=["type"])


##Single Vector Creation
final_user_vector_list = []
for i in user_vector.columns:
    final_user_vector_list.append(user_vector[i].sum()/len(user_vector[i]))

#Putting into a vector dataframe
final_user_vector = pd.DataFrame(columns=user_vector.columns,)
final_user_vector.loc[0] = final_user_vector_list

##Generate Recc Songs
result = generate_rec(databaseDF,database_vector,final_user_vector)
result

result['id'] = result['id'].apply(lambda x: 'https://open.spotify.com/track/' + str(x))
# Convert the 'id' column to HTML hyperlinks
pd.set_option('display.max_colwidth', None)

Unnamed: 0,id,songName,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumental,liveness,valence,tempo,sim
390972,2mc3zZV0zf7PBLS6j8pspA,Hooker,0.604,0.896,5,-2.478,1,0.179,0.0464,4e-06,0.19,0.442,146.275,0.993219
187931,7MtpXnAhLKdkXjoFaK95TB,katharsis,0.555,0.828,8,-3.193,1,0.15,0.0397,0.0,0.163,0.391,144.276,0.992397
154201,3kQQdvJufvdRkXlz8PZWjP,Calm Down,0.595,0.93,1,-2.121,1,0.099,0.0385,0.0,0.253,0.462,151.996,0.992341
496599,6l7EaSqG6hIBBztg8xi1Xm,The P.A.S.E.O. (The Poem Aaron Saw Extra Ordin...,0.619,0.831,11,-3.836,1,0.0772,0.0715,0.0,0.179,0.392,148.342,0.992332
89895,2mQ7KmAI8gFLH3baaDXc6o,How Deep Is Your Love - DJ Snake Remix,0.616,0.903,9,-0.475,1,0.0938,0.015,0.00102,0.141,0.446,149.93,0.992305


### Weightage + Genre

In [10]:
def generate_rec(databaseDF, database_vector, user_vector, genre_top3, useSpotifyGenre):
    #Cosine Similarity
    databaseDF["sim"] = cosine_similarity(database_vector,user_vector)

    #Drop rows with different genre from top 3 genres
    if len(genre_top3) == 1:
        databaseDF = databaseDF[(databaseDF["genre"] == genre_top3[0])]
    elif len(genre_top3) == 2:
        databaseDF = databaseDF[(databaseDF["genre"] == genre_top3[0]) | (databaseDF["genre"] == genre_top3[1])]
    else:
        databaseDF = databaseDF[(databaseDF["genre"] == genre_top3[0]) | (databaseDF["genre"] == genre_top3[1]) | (databaseDF["genre"] == genre_top3[2])]
    #Sort and recommend top 5 with same genres
    rec_top5 = databaseDF.sort_values('sim',ascending = False).head()
    return rec_top5




from bs4 import BeautifulSoup
from selenium import webdriver



##GENRE ENGINEERING
genre_dict = {
    "pop": ["pop punk", "pop rap", "pop rock", "synthpop","dance-pop", "singer-songwriter"],
    "rock": ["psych rock","power pop","alt rock", "hard rock","emo","blues", "folk", "garage", "pop rock", "acoustic", "rock and roll", "singer-songwriter", ""],
    "metal": ["metalcore","black metal", "death metal", "heavy metal", "goth", "gothic metal", "groove", "punk", "grindcore", "industrial", "alternative metal"],
    "house": ["chicago house", "progressive house", "deep house", "edm", "electro house", "future house", "tech house"],
    "country": ["country pop", "country blues", "country rap", "country rock"],
    "r&b": ["soul","dance","contemporary r&b", "alternative r&b", "soul jazz", "r&b and soul","rhythm and blues"],
    "techno": ["detroit techno","minimal techno", "hardcore"],
    "electro": ["drum and bass","dubstep","electronic", "club", "electro house", "electronic dance music", "electro swing", "electropop"],
    "hip hop": ["rap", "funk", "alternative hip hop", "rap rock", "rap metal", "jazz rap"],
    "k-pop": [],
    "indie": ["indie rock", "indie pop"]
}

def generaliseGenre(genre):
    for key,value in genre_dict.items():
        ##if genre scrapped is same as generic genre(key), return genre
        if key == genre:
            return key
        else:
            ##iterate through all the genres (value) in the dict to find suitable generic genre
            for alt_genre in value:
                if alt_genre == genre:
                    return key
    ##return none if no genre found
    return None

def scrap_genre(artistName):
    ##URL of website to scrap
    url = f"https://www.getgenre.com/artist/{artistName}"


    ##CHANGE PATH TO PATH OF DOWNLOADED CHROMEDRIVER
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=options)
    
    # WAIT FOR BUTTON TAG TO LOAD BEFORE CONTINUING
    driver.implicitly_wait(20)
    driver.get(url)
    
    try:
        classtext = "MuiButtonBase-root MuiButton-root MuiButton-outlined MuiButton-outlinedPrimary MuiButton-sizeMedium MuiButton-outlinedSizeMedium MuiButton-root MuiButton-outlined MuiButton-outlinedPrimary MuiButton-sizeMedium MuiButton-outlinedSizeMedium css-x3ahaf"
        button = driver.find_element(By.XPATH, '//*[@id="genres-text"]/div/button')
        time.sleep(5)
        # Parse the HTML content of the button with BS4
        html = button.get_attribute("outerHTML")
        driver.close()
        driver.quit()

        #Use BS4 to read HTML segement and extract Genre Text
        soup = BeautifulSoup(html, "html.parser")
        # Find html button tag with class to obtained genre text
        genre = soup.find('button', class_=classtext).get_text().lower()
        result = generaliseGenre(genre)
        return result
    except:
        driver.close()
        driver.quit()
        pass




##Import User Playlist
userDF = pd.read_csv("User_Playlist.csv")
userDF = userDF.drop(columns=["Unnamed: 0"])
##Import Dataset
df = pd.read_csv("spotify_dataset_eda.csv")
df = df.drop(columns=["Unnamed: 0"])
df = df.dropna()

#Extract all genres from dataset
datasetGenres_list = []
for i in df["genre"]:
    if i not in datasetGenres_list:
        datasetGenres_list.append(i)

# Flag to indicate whether need to use Method 3 or not
useSpotifyGenre = False

song_genres_list = []
##Find Playlist Songs in Database and replace genre

for index, row in userDF.iterrows():
    id = row["id"]
    artistName = row["artist"]

    ##List to include all genres from one song
    one_song_genre = []

    ##variable to check if song in database
    dataset_genre = df[df["id"] == id]

    if not dataset_genre.empty:
        ##METHOD1 - Copying Dataset genre to User Dataset
        one_song_genre = dataset_genre["genre"].to_list()
    else:
        ##METHOD2 - Web Scraping
        genre = scrap_genre(artistName)
        if genre != "":
            one_song_genre.append(genre)

    ##CHECKS IF METHOD 1/2 managed to obtain genre data
    song_genres_list.append(one_song_genre)

userDF["genre"] = song_genres_list
userDF





# (2) Find top 3 genres in playlist
exploded_df = userDF.explode('genre')

freq = exploded_df["genre"].value_counts(sort=True)

genres_count = dict(freq.head(3))
genre_top3 = list(genres_count.keys())
        
print("Top 3 Genre's of User Playlist:" + str(genre_top3))
exploded_df


#Extract Database CSV
databaseDF = pd.read_csv("spotify_dataset_eda.csv", encoding="utf_8_sig")
databaseDF = databaseDF.drop(columns=['Unnamed: 0'])
#Selects columns that we want
databaseDF = databaseDF[["id","artist","songName", "genre","danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
        "valence","tempo",]]

#Create new feature/column of artist_songName
databaseDF["artist_songName"] = databaseDF["artist"] + "_" + databaseDF["songName"]

##Change all values to lowercase
databaseDF["artist_songName"] = databaseDF["artist_songName"].str.lower()

#Check for duplicates in database
databaseDF = databaseDF.drop_duplicates(subset=["artist_songName"],ignore_index= True)



#Extract User Playlist CSV
playlistDF = pd.read_csv("User_Playlist.csv", encoding="utf_8_sig")

playlistDF = playlistDF.drop(columns=['Unnamed: 0'])
#Reset Index
playlistDF = playlistDF.reset_index(drop = True)  
#Selects columns that we want
playlistDF = playlistDF[["id","artist","songName","danceability","energy","key","loudness","mode","speechiness","acousticness","instrumental","liveness",
        "valence","tempo",]] 

#Create new feature/column of artist_songName to remove duplicate later on
playlistDF["artist_songName"] = playlistDF["artist"] + "_" + playlistDF["songName"]
##Change all values to lowercase
playlistDF["artist_songName"] = playlistDF["artist_songName"].str.lower()

#Check for duplicates in user Playlist
playlistDF = playlistDF.drop_duplicates(subset=["artist_songName"],ignore_index= True)


#Merge user + dataset dataframe to normalise 
#Normalise takes min and max in dataframe as reference and change it to 0 and 1 respectively

#Group the dataframe as we gonna split it again alter
databaseDF["type"] = "Dataset"
playlistDF["type"] = "User"


#Merge the 2 datasets together
combinedDF = pd.concat([databaseDF,playlistDF], ignore_index=True)

#Check for duplicates between user and Database
combinedDF = combinedDF.drop_duplicates(subset=["artist_songName"], keep="last",ignore_index= True)

##Update databaseDF with removed songs from user Playlist
databaseDF = combinedDF[(combinedDF["type"]== "Dataset")]

##Normalise and get Vectors for Dataset + User
normalised_vector = get_features_database(combinedDF,feature_weight)

##Seperate User from databaseDF 
database_vector = normalised_vector[normalised_vector["type"] == "Dataset"]
user_vector = normalised_vector[normalised_vector["type"] == "User"]

#Drop "type" column
database_vector = database_vector.drop(columns="type")
user_vector = user_vector.drop(columns="type")
databaseDF = databaseDF.drop(columns=["type"])




##Single Vector Creation
final_user_vector_list = []
for i in user_vector.columns:
    final_user_vector_list.append(user_vector[i].sum()/len(user_vector[i]))

#Putting into a vector dataframe
final_user_vector = pd.DataFrame(columns=user_vector.columns,)
final_user_vector.loc[0] = final_user_vector_list


##Generate Recc Songs
result = generate_rec(databaseDF,database_vector,final_user_vector, genre_top3, useSpotifyGenre)
result

result['id'] = result['id'].apply(lambda x: 'https://open.spotify.com/track/' + str(x))
# Convert the 'id' column to HTML hyperlinks
pd.set_option('display.max_colwidth', None)

Cache folder (C:\Users\jiowe\.cache\selenium) cannot be created: Cannot create a file when that file already exists. (os error 183)
Cache folder (C:\Users\jiowe\.cache\selenium) cannot be created: Cannot create a file when that file already exists. (os error 183)
Cache folder (C:\Users\jiowe\.cache\selenium) cannot be created: Cannot create a file when that file already exists. (os error 183)
Cache folder (C:\Users\jiowe\.cache\selenium) cannot be created: Cannot create a file when that file already exists. (os error 183)
Cache folder (C:\Users\jiowe\.cache\selenium) cannot be created: Cannot create a file when that file already exists. (os error 183)
Cache folder (C:\Users\jiowe\.cache\selenium) cannot be created: Cannot create a file when that file already exists. (os error 183)
Cache folder (C:\Users\jiowe\.cache\selenium) cannot be created: Cannot create a file when that file already exists. (os error 183)
Cache folder (C:\Users\jiowe\.cache\selenium) cannot be created: Cannot crea

Top 3 Genre's of User Playlist:['metal', 'country', 'hip hop']


Unnamed: 0,id,artist,songName,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumental,liveness,valence,tempo,artist_songName,sim
60273,7aftSOGSOpSoIlVAQVBb71,Sam Hunt,Break Up In A Small Town,country,0.579,0.776,8,-5.365,1,0.173,0.0749,0.0,0.239,0.434,136.044,sam hunt_break up in a small town,0.991965
279842,4B0nJLbPzVxWt7o99SiGrO,Tyler Hubbard,35’s,country,0.535,0.79,10,-5.698,1,0.0553,0.00358,0.0,0.257,0.402,139.929,tyler hubbard_35’s,0.991776
287845,03Mfd5obNNFGKWKBWP1B9t,GHOST DATA,Adagio For Souls,hip hop,0.501,0.748,5,-5.112,1,0.0942,0.0863,0.000152,0.181,0.389,145.077,ghost data_adagio for souls,0.991731
448560,2QXO5ceopNHeYUyRTp78Wy,.sPout.,Take Me (Take Me),metal,0.583,0.786,0,-4.347,1,0.133,0.00189,1.9e-05,0.13,0.416,139.975,.spout._take me (take me),0.991686
390244,17U4zkPHe30IUnikRdJu2L,Sepultura,The Waste (with Mike Patton),metal,0.603,0.824,10,-6.067,1,0.0662,0.000687,0.0517,0.153,0.421,133.399,sepultura_the waste (with mike patton),0.991553
