## Import Data

In [1]:
import pandas as pd

# Load the two prepared tables from the working directory:
#   df_clustering     - the table the scaler and KMeans model are fitted on below
#   merged_df_cluster - the song catalogue; later cells read its 'clusters',
#                       'artist', 'song' and 'url' columns
df_clustering, merged_df_cluster = (
    pd.read_csv(csv_name)
    for csv_name in ('df_clustering.csv', 'merged_df_cluster.csv')
)

## Initialize Spotipy

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Read the Spotify API credentials from a local "key:value" text file
# (one pair per line). The file must provide the keys 'clientid' and
# 'clientsecret'; a missing key raises KeyError when the client is built.
secrets_dict = {}
# The context manager guarantees the file handle is closed, even on error.
with open("secrets.txt", "r") as secrets_file:
    for line in secrets_file:
        line = line.strip()
        # Skip blank lines and lines that are not "key:value" pairs.
        if not line or ':' not in line:
            continue
        # Split on the first ':' only, so a value that itself contains ':'
        # is kept intact; strip stray whitespace around key and value.
        key, value = line.split(':', 1)
        secrets_dict[key.strip()] = value.strip()

# Initialize SpotiPy with the client credentials
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=secrets_dict['clientid'],
        client_secret=secrets_dict['clientsecret'],
    )
)

## Create Transformer

In [3]:
from sklearn.preprocessing import StandardScaler
# Fit a standard scaler on the clustering table. The fitted `transformer`
# is reused later to scale the audio features of a single looked-up song.
transformer = StandardScaler()
transformer.fit(df_clustering)

# Scaled version of the clustering table; this is what KMeans is fitted on.
X_prep = transformer.transform(df_clustering)

# Preview the scaled values with the original column names.
scaled_preview = pd.DataFrame(X_prep, columns=df_clustering.columns)
scaled_preview.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0.283739,0.565427,0.279158,0.877677,-0.592032,-0.403138,-0.195119,-0.080821,1.401571,-0.203087,0.185143
1,0.717151,0.858514,0.748519,-0.159008,-0.476902,-0.403138,-0.90879,1.348875,-1.152776,-0.251831,0.185143
2,1.505719,0.672004,0.768338,0.136997,-0.703725,-0.403138,-0.208118,0.638043,-0.479045,-0.154848,0.185143
3,0.337916,0.996175,0.972839,0.408892,-0.567975,-0.403138,0.922837,1.236427,-0.728245,-0.304451,0.185143
4,0.27772,-1.210854,-0.627583,-0.292946,1.265507,-0.401253,-0.561704,-1.092854,-0.442908,-0.06533,0.185143


In [52]:
# Import KMeans in the cell that first uses it, so the notebook runs
# top to bottom on a fresh kernel.
from sklearn.cluster import KMeans

# Fit a 10-cluster KMeans model on the scaled table; the fixed random_state
# keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=10, random_state=1234)
kmeans.fit(X_prep)

# Cluster label for every row of the scaled table.
clusters = kmeans.predict(X_prep)

# Check the size of the clusters
pd.Series(clusters).value_counts().sort_index()

0    1343
1    2055
2     569
3     602
4     352
5      99
6     406
7     468
8    1462
9    2588
dtype: int64

## Scrape the current Top 100 songs

In [4]:
import requests
from bs4 import BeautifulSoup

# Chart page that is scraped for the current top songs.
url = 'https://www.popvortex.com/music/charts/top-100-songs.php'

# Send the HTTP request. The timeout stops the cell from hanging forever on
# a stalled connection, and raise_for_status() fails loudly on an HTTP error
# instead of silently parsing an error page into an empty chart.
page = requests.get(url, timeout=10)
page.raise_for_status()

# Parse the HTML response
soup = BeautifulSoup(page.text, 'html.parser')

# Every chart entry lives in a <p> below this container; the title and the
# artist are marked with the classes "title" and "artist".
entry_selector = "div.chart-content.col-xs-12.col-sm-8 > p"
titles = soup.select(entry_selector + " .title")
artists = soup.select(entry_selector + " .artist")

# Pair titles with artists positionally. zip() stops at the shorter list, so
# a missing artist node cannot raise an IndexError.
chart_rows = [
    (title_node.get_text(), artist_node.get_text())
    for title_node, artist_node in zip(titles, artists)
]

# Each tuple position becomes a column
top100 = pd.DataFrame(chart_rows, columns=["title", "artist"])
top100

Unnamed: 0,title,artist
0,Flowers,Miley Cyrus
1,VIBE (feat. Jimin of BTS),TAEYANG
2,"Shakira: Bzrp Music Sessions, Vol. 53",Bizarrap & Shakira
3,Anti-Hero,Taylor Swift
4,Unholy,Sam Smith & Kim Petras
...,...,...
95,No Horse To Ride,Luke Grimes
96,Goodness of God (Live),CeCe Winans
97,Handle On You,Parker McCollum
98,For Your Love,The Yardbirds


In [9]:
# Lower-case both text columns; the recommender cell lower-cases the user's
# input before comparing it against this table.
for text_column in ('title', 'artist'):
    top100[text_column] = top100[text_column].str.lower()

top100.head(2)

Unnamed: 0,title,artist
0,flowers,miley cyrus
1,vibe (feat. jimin of bts),taeyang


In [98]:
import time
from pandas import json_normalize
from sklearn.cluster import KMeans

def _pick_from_top100(chart, artist, title):
    """Pick a random chart entry when the user's song or artist is charted.

    Parameters
    ----------
    chart : pandas.DataFrame
        Chart table with lower-cased 'title' and 'artist' columns.
    artist, title : str
        Lower-cased user input.

    Returns
    -------
    pandas.Series or None
        One randomly sampled chart row, or None when neither the title nor
        the artist appears in the chart.
    """
    # Compare each input against its own column only, so a title can never
    # accidentally match an artist name (or the other way round).
    title_hit = chart['title'].eq(title)
    artist_hit = chart['artist'].eq(artist)
    if not (title_hit.any() or artist_hit.any()):
        return None

    # Do not recommend the song the user is already listening to, unless it
    # is the only entry available.
    others = chart[~title_hit]
    pool = chart if others.empty else others
    return pool.sample(n=1).iloc[0]


def _recommend_from_cluster(artist, title):
    """Recommend a catalogue song from the same cluster as the user's song.

    Looks the track up on Spotify, scales its audio features with the fitted
    `transformer`, predicts its cluster with the fitted `kmeans` model and
    samples one row of `merged_df_cluster` from that cluster.

    Returns
    -------
    pandas.Series or None
        Row with 'artist', 'song' and 'url', or None when Spotify returns no
        matching track, no audio features, or the cluster has no songs.

    Raises
    ------
    Exception
        Errors raised by the Spotify client or by scikit-learn are not
        handled here; the caller decides how to report them.
    """
    # Step 1 - Search the track. Spotify's documented query syntax separates
    # field filters with spaces.
    results = sp.search(q='track:' + title + ' artist:' + artist, type='track', limit=1)

    # Step 2 - Find the uri of that song (no hit means nothing to recommend)
    items = results['tracks']['items']
    if not items:
        return None
    song_uri = items[0]['uri']

    # Step 3 - Fetch the audio features; the entry is None when Spotify has
    # no analysis for the track.
    raw_features = sp.audio_features(song_uri)
    if not raw_features or raw_features[0] is None:
        return None

    # Step 4 - Get a dataframe holding only the columns used for clustering
    audio_features = json_normalize(raw_features)
    audio_features = audio_features.drop(["key","mode", 'id','type','track_href','uri', 'analysis_url'], axis = 1)

    # Step 5 - Scale audio_features
    scale_audio_features = transformer.transform(audio_features)

    # Step 6 - Find the cluster (a single song gives a single label)
    cluster_id = kmeans.predict(scale_audio_features)[0]

    # Step 7 - Choose a random song from the cluster
    same_cluster = merged_df_cluster[merged_df_cluster['clusters'] == cluster_id]
    if same_cluster.empty:
        return None
    return same_cluster[['artist', 'song', 'url']].sample(n=1).iloc[0]


# Ask the user what they are listening to. Input is stripped and lower-cased
# because the chart table is compared in lower case.
artist = input("Please enter the artist you are listening to? ").strip().lower()
title = input("What song are you listening to from this artist? ").strip().lower()
print("\n")

chart_pick = _pick_from_top100(top100, artist, title)

if chart_pick is not None:
    # The song or the artist is currently charted: suggest another chart song.
    print("Try this song next!")
    print("\n")
    print(str(chart_pick['title']).title() + " by " + str(chart_pick['artist']).title())
else:
    # Not charted: fall back to the audio-feature clusters.
    try:
        song_rec = _recommend_from_cluster(artist, title)
    except Exception as err:
        # Service or model failure (network, credentials, feature mismatch).
        # Report it instead of hiding it; `Exception` deliberately leaves
        # KeyboardInterrupt alone so the cell can still be interrupted.
        print("Sorry I cannot recommend you a song at the moment")
        print("(" + type(err).__name__ + ": " + str(err) + ")")
    else:
        if song_rec is None:
            print('\nPlease try again. Maybe you misspelled something?')
        else:
            print("We recommend the following song " + str(song_rec['song']) + " by " + str(song_rec['artist']) + "." + "\n" + "Listen to it here: " + "\n" + str(song_rec['url']) + " ")

Please enter the artist you are listening to? Sofiane Pamart
What song are you listening to from this artist? Borealis


We recommend the following song Hydrangea by Knowmadic.
Listen to it here: 
https://open.spotify.com/track/3z3QKouio76OV0VVtwAXdy 
