# Building the reconmender

# 0. Preparation

#### Importing libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from random import randint
from time import sleep
from itertools import islice
from pandas import json_normalize

In [3]:
from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler
from matplotlib.lines import Line2D

In [4]:
import random
import pickle
from sklearn.decomposition import PCA

#### Importing our music datafile

In [5]:
df_list = []
for i in range(1,101):
    df_list.append(pd.read_csv('./Data/part'+str(i)+'.csv'))
df_songs = pd.concat(df_list)
display(df_songs.head(1))
df_songs.shape

Unnamed: 0,uri,title,artist_name,artist_id,album_id,album_name,length,explicit,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,spotify:track:7zgqtptZvhf8GEmdsM2vp2,...Ready For It?,Taylor Swift,06HL4z0CvFAxyc27GXpf02,0HG8fMDhvN2tH5uPHFsyZP,...Ready For It?,208198,False,0,0.615,0.779,2,-6.454,1,0.135,0.0665,0.0,0.155,0.453,160.0


(10000, 20)

In [6]:
df_songs.reset_index(drop = True)

Unnamed: 0,uri,title,artist_name,artist_id,album_id,album_name,length,explicit,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,spotify:track:7zgqtptZvhf8GEmdsM2vp2,...Ready For It?,Taylor Swift,06HL4z0CvFAxyc27GXpf02,0HG8fMDhvN2tH5uPHFsyZP,...Ready For It?,208198,False,0,0.615,0.779,2,-6.454,1,0.1350,0.06650,0.000000,0.1550,0.453,160.000
1,spotify:track:4Vxu50qVrQcycjRyJQaZLC,Life Changes,Thomas Rhett,6x2LnllRG5uGarZMsD4iO8,4w5Jvreahp3yvLqc4vCr9I,Life Changes,190226,False,63,0.687,0.845,7,-4.370,1,0.0576,0.10000,0.000000,0.0452,0.809,87.972
2,spotify:track:6b8Be6ljOzmkOmFslEb23P,24K Magic,Bruno Mars,0du5cEVh5yTK9QJze8zA0C,4PgleR09JVnm3zY1fW3XBA,24K Magic,225983,False,81,0.818,0.803,1,-4.282,1,0.0797,0.03400,0.000000,0.1530,0.632,106.970
3,spotify:track:0afhq8XCExXpqazXczTSve,Galway Girl,Ed Sheeran,6eUKZXaKkcviH0Ku9w2n3V,3T4tUhGYeRNVUGevb0wThu,÷ (Deluxe),170826,False,80,0.624,0.876,9,-3.374,1,0.1000,0.07350,0.000000,0.3270,0.781,99.943
4,spotify:track:1HNkqx9Ahdgi1Ixy2xkKkL,Photograph,Ed Sheeran,6eUKZXaKkcviH0Ku9w2n3V,1xn54DMo2qIqBuMqHtUsFd,x (Deluxe Edition),258986,False,86,0.614,0.379,4,-10.480,1,0.0476,0.60700,0.000464,0.0986,0.201,107.989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,spotify:track:4UFlPCB4THnQ9TlPHqIQow,Funeral For A Friend / Love Lies Bleeding,Elton John,3PhoLpVuITZKcymswpck5b,6Gp6qSE1ywXCLjal5NUWUE,Goodbye Yellow Brick Road (40th Anniversary Ce...,666571,False,0,0.410,0.761,9,-8.507,0,0.0465,0.01980,0.084700,0.2470,0.193,138.712
9996,spotify:track:5pSSEkT0963muzzIjsVkrs,Fool's Overture,Supertramp,3JsMj0DEzyWc0VDlHuy9Bx,4X87hQ57jTYQTcYTaJWK5w,Even In The Quietest Moments,652560,False,55,0.406,0.306,3,-10.482,1,0.0372,0.31300,0.007900,0.0727,0.073,135.272
9997,spotify:track:7gC6Rbllqf1yXNC02e5jz2,Heart of the Sunrise - 2003 Remaster,Yes,7AC976RDJzL2asmZuz7qil,0dZF93WHyOhTWjz5EWM7yG,Fragile (Deluxe Edition),634440,False,47,0.362,0.507,1,-11.229,1,0.0394,0.01740,0.216000,0.1130,0.456,146.641
9998,spotify:track:6Ff77WXC58MkhLE5A1qgY1,Venus And Mars / Rock Show / Jet - Live / Rema...,Paul McCartney,4STHEaNw4mPZ2tzheohgXB,2GVLsiEMDZhxOMATIPBK4d,Wings Over America (Remastered),620746,False,0,0.331,0.733,2,-8.671,1,0.0468,0.08870,0.001740,0.9470,0.380,128.512


#### Importing our scaler, classifier and pca

In [7]:
kmean = pickle.load(open('kmeans10.p','rb'))
kmean

In [8]:
pca = pickle.load(open('pca.p','rb'))
pca

In [9]:
scaler = pickle.load(open('scaler.p','rb'))
scaler

# Task

1. Check whether or not the song is in the Billboard Hot 200.
2. Collect the audio features from the Spotify API.
After that, you want to send the Spotify audio features of the submitted song to the clustering model, which should return a cluster number.

# 1 Check if song is in the top 100 song list

In [10]:
def check_100(song,artist):
    # Getting content of the page
    url = "http://www.popvortex.com/music/charts/top-100-songs.php"
    try:
        response = requests.get(url, headers = {"Accept-Language": "en-US"})
    except:
        # Stopping if we get a bad response
        print('Error while connecting to popvortex')
        return False, [song,artist]
    
    # Parsing
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Creating top 100 list
    titles = []
    artists = []
    
    for t in range(len(soup.select('.title'))):
        titles.append(soup.select('.title')[t].get_text())
    
    for a in range(len(soup.select('.artist'))):
        artists.append(soup.select('.artist')[a].get_text())
    
    # Creating dataframe
    songs = pd.DataFrame({"title":titles,
                       "artist":artists,
                      })
    
    # Checking if input is in dataframe
    userentry = songs[(songs.title == song) & (songs.artist == artist)].index
    
    # Responding
    if userentry.size == 0:
        return False, [song,artist]
    else:
        # Choose a random number except the one from the userinput
        reconmendation = random.randint(0,99)
        # If the random number is the the same as the userinput we have to redo the random
        while reconmendation == userentry:
           reconmendation = random.randint(0,99)
        # returning True and the song
        return True, [songs.loc[reconmendation,'title'],songs.artist[reconmendation]]

In [11]:
print(check_100('Anti-Hero','Taylor Swift'))

(True, ["Something in the Orange (Z&E's Version)", 'Zach Bryan'])


In [12]:
print(check_100('Not a song','Donald Trump'))

(False, ['Not a song', 'Donald Trump'])


# 2. Collect the audio features from the Spotify API.

#### Getting access to the spotify API

In [13]:
def access_get():
    secrets_file = open("Access.txt","r")
    string = secrets_file.read()
    secrets_dict={}
    for line in string.split('\n'):
        if len(line) > 0:
            secrets_dict[line.split(':')[0]]=line.split(':')[1]
    # Initialising connection
    sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=secrets_dict['clientid'],
                                                           client_secret=secrets_dict['clientsecret']))
    return sp

In [14]:
sp = access_get()
sp

<spotipy.client.Spotify at 0x1e335b0d7f0>

#### First we need to get the song uri to be able to extract the features

In [15]:
def find_song_uri(song,artist):
    try:
        # Searching for the song
        result = sp.search(q=song+' '+artist, limit=1)
        uri = result['tracks']['items'][0]['uri']
        return uri
    except:
        # Returning a placeholder in case of a connection error
        print('Connection error while finding uri')
        return find_song_uri(song,artist)

In [16]:
find_song_uri('Die Ärzte','Westerland')

'spotify:track:5aWpvFnByyWodgqYlC9kha'

#### Now getting the features

In [17]:
def get_features(uri):
    try:
        return sp.audio_features(uri)
    except:
        sleep(randint(1,2)) # respectful nap in case connection breaks
        print('Error occured while getting feature names')
        return get_features(uri)

In [18]:
features = get_features('spotify:track:5aWpvFnByyWodgqYlC9kha')

In [19]:
features[0]['danceability']

0.602

#### Making prediction based on features

First we have to transform the features into a useful format

In [20]:
def flatten_features(features):
    featurelist = []
    try:
        featurelist = [[
        features[0]['danceability'],
        features[0]['energy'],
        features[0]['key'],
        features[0]['loudness'],
        features[0]['mode'],
        features[0]['speechiness'],
        features[0]['acousticness'],
        features[0]['instrumentalness'],
        features[0]['liveness'],
        features[0]['valence'],
        features[0]['tempo']
            ]]
    except:
        featurelist.append([0,0,0,0,0,0,0,0,0,0,0])
    return pd.DataFrame(featurelist, columns = ['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo'])

In [21]:
features_flat = flatten_features(features)

In [22]:
def prediction(features,kmodel):
    # Normalize
    features_scaled = pd.DataFrame(scaler.transform(features), columns=features.columns)
    # Apply PCA
    features_scaled_pca = pca.transform(features_scaled)
    # Predict
    cluster = kmodel.predict(features_scaled_pca)
    
    return cluster[0]

In [23]:
prediction(features_flat,kmean)

0

# 3. Buildung the reconmender
1. As an input the reconmender will take a song title and an artist
2. If the song is in the top 100 it will return a random song from that list
3. If not, it will get the audio features from spotify and predict a class, and reconmend a song from there

#### 0. Classify df_songs
To make a reconmendation, we first have to classify our existing songs

In [24]:
# pd.DataFrame(df_songs.iloc[1][['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo']]).T

In [46]:
def classifier(kmodel):
    pred_list = []
    # Making predictions
    for rownum in df_songs.index:
        # do something with a row
        pred_list.append(prediction(pd.DataFrame(df_songs.iloc[rownum][['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo']]).T,kmodel))
    df_songs['class'] = pred_list

In [57]:
# classifier(kmean)

In [27]:
# df_songs.head(10)

#### Saving to csv
Getting all predictions took a while. I will save the dataframe to csv, so that I dont have to redo it.

In [28]:
# df_songs.to_csv('./Data/songspredicted.csv', index=False)

In [29]:
songs_predicted = pd.read_csv('./Data/songspredicted.csv')

In [30]:
songs_predicted['class'].value_counts()

0    3700
3    2300
2    1600
5    1000
6     900
7     200
1     200
8     100
Name: class, dtype: int64

## The Reconmender

In [82]:
def reconmender(kmodel,songs_predicted):
    # Getting the input
    song = input('Enter a song:   ')
    artist = input('Enter an artist:   ')
    
    # We check if the song is in the top 100
    boolean, reconmendation = check_100(song,artist)
    if boolean == True:
        # If the song is in the top 100 we return the reconmendation from there
        print('\n\n\n''You might like "' + reconmendation[0] +'" from "' + reconmendation[1] + '" ! One of the top 100 songs on popvortex!')
        return [reconmendation[0],reconmendation[1]]

    
    # If not we continue with spotify
    uri = find_song_uri(song,artist)
#     print(uri)
    features = get_features(uri)
    features = flatten_features(features)
    pred = prediction(features,kmodel)
    
    # We specify from which songs there will be a selection:
    # We have to take care if the prediction class exists:
    if pred in songs_predicted['class'].unique():
        selection = songs_predicted[songs_predicted['class'] == pred].reset_index(drop = True)
    else:
        print('No similar song in the database!')
        return
    
    # Selecting a random song from the selection
    rec_nr = random.randint(0,len(selection))
#     print(rec_nr)
    title_r = selection.iloc[rec_nr]['title']
    artist_r = selection.iloc[rec_nr]['artist_name']
    
    print('\n\n\n''You might like "' + title_r +'" from "' + artist_r + '" !\nThis song has similar audiofeatures to your selcetion!')

# Testing

In [32]:
reconmender(kmean,songs_predicted)

Enter a song:   Nemo
Enter an artist:   Nightwish
spotify:track:5OF7nhjVV0dmngqYjDrZGw



You might like "Without Me" from "Eminem" !
This song has similar audiofeatures to your selcetion!


# Conclusion

Thge tests did not feel like a good reconmendation, I will try again with a higher number of K

In [33]:
kmean30 = pickle.load(open('kmeans30.p','rb'))
kmean30

#### We have to reclassify our dataset

In [56]:
# classifier(kmean30)

In [36]:
# df_songs.to_csv('./Data/songspredicted30.csv', index=False)
songs_predicted30 = pd.read_csv('./Data/songspredicted30.csv')

In [37]:
# songs_predicted30['class'].value_counts()

# Trying improved reconmender

In [38]:
reconmender(kmean30,songs_predicted30)

Enter a song:   Master of Puppets
Enter an artist:   Metallica
spotify:track:54bm2e3tk8cliUz3VSdCPZ



You might like "2 Cigarettes" from "Jack & Jack" !
This song has similar audiofeatures to your selcetion!


# More classes
The songs got much more similar we try to get with 45 classes

In [39]:
kmean45 = pickle.load(open('kmeans45.p','rb'))
kmean45

In [47]:
classifier(kmean45)

In [48]:
df_songs.to_csv('./Data/songspredicted45.csv', index=False)
songs_predicted45 = pd.read_csv('./Data/songspredicted45.csv')

# Final version

#### Testing again

In [84]:
reconmender(kmean45,songs_predicted45)

Enter a song:   Baby Shark
Enter an artist:    



You might like "Can't Believe" from "Staind" !
This song has similar audiofeatures to your selcetion!


## Why it did not work in class:

In [88]:
len(songs_predicted45['class'].unique())

26

Apparently while classifying some classes were omitted, if the new song lies within the classes, that do not appear in the dataframe, the selection does not work, since it obviously does not find songs of the same class.

So either I would stick with the lower amount of classes or accept that it could not assign the input song to an existing group and print out a corresponding message.

I put in an if check for this case and print out, that no similar song was found.