# Building the recommender

# 0. Preparation

#### Importing libraries

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from random import randint
from time import sleep
from itertools import islice
from pandas import json_normalize

In [None]:
from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler
from matplotlib.lines import Line2D

In [None]:
import random
import pickle
from sklearn.decomposition import PCA

#### Importing our music datafile

In [None]:
# Read the 100 CSV parts (part1.csv ... part100.csv) and stack them into one frame.
part_paths = ['./Data/part'+str(i)+'.csv' for i in range(1,101)]
df_list = [pd.read_csv(path) for path in part_paths]
df_songs = pd.concat(df_list)
# Show the first row and the overall shape as a quick sanity check.
display(df_songs.head(1))
df_songs.shape

In [None]:
# reset_index returns a new frame, so the result must be assigned back;
# otherwise df_songs keeps the repeated per-part index produced by pd.concat.
df_songs = df_songs.reset_index(drop = True)
df_songs

#### Importing our scaler, classifier and pca

In [None]:
# Load the pickled KMeans model (presumably k=10, judging by the file name).
# The context manager closes the file handle after loading.
# NOTE: pickle can execute arbitrary code on load; only unpickle trusted files.
with open('kmeans10.p','rb') as model_file:
    kmean = pickle.load(model_file)
kmean

In [None]:
# Load the pickled PCA transformer; the context manager closes the file handle.
# NOTE: pickle can execute arbitrary code on load; only unpickle trusted files.
with open('pca.p','rb') as pca_file:
    pca = pickle.load(pca_file)
pca

In [None]:
# Load the pickled scaler; the context manager closes the file handle.
# NOTE: pickle can execute arbitrary code on load; only unpickle trusted files.
with open('scaler.p','rb') as scaler_file:
    scaler = pickle.load(scaler_file)
scaler

# Task

1. Check whether or not the song is in the current top 100 chart.
2. Collect the audio features from the Spotify API.
After that, you want to send the Spotify audio features of the submitted song to the clustering model, which should return a cluster number.

# 1 Check if song is in the top 100 song list

In [None]:
def check_100(song,artist):
    """Check whether a song is on the popvortex top-100 chart.

    If the song is on the chart, a different chart entry is picked at random
    as the recommendation.

    Args:
        song: Song title to look for.
        artist: Artist name to look for.

    Returns:
        A tuple ``(found, [title, artist])``. When the song is on the chart,
        ``found`` is True and the list holds a randomly chosen *other* chart
        entry. Otherwise ``found`` is False and the list echoes the input.
        False is also returned when the page cannot be fetched or parsed, or
        when the chart holds no entry other than the requested one.
    """
    url = "http://www.popvortex.com/music/charts/top-100-songs.php"
    try:
        # A timeout keeps the call from hanging forever on a stalled connection.
        response = requests.get(url, headers = {"Accept-Language": "en-US"}, timeout = 10)
        response.raise_for_status()
    except requests.RequestException:
        print('Error while connecting to popvortex')
        return False, [song,artist]

    # Parsing: run each CSS selector once instead of once per loop iteration
    soup = BeautifulSoup(response.content, 'html.parser')
    titles = [tag.get_text() for tag in soup.select('.title')]
    artists = [tag.get_text() for tag in soup.select('.artist')]

    # Titles and artists are paired by position; unequal counts mean the page
    # layout is not what we expect, so pairing them would be unreliable.
    if len(titles) != len(artists):
        print('Error while parsing the popvortex chart')
        return False, [song,artist]
    chart = list(zip(titles, artists))

    def _normalise(text):
        # Ignore surrounding whitespace and letter case when comparing
        return text.strip().casefold()

    wanted = (_normalise(song), _normalise(artist))
    matches = {
        position
        for position, (title, artist_name) in enumerate(chart)
        if (_normalise(title), _normalise(artist_name)) == wanted
    }
    if not matches:
        return False, [song,artist]

    # Recommend any chart entry except the one(s) matching the user input.
    # Drawing from the actual chart positions avoids assuming exactly 100 rows.
    candidates = [position for position in range(len(chart)) if position not in matches]
    if not candidates:
        return False, [song,artist]

    rec_title, rec_artist = chart[random.choice(candidates)]
    return True, [rec_title, rec_artist]

In [None]:
# Example call: prints the (found, [title, artist]) tuple returned by check_100.
print(check_100('Anti-Hero','Taylor Swift'))

In [None]:
# Example call with a made-up song: when nothing on the chart matches,
# check_100 returns False together with the input echoed back.
print(check_100('Not a song','Donald Trump'))

# 2. Collect the audio features from the Spotify API.

#### Getting access to the spotify API

In [None]:
def access_get():
    """Create a Spotify client from the credentials stored in ``Access.txt``.

    The file is read as one ``key:value`` pair per line and must provide the
    keys ``clientid`` and ``clientsecret``. Only the first ':' on a line
    separates key from value, so values may themselves contain ':'. Blank
    lines and lines without ':' are skipped, and surrounding whitespace is
    stripped from keys and values.

    Returns:
        A ``spotipy.Spotify`` client using client-credentials authentication.

    Raises:
        OSError: If ``Access.txt`` cannot be opened.
        KeyError: If ``clientid`` or ``clientsecret`` is missing from the file.
    """
    secrets_dict = {}
    # The context manager closes the file even if parsing fails
    with open("Access.txt","r") as secrets_file:
        for line in secrets_file:
            key, separator, value = line.partition(':')
            if not separator:
                continue
            secrets_dict[key.strip()] = value.strip()
    # Initialising connection
    auth_manager = SpotifyClientCredentials(client_id=secrets_dict['clientid'],
                                            client_secret=secrets_dict['clientsecret'])
    return spotipy.Spotify(auth_manager=auth_manager)

In [None]:
# Create the module-level Spotify client; find_song_uri and get_features read it as a global.
sp = access_get()
sp

#### First we need to get the song uri to be able to extract the features

In [None]:
def find_song_uri(song,artist,max_retries=3):
    """Look up the Spotify URI of the best search match for a song.

    Uses the module-level Spotify client ``sp``.

    Args:
        song: Song title.
        artist: Artist name.
        max_retries: Maximum number of search attempts when the request fails.

    Returns:
        The URI of the first track returned by the search.

    Raises:
        LookupError: If the search succeeds but returns no track.
        ConnectionError: If every search attempt fails.
    """
    query = song+' '+artist
    last_error = None
    for _ in range(max_retries):
        try:
            result = sp.search(q=query, limit=1)
        except (spotipy.SpotifyException, requests.RequestException) as err:
            # Bounded retry instead of unbounded recursion
            print('Connection error while finding uri')
            last_error = err
            continue
        items = result['tracks']['items']
        if not items:
            # An empty result is not a connection problem; retrying would not help
            raise LookupError('No Spotify track found for "' + query + '"')
        return items[0]['uri']
    raise ConnectionError('Could not reach Spotify while finding uri') from last_error

In [None]:
# Example lookup; arguments are (song, artist), matching the function signature.
find_song_uri('Westerland','Die Ärzte')

#### Now getting the features

In [None]:
def get_features(uri,max_retries=3):
    """Fetch the Spotify audio features for a track URI.

    Uses the module-level Spotify client ``sp``.

    Args:
        uri: Spotify track URI.
        max_retries: Maximum number of attempts when the request fails.

    Returns:
        Whatever ``sp.audio_features(uri)`` returns; the rest of this notebook
        indexes it as a list with one feature dict per requested track.

    Raises:
        ConnectionError: If every attempt fails.
    """
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            return sp.audio_features(uri)
        except (spotipy.SpotifyException, requests.RequestException) as err:
            last_error = err
            print('Error occured while getting feature names')
            if attempt < max_retries:
                sleep(randint(1,2)) # respectful nap before the next attempt
    raise ConnectionError('Could not reach Spotify while getting features') from last_error

In [None]:
# Fetch the audio features of one hard-coded track URI as sample input for the cells below.
features = get_features('spotify:track:5aWpvFnByyWodgqYlC9kha')

In [None]:
# The result is indexed as a list of dicts; inspect one feature of the first entry.
features[0]['danceability']

#### Making prediction based on features

First we have to transform the features into a useful format

In [None]:
def flatten_features(features):
    """Turn a Spotify audio-features response into a one-row DataFrame.

    Args:
        features: Sequence whose first element is a dict of audio features.

    Returns:
        A DataFrame with exactly one row and the eleven feature columns in a
        fixed order. If the features cannot be read (``features`` is None or
        empty, its first element is None, or a key is missing), the row is
        filled with zeros.
    """
    columns = ['danceability','energy','key','loudness','mode','speechiness',
               'acousticness','instrumentalness','liveness','valence','tempo']
    try:
        row = [features[0][name] for name in columns]
    except (TypeError, KeyError, IndexError):
        # Only lookup failures trigger the zero fallback; unrelated errors
        # (e.g. KeyboardInterrupt) are no longer swallowed.
        row = [0] * len(columns)
    return pd.DataFrame([row], columns = columns)

In [None]:
# Convert the sample response into the one-row DataFrame that prediction() takes.
features_flat = flatten_features(features)

In [None]:
def prediction(features,kmodel):
    """Predict the cluster of a song from its audio features.

    The features are scaled with the module-level ``scaler``, projected with
    the module-level ``pca`` and then passed to ``kmodel.predict``.

    Args:
        features: DataFrame of audio features; only the first row's label is returned.
        kmodel: Fitted clustering model exposing ``predict``.

    Returns:
        The first element of the model's predicted labels.
    """
    # Scale, keeping the original column names on the scaled frame
    scaled_values = scaler.transform(features)
    scaled_frame = pd.DataFrame(scaled_values, columns=features.columns)
    # Project onto the principal components
    reduced = pca.transform(scaled_frame)
    # Predict and unwrap the single label
    labels = kmodel.predict(reduced)
    return labels[0]

In [None]:
# Cluster label of the sample track according to the model loaded as kmean.
prediction(features_flat,kmean)

# 3. Building the recommender
1. As input, the recommender takes a song title and an artist.
2. If the song is in the top 100, it returns a random song from that list.
3. If not, it gets the audio features from Spotify, predicts a class, and recommends a song from that class.

#### 0. Classify df_songs
To make a recommendation, we first have to classify our existing songs.

In [None]:
# pd.DataFrame(df_songs.iloc[1][['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo']]).T

In [None]:
def classifier(kmodel):
    """Assign a cluster label to every song in the module-level ``df_songs``.

    All rows are scaled with the module-level ``scaler``, projected with the
    module-level ``pca`` and predicted in one batch; the labels are written
    to a new ``class`` column of ``df_songs`` (modified in place).

    Args:
        kmodel: Fitted clustering model exposing ``predict``.
    """
    feature_columns = ['danceability','energy','key','loudness','mode','speechiness',
                       'acousticness','instrumentalness','liveness','valence','tempo']
    if df_songs.empty:
        # Nothing to classify; still create the column
        df_songs['class'] = []
        return

    # Selecting columns from the whole frame is purely positional, so a
    # non-unique index (as left behind by pd.concat) cannot pick wrong rows,
    # and one batch call replaces one predict call per row.
    song_features = df_songs[feature_columns]
    scaled = pd.DataFrame(scaler.transform(song_features), columns=feature_columns)
    reduced = pca.transform(scaled)
    # Assigning an array sets the values row by row in order, independent of the index
    df_songs['class'] = kmodel.predict(reduced)

In [None]:
# classifier(kmean)

In [None]:
# df_songs.head(10)

#### Saving to csv
Getting all predictions took a while, so I save the dataframe to CSV to avoid having to recompute it.

In [None]:
# df_songs.to_csv('./Data/songspredicted.csv', index=False)

In [None]:
# Load the classified songs from the CSV written by the (commented-out) to_csv call above.
songs_predicted = pd.read_csv('./Data/songspredicted.csv')

In [None]:
# NOTE: `reconmender` is only defined further down in this notebook, so calling
# it here raises NameError on a fresh top-to-bottom run. Run it after the
# definition cell instead.
# reconmender(kmean,songs_predicted)

In [None]:
# Number of songs per predicted class.
songs_predicted['class'].value_counts()

## The Recommender

In [None]:
def reconmender(kmodel,songs_predicted):
    """Interactively recommend a song for a title and artist typed by the user.

    If the entered song is on the top-100 chart (see ``check_100``), another
    chart entry is recommended. Otherwise the song's Spotify audio features
    are assigned to a cluster, and a random song from the same cluster in
    ``songs_predicted`` is recommended.

    Args:
        kmodel: Fitted clustering model passed on to ``prediction``.
        songs_predicted: DataFrame with the columns ``class``, ``title`` and
            ``artist_name``; ``class`` must come from the same model.

    Returns:
        ``[title, artist]`` of the recommended song, or None when no song of
        the predicted class exists in ``songs_predicted``.
    """
    # Getting the input
    song = input('Enter a song:   ')
    artist = input('Enter an artist:   ')

    # We check if the song is in the top 100
    in_top_100, reconmendation = check_100(song,artist)
    if in_top_100:
        # If the song is in the top 100 we return the reconmendation from there
        print('\n\n\n''You might like "' + reconmendation[0] +'" from "' + reconmendation[1] + '" ! One of the top 100 songs on popvortex!')
        return [reconmendation[0],reconmendation[1]]

    # If not we continue with spotify
    uri = find_song_uri(song,artist)
    features = flatten_features(get_features(uri))
    pred = prediction(features,kmodel)

    # The predicted class may not occur in the dataset at all
    selection = songs_predicted[songs_predicted['class'] == pred].reset_index(drop = True)
    if selection.empty:
        print('No similar song in the database!')
        return None

    # randrange excludes the upper bound, so the position is always valid
    # (randint(0, len(selection)) could return len(selection) and overflow iloc)
    rec_nr = random.randrange(len(selection))
    title_r = selection.iloc[rec_nr]['title']
    artist_r = selection.iloc[rec_nr]['artist_name']

    print('\n\n\n''You might like "' + title_r +'" from "' + artist_r + '" !\nThis song has similar audiofeatures to your selcetion!')
    # Return the pick here as well, matching the top-100 branch
    return [title_r, artist_r]

# Testing

In [None]:
# reconmender(kmean,songs_predicted)

# Conclusion

The test recommendations did not feel like good matches, so I will try again with a higher number of clusters (K).

In [None]:
# Load the pickled KMeans model (presumably k=30, judging by the file name).
# The context manager closes the file handle after loading.
# NOTE: pickle can execute arbitrary code on load; only unpickle trusted files.
with open('kmeans30.p','rb') as model_file:
    kmean30 = pickle.load(model_file)
kmean30

#### We have to reclassify our dataset

In [None]:
# classifier(kmean30)

In [None]:
# df_songs.to_csv('./Data/songspredicted30.csv', index=False)
# Load the songs classified with kmean30 from the CSV written by the (commented-out) to_csv call.
songs_predicted30 = pd.read_csv('./Data/songspredicted30.csv')

In [None]:
# songs_predicted30['class'].value_counts()

# Trying the improved recommender

In [None]:
# Interactive run (prompts for song and artist) using the kmean30 model and its matching classified dataset.
reconmender(kmean30,songs_predicted30)

# More classes
The recommended songs became much more similar to the input, so we now try 45 classes.

In [None]:
# Load the pickled KMeans model (presumably k=45, judging by the file name).
# The context manager closes the file handle after loading.
# NOTE: pickle can execute arbitrary code on load; only unpickle trusted files.
with open('kmeans45.p','rb') as model_file:
    kmean45 = pickle.load(model_file)
kmean45

In [None]:
# classifier(kmean45)

In [None]:
# df_songs.to_csv('./Data/songspredicted45.csv', index=False)
# Load the songs classified with kmean45 from the CSV written by the (commented-out) to_csv call.
songs_predicted45 = pd.read_csv('./Data/songspredicted45.csv')

# Final version

#### Testing again

In [None]:
# Interactive run (prompts for song and artist) using the kmean45 model and its matching classified dataset.
reconmender(kmean45,songs_predicted45)

## Why it did not work in class:

In [None]:
# Number of distinct class labels that actually occur in the classified dataset.
len(songs_predicted45['class'].unique())

Apparently, some classes were never assigned to any song while classifying the dataset. If a new song falls into one of these classes, which do not appear in the dataframe, the selection fails because there are no songs of the same class to choose from.

So I would either stick with the lower number of classes, or accept that the input song sometimes cannot be assigned to an existing group and print a corresponding message.

I added an if check for this case that prints out that no similar song was found.