In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict


import warnings
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv("data.csv")
genre_data = pd.read_csv('data_by_genres.csv')

In [4]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=10))])
X = genre_data.select_dtypes(np.number)
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)

In [None]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict
# Visualizing the Clusters with PCA

from sklearn.decomposition import PCA

pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)

client_id="b4dad3bdf5144e6f8f408ec2f6f278a3"
client_secret="a1e9ff23036a4444abcf6067fd63c2ca"

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))


number_cols = ['valence', 'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']

# Visualizing the Clusters with t-SNE

from sklearn.manifold import TSNE

tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1))])
genre_embedding = tsne_pipeline.fit_transform(X)

song_cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                                  ('kmeans', KMeans(n_clusters=20, 
                                   verbose=False))
                                 ], verbose=False)

X = data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels

class Rec:
    def find_song(name):
        song_data = defaultdict()
        results = sp.search(q= name, limit=1)
        if results['tracks']['items'] == []:
            return None

        results = results['tracks']['items'][0]
        track_id = results['id']
        audio_features = sp.audio_features(track_id)[0]

        song_data['name'] = [name]
        song_data['explicit'] = [int(results['explicit'])]
        song_data['duration_ms'] = [results['duration_ms']]
        song_data['popularity'] = [results['popularity']]

        for key, value in audio_features.items():
            song_data[key] = value

        return pd.DataFrame(song_data)

    def get_song_data(song, spotify_data):

        try:
            song_data = spotify_data[(spotify_data['name'] == song['name'])].iloc[0]
            return song_data

        except IndexError:
            return Rec.find_song(song['name'])


    def get_mean_vector(song_list, spotify_data):

        song_vectors = []

        for song in song_list:
            song_data = Rec.get_song_data(song, spotify_data)
            if song_data is None:
                print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
                continue
            song_vector = song_data[number_cols].values
            song_vectors.append(song_vector)  

        song_matrix = np.array(list(song_vectors))
        return np.mean(song_matrix, axis=0)


    def flatten_dict_list(dict_list):

        flattened_dict = defaultdict()
        for key in dict_list[0].keys():
            flattened_dict[key] = []

        for dictionary in dict_list:
            for key, value in dictionary.items():
                flattened_dict[key].append(value)

        return flattened_dict


    def recommend_songs( song_list, n_songs=10):
        spotify_data = data

        metadata_cols = ['name', 'id', 'artists']
        song_dict = Rec.flatten_dict_list(song_list)

        song_center = Rec.get_mean_vector(song_list, spotify_data)
        scaler = song_cluster_pipeline.steps[0][1]
        scaled_data = scaler.transform(spotify_data[number_cols])
        scaled_song_center = scaler.transform(song_center.reshape(1, -1))
        distances = cdist(scaled_song_center, scaled_data, 'cosine')
        index = list(np.argsort(distances)[:, :n_songs][0])

        rec_songs = spotify_data.iloc[index]
        rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
        return rec_songs[metadata_cols].to_dict(orient='records')
    

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 170653 samples in 1.139s...


In [None]:
Rec.recommend_songs([{'name': 'baby'}])

In [14]:
obj = Rec

In [15]:
import pickle
with open('song_recommendation', 'wb') as f:
    pickle.dump(obj,f)

In [16]:
import pickle
with open('song_recommendation', 'rb') as f:
    m = pickle.load(f)

In [17]:
m.recommend_songs([{'name': 'one last'}])

[{'name': 'One Last Time',
  'id': '7xoUc6faLbCqZO6fQEYprd',
  'artists': "['Ariana Grande']"},
 {'name': 'Apologize',
  'id': '6ucR4KfvsBFWCMVFDvyKKl',
  'artists': "['Timbaland', 'OneRepublic']"},
 {'name': 'Austin',
  'id': '2WQAknZHrJAEQOFXe0F44P',
  'artists': "['Blake Shelton']"},
 {'name': 'Swim',
  'id': '3M0lSi5WW79CXQamgSBIjx',
  'artists': "['Chase Atlantic']"},
 {'name': 'Víveme',
  'id': '376zCxYCHr7rSFBdz41QyE',
  'artists': "['Laura Pausini']"},
 {'name': 'Life Of The Party',
  'id': '14hJ5tc1VCFMWhVn9axRTC',
  'artists': "['Shawn Mendes']"},
 {'name': 'Losing Me',
  'id': '4P6g8wuXeR3wznFk7WnI4w',
  'artists': "['Gabrielle Aplin', 'JP Cooper']"},
 {'name': 'That Should Be Me',
  'id': '0aPZbnkMoWJaJ5CNVLCj8S',
  'artists': "['Justin Bieber']"},
 {'name': 'Melting',
  'id': '2kSb3wYSOV996xA2NSmpck',
  'artists': "['Kali Uchis']"},
 {'name': 'Wildfire',
  'id': '2Ro9FLIVhPwIQopSr48oJT',
  'artists': "['Seafret']"}]