In [1]:
# Standard stuff
import numpy as np
import pandas as pd

# imports for data transformation and model
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from category_encoders import BinaryEncoder
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

# viz libraries
# import matplotlibpyplot as plt0

# sql alchemy for connecting to database
from sqlalchemy import create_engine, func
from sqlalchemy.orm import Session
from sqlalchemy.ext.automap import automap_base


import os

In [5]:
# create engine and connect to the spotify database
engine = create_engine("sqlite:///spotify_db.db")

# map the tables to objects
orm = automap_base()
orm.prepare(engine, reflect=True)

In [7]:
# read in master table from sql alchemy
df = pd.read_sql(sql='master', con=engine)

In [39]:
# start dropping useless columns
cleaned = df.drop(columns=['explicit','release_date'])
cleaned

Unnamed: 0,id,name,popularity,duration_ms,artists,id_artists,danceability,energy,key,loudness,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,decades,followers,genres
0,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,0.434,0.1770,1,-21.180,...,0.0512,0.994,0.021800,0.2120,0.4570,130.418,5,1920's,3528.0,tango
1,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,0.321,0.0946,7,-27.961,...,0.0504,0.995,0.918000,0.1040,0.3970,169.980,3,1920's,3528.0,tango
2,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,0.402,0.1580,3,-16.900,...,0.0390,0.989,0.130000,0.3110,0.1960,103.220,4,1920's,11327.0,adult standards
3,0BRXJHRNGQ3W4v9frnSfhu,Ave Maria,0,178933,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,0.227,0.2610,5,-12.343,...,0.0382,0.994,0.247000,0.0977,0.0539,118.891,4,1920's,11327.0,adult standards
4,0IA0Hju8CAgYfV1hwhidBH,La Java,0,161427,Mistinguett,4AxgXfD7ISvJSTObqm4aIE,0.563,0.1840,4,-13.757,...,0.0512,0.993,0.000016,0.3250,0.6540,133.088,3,1920's,5078.0,vintage chanson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518666,5rgu12WBIHQtvej2MdHSH0,云与海,50,258267,阿YueYue,1QLBXKM5GCpyQQSVMNZqrZ,0.560,0.5180,0,-7.471,...,0.0292,0.785,0.000000,0.0648,0.2110,131.896,4,2010's,896.0,chinese viral pop
518667,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,ROLE MODEL,1dy5WNgIKQU6ezkpZs4y8z,0.765,0.6630,0,-5.223,...,0.0652,0.141,0.000297,0.0924,0.6860,150.091,4,2010's,245944.0,pop
518668,27Y1N4Q4U3EfDU5Ubw8ws2,What They'll Say About Us,70,187601,FINNEAS,37M5pPGs6V1fchFJSgCguX,0.535,0.3140,7,-12.823,...,0.0408,0.895,0.000150,0.0874,0.0663,145.095,4,2010's,1168213.0,pop
518669,45XJsGpFTyzbzeWK8VzR8S,A Day At A Time,58,142003,Gentle Bones,4jGPdu95icCKVF31CcFKbS,0.696,0.6150,10,-6.212,...,0.0345,0.206,0.000003,0.3050,0.4380,90.029,4,2020's,45309.0,indie cafe pop


In [38]:
# drop columns with null values
cleaned.dropna(inplace=True)

In [9]:
# for the time being we drop all the non-numerical data
# leave artists since we will encode it 
# drop genres for now but ultimately we could encode this as a categorical variable

numerical_only = cleaned.drop(columns=['id','name','id_artists','genres'])
numerical_only.dtypes

popularity            int64
duration_ms           int64
artists              object
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
decades              object
followers           float64
dtype: object

In [10]:
numerical_only.sample(5)

# [[chunchaca],
#  [chamame,folclore salteno,...],
#  []

# chunchaca 54
# chamame 24
# folclore salteno 13

Unnamed: 0,popularity,duration_ms,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,decades,followers
170052,39,235200,Yahoo,0.623,0.569,4,-9.645,1,0.0264,0.587,0.0,0.0929,0.571,134.676,4,1990's,47110.0
175951,20,341333,Ludwig van Beethoven,0.317,0.208,0,-18.255,1,0.0446,0.918,0.756,0.0912,0.329,153.33,4,1960's,3823575.0
329394,45,185294,Otilia,0.66,0.87,3,-1.822,0,0.0601,0.0438,0.0138,0.122,0.807,170.193,4,2010's,117666.0
465126,48,216720,Trzeci Wymiar,0.807,0.596,9,-5.082,0,0.0992,0.125,1e-06,0.205,0.36,94.029,4,2000's,64599.0
125246,36,63000,Bibi Blocksberg,0.429,0.623,10,-10.537,0,0.105,0.703,0.0,0.357,0.566,76.657,4,1980's,128057.0


In [11]:
# scaler that only applies to the columns 
numerical_scaler = ColumnTransformer([('numerical_scaler', 
                                       StandardScaler(), 
                                       ['popularity','duration_ms','tempo', 'loudness', 'followers']),
                                      ('be', BinaryEncoder(), ['artists', 'decades'] )],
                                     remainder='passthrough')

In [12]:
numerical_only.dropna(inplace=True)

In [19]:
# create the data pipeline
pipe = make_pipeline(numerical_scaler,  KMeans(n_clusters=5))

# fit pipeline (data transform followed by KMeans model)
pipe.fit(numerical_only)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numerical_scaler',
                                                  StandardScaler(),
                                                  ['popularity', 'duration_ms',
                                                   'tempo', 'loudness',
                                                   'followers']),
                                                 ('be', BinaryEncoder(),
                                                  ['artists', 'decades'])])),
                ('kmeans', KMeans(n_clusters=5))])

In [27]:
# add class label to our dataframe
numerical_only['class'] = pipe.named_steps.kmeans.labels_

# split data by class
class_1 = numerical_only[numerical_only['class']==1]
class_2 = numerical_only[numerical_only['class']==2]
class_3 = numerical_only[numerical_only['class']==3]
class_4 = numerical_only[numerical_only['class']==4]
class_5 = numerical_only[numerical_only['class']==5]

# did not reset indecies since they are needed later to indentify song name
# reset indicies
# class_1.reset_index(drop=True, inplace=True)
# class_2.reset_index(drop=True, inplace=True)
# class_3.reset_index(drop=True, inplace=True)
# class_4.reset_index(drop=True, inplace=True)
# class_5.reset_index(drop=True, inplace=True)

In [28]:
# testing suggesting a song from cluster 1
# fit knn model on data in class_1
cluster_1_knn = make_pipeline(numerical_scaler, NearestNeighbors(n_neighbors=5))
cluster_1_knn.fit(class_1)

#get a test song from class_1 (ultimately this should be replaced by pulling a song from the Spotify API)
test_song = numerical_scaler.fit_transform(class_1)[0]
test_song = test_song.reshape(1,-1)

# get the closest n_neighbors to the test song
results = cluster_1_knn.named_steps.nearestneighbors.kneighbors(test_song)

#get the indicies of the suggested songs
inds = results[1][0]


  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [33]:
# create a result dataframe
inds = list(inds)
#class_1[class_1.index.isin(inds)]
pred = class_1.iloc[inds]
pred

Unnamed: 0,popularity,duration_ms,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,decades,followers,class
10,0,180400,Perchicot,0.592,0.408,10,-11.585,0,0.271,0.993,2e-06,0.413,0.766,81.126,4,1920's,28.0,1
2790,0,139831,Celia Gamez,0.63,0.59,10,-12.222,0,0.12,0.721,0.0206,0.36,0.862,93.662,4,1920's,1607.0,1
2850,0,167120,Ignacio Corsini,0.644,0.482,10,-13.706,0,0.388,0.978,4e-05,0.555,0.858,95.694,4,1920's,3528.0,1
53,0,192933,Georgius,0.625,0.35,10,-15.916,1,0.312,0.984,0.0,0.0988,0.808,81.567,4,1920's,675.0,1
1902,0,140417,H.P. Lovecraft,0.701,0.206,10,-13.235,0,0.944,0.572,0.0,0.13,0.578,80.883,4,1920's,17413.0,1


In [40]:
# merge output with cleaned data to get song name and ID
merged_song_reco = cleaned.merge(pred, left_index=True, right_index=True)
merged_song_reco

Unnamed: 0,id,name,popularity_x,duration_ms_x,artists_x,id_artists,danceability_x,energy_x,key_x,loudness_x,...,speechiness_y,acousticness_y,instrumentalness_y,liveness_y,valence_y,tempo_y,time_signature_y,decades_y,followers_y,class
10,1CZkVPFUK7wO1sMOlBHxJr,C'est La Mode Et Voila Tout,0,180400,Perchicot,2kBMqLbeN3STYqwPqJW45j,0.592,0.408,10,-11.585,...,0.271,0.993,2e-06,0.413,0.766,81.126,4,1920's,28.0,1
53,5P35CyOEQiMqf8Hd6Pty5p,Méfiez Vous D'anatole,0,192933,Georgius,0Yly6PvJghpLD3Lj8Kvpcj,0.625,0.35,10,-15.916,...,0.312,0.984,0.0,0.0988,0.808,81.567,4,1920's,675.0,1
1902,0ykQtZ84NGXhHIVFQgbxyj,Capítulo 3.6 & Capítulo 4.1 - la Casa Maldita,0,140417,H.P. Lovecraft,1QUtO0CVLXg2DonM9aCJrV,0.701,0.206,10,-13.235,...,0.944,0.572,0.0,0.13,0.578,80.883,4,1920's,17413.0,1
2790,4ZKHPjFt59tZ2D5aeuPDg8,Alas - Remastered,0,139831,Celia Gamez,6lha0NaV70St0Ij3MuXTyX,0.63,0.59,10,-12.222,...,0.12,0.721,0.0206,0.36,0.862,93.662,4,1920's,1607.0,1
2850,5ZAcsAmC4H5NPXHJqZ5WoA,Alzame en Tus Brazos - Remasterizado,0,167120,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,0.644,0.482,10,-13.706,...,0.388,0.978,4e-05,0.555,0.858,95.694,4,1920's,3528.0,1


In [17]:
## run demo with track names, pre-fit model on all 5 clusters

In [18]:
# test the number of clusters to use (uncomment to test)
# models = [KMeans(n_clusters=i).fit(numerical_only) for i in range(1,15)]
# inertia = [model.inertia_ for model in models]
# plt.plot(inertia)