In [25]:
# Importing libraries 

import pandas as pd
import numpy as np
import json
import re 
import sys
import itertools
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.neighbors import KDTree
from sklearn.neighbors import NearestNeighbors

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Reading in and slimming down the scv file

df = pd.read_csv('tracks_features.csv')

df = df.sample(n = 500000)

## Doing some EDA

In [3]:
df.shape

(500000, 24)

In [5]:
df.isnull().sum().sum()

0

In [6]:
df.head()

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
152271,7tidHiytndQ0KeZ3aHykff,A Place In The Choir,Celebrate The Difference,4FufsQX4af42WB73qnSKTa,['Terri Hendrix'],['7JSGrGrjakwMxvAXjHlni1'],11,1,False,0.788,...,0.027,0.657,0.0,0.0634,0.94,105.368,164907,3.0,2005,2005-12-07
798202,2SdegdTpWGahzegaR4oOta,"Piedigrotta 1924 - Rapsodia Napoletana, Op. 32...",The Eclectic Piano Music of Mario Castelnuovo-...,5rGDMIMNMmaR07Qnm1ppUc,"['Mario Castelnuovo-Tedesco', 'David Witten']","['28c3Va3dnSLY1av3ym8XT2', '03BCJvfdmVLy3v3HHg...",17,1,False,0.467,...,0.0387,0.984,0.868,0.112,0.131,123.262,289573,4.0,2018,2018-08-29
1016231,0mltY9vaTJbC4FWZ6JlaOT,Engine,Follow Me,4JqsS6Xy0UEdYbOogZyxh2,['Isac Elliot'],['3aD9K1zaLQ3G7yp9XV5E4D'],5,1,False,0.692,...,0.0479,0.00246,0.0,0.0866,0.499,128.036,234253,4.0,2014,2014-11-03
1098309,1cxWjhRxdSV9QNLJWbqhtv,123 I'm a G (Remix),Screwbaby: Drank Stains & Oil Spills,5nFwbmaHvkOHxs9DtT5wwX,"['Chris Ward', 'Big Hawk', 'Bun B', 'DJ Screw'...","['3f9RwFlXYTYCCq8qjzAHho', '5UZLYxzCWbloCIsXVq...",15,1,True,0.781,...,0.26,0.00267,0.0,0.272,0.535,160.103,255844,4.0,2020,2020-07-20
480003,160YgJnAU2pwB8fw1fWg63,Sky Kisses Earth (Sean Dinsmore Remix),Dakini Lounge: Prem Joshua Remixed,3POYtJ1eftAtmn3VoZkc7A,['Prem Joshua'],['1Bs9FqmJBHrAJN6DLFIPt1'],7,1,False,0.345,...,0.0705,0.0488,0.318,0.0823,0.935,161.963,304253,4.0,2003,2003


In [7]:
df['time_signature'].value_counts()

4.0    410834
3.0     67344
5.0     12570
1.0      8054
0.0      1198
Name: time_signature, dtype: int64

## NLP and creating new columns

In [8]:
df['artists_upd_v1'] = df['artists'].apply(lambda x: re.findall(r"'([^']*)'", x))
df['artists_upd_v2'] = df['artists'].apply(lambda x: re.findall('\"(.*?)\"',x))
df['artists_upd'] = np.where(df['artists_upd_v1'].apply(lambda x: not x), df['artists_upd_v2'], df['artists_upd_v1'] )


In [9]:
#need to create my own song identifier because there are duplicates of the same song with different ids
df['artists_song'] = df.apply(lambda row: row['name'] + ' ' + 'by'+ ' ' + row['artists_upd'][0],axis = 1)

In [10]:
df.head()

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,valence,tempo,duration_ms,time_signature,year,release_date,artists_upd_v1,artists_upd_v2,artists_upd,artists_song
152271,7tidHiytndQ0KeZ3aHykff,A Place In The Choir,Celebrate The Difference,4FufsQX4af42WB73qnSKTa,['Terri Hendrix'],['7JSGrGrjakwMxvAXjHlni1'],11,1,False,0.788,...,0.94,105.368,164907,3.0,2005,2005-12-07,[Terri Hendrix],[],[Terri Hendrix],A Place In The Choir by Terri Hendrix
798202,2SdegdTpWGahzegaR4oOta,"Piedigrotta 1924 - Rapsodia Napoletana, Op. 32...",The Eclectic Piano Music of Mario Castelnuovo-...,5rGDMIMNMmaR07Qnm1ppUc,"['Mario Castelnuovo-Tedesco', 'David Witten']","['28c3Va3dnSLY1av3ym8XT2', '03BCJvfdmVLy3v3HHg...",17,1,False,0.467,...,0.131,123.262,289573,4.0,2018,2018-08-29,"[Mario Castelnuovo-Tedesco, David Witten]",[],"[Mario Castelnuovo-Tedesco, David Witten]","Piedigrotta 1924 - Rapsodia Napoletana, Op. 32..."
1016231,0mltY9vaTJbC4FWZ6JlaOT,Engine,Follow Me,4JqsS6Xy0UEdYbOogZyxh2,['Isac Elliot'],['3aD9K1zaLQ3G7yp9XV5E4D'],5,1,False,0.692,...,0.499,128.036,234253,4.0,2014,2014-11-03,[Isac Elliot],[],[Isac Elliot],Engine by Isac Elliot
1098309,1cxWjhRxdSV9QNLJWbqhtv,123 I'm a G (Remix),Screwbaby: Drank Stains & Oil Spills,5nFwbmaHvkOHxs9DtT5wwX,"['Chris Ward', 'Big Hawk', 'Bun B', 'DJ Screw'...","['3f9RwFlXYTYCCq8qjzAHho', '5UZLYxzCWbloCIsXVq...",15,1,True,0.781,...,0.535,160.103,255844,4.0,2020,2020-07-20,"[Chris Ward, Big Hawk, Bun B, DJ Screw, Lil O,...",[],"[Chris Ward, Big Hawk, Bun B, DJ Screw, Lil O,...",123 I'm a G (Remix) by Chris Ward
480003,160YgJnAU2pwB8fw1fWg63,Sky Kisses Earth (Sean Dinsmore Remix),Dakini Lounge: Prem Joshua Remixed,3POYtJ1eftAtmn3VoZkc7A,['Prem Joshua'],['1Bs9FqmJBHrAJN6DLFIPt1'],7,1,False,0.345,...,0.935,161.963,304253,4.0,2003,2003,[Prem Joshua],[],[Prem Joshua],Sky Kisses Earth (Sean Dinsmore Remix) by Prem...


In [11]:
df['artists_song'][0]

'Testify by Rage Against The Machine'

In [12]:
# Reduce columns to 13 to match the song features that Spotify API will return to us plus column with Song and Artist name 
cols_to_drop = [
        "year",
        "release_date",
        "explicit",
        "disc_number",
        "track_number",
        "artist_ids",
        "artists",
        "album_id",
        "album",
        "name",
        "artists_upd",
        "artists_upd_v2",
        "artists_upd_v1",
        "id",
    ]
df = df.drop(cols_to_drop, axis=1)

In [13]:
df.shape

(500000, 14)

In [14]:
# DF with song features that Spotify API will return
features = df.drop(columns='artists_song')

In [15]:
features

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
152271,0.788,0.65500,2,-4.357,1,0.0270,0.657000,0.0000,0.0634,0.9400,105.368,164907,3.0
798202,0.467,0.15900,2,-23.472,0,0.0387,0.984000,0.8680,0.1120,0.1310,123.262,289573,4.0
1016231,0.692,0.74300,0,-6.023,1,0.0479,0.002460,0.0000,0.0866,0.4990,128.036,234253,4.0
1098309,0.781,0.82100,2,-8.011,1,0.2600,0.002670,0.0000,0.2720,0.5350,160.103,255844,4.0
480003,0.345,0.86800,1,-6.206,0,0.0705,0.048800,0.3180,0.0823,0.9350,161.963,304253,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
744138,0.127,0.00721,7,-22.596,0,0.0365,0.606000,0.8850,0.1070,0.0381,64.354,367840,3.0
213615,0.149,0.10600,5,-23.138,0,0.0402,0.989000,0.0678,0.1010,0.0526,97.327,185507,4.0
599911,0.445,0.52800,10,-7.243,1,0.0744,0.677000,0.0000,0.1550,0.5110,203.155,223452,4.0
130588,0.536,0.83500,6,-6.046,1,0.0297,0.093800,0.0000,0.4130,0.8700,180.039,215320,4.0


In [23]:
# creating df with indexes and song by artist columns 
names = df[['artists_song']]

In [16]:
# Creating and fitting NearestNeighbors model
model = NearestNeighbors(n_neighbors=10, algorithm='brute')
model.fit(features)


NearestNeighbors(algorithm='brute', n_neighbors=10)

In [19]:
# indexes of recommended songs basen on features of the first song in our dataset
n_dist, n_ind = model.kneighbors(features.head(1))
ind = list(n_ind[0])

In [24]:
# 10 recommended songs
id_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
artist_song = []

for each in ind:
    artist_song.append(names.iloc[each]["artists_song"])

recommendations = list(zip(id_list, artist_song))

recommendations

[(1, 'A Place In The Choir by Terri Hendrix'),
 (2, 'These Are the Instructions by John Mann'),
 (3, 'Para Qué Quieres Volver - En Vivo by Miguel Angel Y Su Grupo Carino'),
 (4, 'Dying Day by Eric Metronome'),
 (5, 'Ultima Visio by Imago Mortis'),
 (6, 'Place to Call My Home by Dan Sartain'),
 (7, 'Robin (The Hooded Man) - Remastered by Clannad'),
 (8, 'Waves (feat. Simon Dominic & Jamie) by KANG DANIEL'),
 (9, 'Stop Playin by Young Mass'),
 (10, 'Big Bad Bill (Is Sweet William Now) - 2015 Remaster by Van Halen')]

In [42]:
# Creating a csv
names.to_csv(r'song_artist.csv')

In [49]:
# Saving our trained model
pickle.dump(model, open('Spotify_model_new', 'wb'))