In [1]:
#Import Packages

# Data analysis
import pandas as pd
import numpy as np
import requests

# Data cleaning
import re

# Tokenizing words
import spacy
from spacy.tokenizer import Tokenizer
from collections import Counter

# TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Encoding
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
from sqlalchemy import create_engine
import numpy as np
from sklearn import preprocessing  # for category encoder
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from typing import List, Tuple


## Wrangle

In [3]:
spot = pd.read_csv('https://raw.githubusercontent.com/rowaishanna/sp/master/Spotifyfeatures_reducedsize.csv')
print(spot.shape)
spot.head()

(165331, 18)


Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,R&B,Mary J. Blige,Be Without You - Kendu Mix,2YegxR5As7BeQuVp2U6pek,65,0.083,0.724,246333,0.689,0.0,D,0.304,-5.922,Minor,0.135,146.496,4-Apr,0.693
1,R&B,Rihanna,Desperado,6KFaHC9G178beAp7P0Vi5S,63,0.323,0.685,186467,0.61,0.0,C,0.102,-5.221,Minor,0.0439,94.384,4-Mar,0.323
2,R&B,Yung Bleu,Ice On My Baby (feat. Kevin Gates) - Remix,6muW8cSjJ3rusKJ0vH5olw,62,0.0675,0.762,199520,0.52,4e-06,F,0.114,-5.237,Minor,0.0959,75.047,4-Apr,0.0862
3,R&B,Surfaces,Heaven Falls / Fall on Me,7yHqOZfsXYlicyoMt62yC6,61,0.36,0.563,240597,0.366,0.00243,B,0.0955,-6.896,Minor,0.121,85.352,4-Apr,0.768
4,R&B,Olivia O'Brien,Love Myself,4XzgjxGKqULifVf7mnDIQK,68,0.596,0.653,213947,0.621,0.0,B,0.0811,-5.721,Minor,0.0409,100.006,4-Apr,0.466


In [4]:
# Sample a fraction of the data set
spot = spot.sample(frac=.2, axis = 0)

# Label encode genre
le= LabelEncoder()

cols = ['genre', 'time_signature', 'mode', 'key']
for col in cols:
  spot[col] = le.fit_transform(spot[col])

# Copy dataframe
spot2=spot.copy()

In [5]:
# Combine text columns for tokenization
col_combine = ['artist_name', 'track_name']

# Lowercase and regex
for each in col_combine:
  spot2[each]= spot2[each].apply(lambda x:x.lower())
  spot2[each]= spot2[each].apply(lambda x: re.sub('[^a-zA-Z 0-9]', ' ', x))

# Combine two columns with text
spot2['combined_text'] = spot2['combined_text'] = spot2['artist_name'] + spot2['track_name'] 

# Remove repetitive columns
spot2= spot2.drop(['artist_name', 'track_name', 'track_id'], axis = 1)

# Take a subset with numerical columns
spot2_sub = spot2.drop(['combined_text'], axis = 1)

In [6]:
spot2.head()

Unnamed: 0,genre,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,combined_text
78968,12,62,0.0823,0.587,215360,0.671,0.0,5,0.161,-4.48,0,0.144,127.133,1,0.1,walerunning back feat lil wayne
136422,4,17,0.739,0.61,88200,0.57,0.0,8,0.826,-14.251,1,0.928,83.541,3,0.568,paul mooneyoj oj oj
143340,11,44,0.174,0.814,322027,0.38,0.21,8,0.0826,-13.307,1,0.0353,102.994,1,0.588,st germainmary l
85001,14,55,0.254,0.595,349598,0.578,0.0,4,0.37,-10.715,0,0.581,81.613,1,0.368,flatbush zombiesthe glory feat denzel curry
159367,18,50,0.33,0.738,246693,0.921,2e-06,3,0.066,-4.908,0,0.124,125.436,1,0.668,aretha franklinfreeway of love single mix


In [7]:
# Scale numerical columns

scaler = StandardScaler()
scaler.fit(spot2_sub)
scaled_df = pd.DataFrame(scaler.transform(spot2_sub))
print(scaled_df.shape)
scaled_df.head()

(33066, 15)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.48092,0.992092,-0.738624,-0.055776,-0.20947,0.282189,-0.419891,-0.104513,-0.285994,0.810783,-0.759836,0.0301,0.305189,-0.345226,-1.555201
1,-1.068331,-1.938545,1.269878,0.084451,-1.376224,-0.150483,-0.419891,0.759873,2.966198,-1.080757,1.316074,3.844683,-1.14571,2.585518,0.353411
2,0.287263,-0.180163,-0.458162,1.328204,0.76925,-0.964419,0.394126,0.759873,-0.66941,-0.898011,1.316074,-0.498785,-0.498244,-0.345226,0.434976
3,0.868232,0.536215,-0.213483,-0.007001,1.022227,-0.116212,-0.419891,-0.392641,0.736124,-0.396233,-0.759836,2.156341,-1.209881,-0.345226,-0.462235
4,1.642857,0.210589,0.018961,0.864845,0.078025,1.353158,-0.419883,-0.680769,-0.750593,0.727928,-0.759836,-0.067211,0.248706,-0.345226,0.761234


### TFIDF

In [9]:
# Set up TFIDF
# Instantiate vectorizer object

def tokenize(document):
    
    doc = nlp(document)
    
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and (token.is_punct != True)]

tfidf = TfidfVectorizer(
    stop_words = 'english',
    # tokenizer = tokenize,
    ngram_range = (1,1),
    min_df = 1, 
    max_df = 0.9,
    max_features = 1000)

In [12]:
# Create a vocabulary and tf-idf score per document
text = spot2['combined_text']
dtm = tfidf.fit_transform(text)

In [13]:
# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

(33066, 1000)


Unnamed: 0,10,11,12,125,13,14,15,16,17,18,19,20,2001,2002,2005,2007,2009,2010,2011,2012,2013,2014,2015,2016,2017,21,23,24,25,27,28,30,31,32,35,40,50,aaron,acoustic,act,...,wild,william,williams,willie,wilson,wind,wine,wings,wish,wisin,wit,witt,wiz,wolf,wolfgang,woman,women,won,wonder,wood,words,work,world,wrong,xavier,ya,yeah,year,years,yellow,yo,york,young,yung,zac,zeds,zero,ziggy,zion,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
combined_df=pd.concat([scaled_df, dtm], axis = 1)
combined_df.shape

(33066, 1015)

# Similarity Recommender
## A.  Nearest Neighbors

In [12]:
nn = NearestNeighbors(n_neighbors=6)

In [13]:
nn.fit(combined_df)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                 radius=1.0)

In [14]:
# looking for similar song to 1st entry in DF
a = nn.kneighbors([combined_df.iloc[10].values])

In [15]:
# Distances of the first 5 from inquired song
a[0]

array([[0.        , 1.32411756, 1.58191366, 1.61112974, 1.63151354,
        1.669309  ]])

In [16]:
# Song identifiers
a[1]

array([[   10,  3338, 32269, 28760, 20422, 25368]])

In [17]:
# First Inquiry

print(spot['artist_name'].iloc[0], spot['track_name'].iloc[0])  # inquiry

print(spot['artist_name'].iloc[871], spot['track_name'].iloc[871])  # similar 1
print(spot['artist_name'].iloc[6184], spot['track_name'].iloc[6184])
print(spot['artist_name'].iloc[1619], spot['track_name'].iloc[1619])
print(spot['artist_name'].iloc[5323], spot['track_name'].iloc[5323])
print(spot['artist_name'].iloc[2538], spot['track_name'].iloc[2538])

De La Ghetto La Formula (feat. Chris Jeday)
Valentino Khan Lick It
JJ Grey & Mofro A Woman
Bruno Mars Talking to the Moon
Dominic Miller Étude
Comedian Bob Marley Select Serviceman


In [25]:
# Second Inquiry

print(spot['artist_name'].iloc[10], spot['track_name'].iloc[10])  # inquiry

print(spot['artist_name'].iloc[798], spot['track_name'].iloc[798])  # similar 1
print(spot['artist_name'].iloc[5196], spot['track_name'].iloc[5196])
print(spot['artist_name'].iloc[502], spot['track_name'].iloc[502])
print(spot['artist_name'].iloc[3980], spot['track_name'].iloc[3980])
print(spot['artist_name'].iloc[1617], spot['track_name'].iloc[1617])

Tisoki Gave You Love
What So Not Beautiful
Zomboy Young & Dangerous - EP Version
LOUDPVCK More Than I Can Take
Two Door Cinema Club Come Back Home
Death From Above 1979 Moonlight


# Similarity Recommender
## Cosine Similarity

In [14]:
# Calculate Distance of TF-IDF Vectors
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Calculate Distance of TF-IDF Vectors
dist_matrix  = cosine_similarity(dtm)

In [None]:
# Turn it into a DataFrame
cosine_df = pd.DataFrame(dist_matrix)
print(cosine_df.shape)
cosine_df.head()

(16533, 16533)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,16493,16494,16495,16496,16497,16498,16499,16500,16501,16502,16503,16504,16505,16506,16507,16508,16509,16510,16511,16512,16513,16514,16515,16516,16517,16518,16519,16520,16521,16522,16523,16524,16525,16526,16527,16528,16529,16530,16531,16532
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.378948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Verify it was added
leng = len(spot)-1
spot.iloc[leng]

genre                                           7
artist_name                                Tchami
track_name          Move Your Body (Future House)
track_id                   5fFEgmLAbNG75zKPn3hXOO
popularity                                     46
acousticness                              0.00424
danceability                                0.818
duration_ms                                219429
energy                                       0.87
instrumentalness                            0.196
key                                             0
liveness                                   0.0429
loudness                                   -5.162
mode                                            0
speechiness                                0.0493
tempo                                     123.999
time_signature                                  0
valence                                     0.741
Name: 25045, dtype: object

In [None]:
# # Grab the top 5 most similar strains to the custom strain at the start.
last_cosine = len(cosine_df)-1
cosine_results = cosine_df[cosine_df[0] < 1][last_cosine].sort_values(ascending=False)[1:6]
cosine_results =  pd.DataFrame(cosine_results)
cosine_results = cosine_results.reset_index()
cos_results = cosine_results['index'].values.tolist()
cos_results

[4678, 1803, 9873, 9472, 7437]

In [None]:
# Check results
print('----------------------------')
print('----------------------------')
print(f"Seed song:") 
print(f"{spot.iloc[leng]}")
print('----------------------------')
print('----------------------------')
print('Similar songs:')
print('----------------------------')
for each in cos_results:
  print(spot.iloc[each])

----------------------------
----------------------------
Seed song:
genre                                           7
artist_name                                Tchami
track_name          Move Your Body (Future House)
track_id                   5fFEgmLAbNG75zKPn3hXOO
popularity                                     46
acousticness                              0.00424
danceability                                0.818
duration_ms                                219429
energy                                       0.87
instrumentalness                            0.196
key                                             0
liveness                                   0.0429
loudness                                   -5.162
mode                                            0
speechiness                                0.0493
tempo                                     123.999
time_signature                                  0
valence                                     0.741
Name: 25045, dtype: object
----