In [1]:
import pandas as pd
import numpy as np
import json
import re
import sys
import itertools

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt


import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")

In [2]:
%matplotlib inline

In [3]:
#If you're not familiar with this, save it! Makes using jupyter notebook on laptops much easier
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [4]:
# #another useful command to make data exploration easier
# # NOTE: if you are using a massive dataset, this could slow down your code.
# pd.set_option('display.max_columns', None)
# pd.set_option("max_rows", None)

In [5]:
spotify_df = pd.read_csv('tracks.csv')

In [6]:
spotify_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [7]:
data_w_genre = pd.read_csv('artists.csv')
data_w_genre.head()

Unnamed: 0,id,followers,genres,name,popularity
0,0DheY5irMjBUeLybbCUEZ2,0.0,[],Armid & Amir Zare Pashai feat. Sara Rouzbehani,0
1,0DlhY15l3wsrnlfGio2bjU,5.0,[],ปูนา ภาวิณี,0
2,0DmRESX2JknGPQyO15yxg7,0.0,[],Sadaa,0
3,0DmhnbHjm1qw6NCYPeZNgJ,0.0,[],Tra'gruda,0
4,0Dn11fWM7vHQ3rinvWEl4E,2.0,[],Ioannis Panoutsopoulos,0


In [8]:
data_w_genre.dtypes

id             object
followers     float64
genres         object
name           object
popularity      int64
dtype: object

In [9]:
data_w_genre['genres'].values[0]

'[]'

In [10]:
data_w_genre['genres'].values[0][0]

'['

In [11]:
data_w_genre['genres_upd'] = data_w_genre['genres'].apply(lambda x: [re.sub(' ','_',i) for i in re.findall(r"'([^']*)'", x)])

In [12]:
data_w_genre['genres_upd'].values[0]

[]

In [13]:
data_w_genre.head()

Unnamed: 0,id,followers,genres,name,popularity,genres_upd
0,0DheY5irMjBUeLybbCUEZ2,0.0,[],Armid & Amir Zare Pashai feat. Sara Rouzbehani,0,[]
1,0DlhY15l3wsrnlfGio2bjU,5.0,[],ปูนา ภาวิณี,0,[]
2,0DmRESX2JknGPQyO15yxg7,0.0,[],Sadaa,0,[]
3,0DmhnbHjm1qw6NCYPeZNgJ,0.0,[],Tra'gruda,0,[]
4,0Dn11fWM7vHQ3rinvWEl4E,2.0,[],Ioannis Panoutsopoulos,0,[]


In [14]:
spotify_df['artists_upd_v1'] = spotify_df['artists'].apply(lambda x: re.findall(r"'([^']*)'", x))

In [15]:
spotify_df['artists_upd_v1'].values[0]

['Uli']

In [16]:
spotify_df['artists_upd_v1'].values[0][0]

'Uli'

In [17]:
spotify_df[spotify_df['artists_upd_v1'].apply(lambda x: not x)].head(5)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_v1
164,1xEEYhWxT4WhDQdxfPCT8D,Snake Rag,20,194533,0,"[""King Oliver's Creole Jazz Band""]",['08Zk65toyJllap1MnzljxZ'],1923,0.708,0.361,...,-11.764,0,0.0441,0.994,0.883,0.103,0.902,105.695,4,[]
170,3rauXVLOOM5BlxWqUcDpkg,Chimes Blues,14,170827,0,"[""King Oliver's Creole Jazz Band""]",['08Zk65toyJllap1MnzljxZ'],1923,0.546,0.189,...,-15.984,1,0.0581,0.996,0.908,0.339,0.554,80.318,4,[]
172,1UdqHVRFYMZKU2Q7xkLtYc,Pickin' On Your Baby,11,197493,0,"[""Clarence Williams' Blue Five""]",['6RuQvIr0t0otZHnAxXTGkm'],1923,0.52,0.153,...,-14.042,1,0.044,0.995,0.131,0.353,0.319,102.937,4,[]
174,0Vl2DO5U6FjgBpzCtBN3OA,Everybody Loves My Baby,10,152507,0,"[""Clarence Williams' Blue Five""]",['6RuQvIr0t0otZHnAxXTGkm'],1923,0.514,0.193,...,-13.92,0,0.238,0.996,0.199,0.248,0.665,180.674,4,[]
180,5SvyP1ZeJX1jA7AOZD08NA,Tears,10,187227,0,"[""King Oliver's Creole Jazz Band""]",['08Zk65toyJllap1MnzljxZ'],1923,0.359,0.357,...,-11.81,1,0.0511,0.994,0.819,0.29,0.753,205.053,4,[]


In [18]:
spotify_df['artists_upd_v2'] = spotify_df['artists'].apply(lambda x: re.findall('\"(.*?)\"',x))
spotify_df['artists_upd'] = np.where(spotify_df['artists_upd_v1'].apply(lambda x: not x), spotify_df['artists_upd_v2'], spotify_df['artists_upd_v1'] )

In [19]:
spotify_df[spotify_df['artists_upd_v1'].apply(lambda x: not x)].head(5)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_v1,artists_upd_v2,artists_upd
164,1xEEYhWxT4WhDQdxfPCT8D,Snake Rag,20,194533,0,"[""King Oliver's Creole Jazz Band""]",['08Zk65toyJllap1MnzljxZ'],1923,0.708,0.361,...,0.0441,0.994,0.883,0.103,0.902,105.695,4,[],[King Oliver's Creole Jazz Band],[King Oliver's Creole Jazz Band]
170,3rauXVLOOM5BlxWqUcDpkg,Chimes Blues,14,170827,0,"[""King Oliver's Creole Jazz Band""]",['08Zk65toyJllap1MnzljxZ'],1923,0.546,0.189,...,0.0581,0.996,0.908,0.339,0.554,80.318,4,[],[King Oliver's Creole Jazz Band],[King Oliver's Creole Jazz Band]
172,1UdqHVRFYMZKU2Q7xkLtYc,Pickin' On Your Baby,11,197493,0,"[""Clarence Williams' Blue Five""]",['6RuQvIr0t0otZHnAxXTGkm'],1923,0.52,0.153,...,0.044,0.995,0.131,0.353,0.319,102.937,4,[],[Clarence Williams' Blue Five],[Clarence Williams' Blue Five]
174,0Vl2DO5U6FjgBpzCtBN3OA,Everybody Loves My Baby,10,152507,0,"[""Clarence Williams' Blue Five""]",['6RuQvIr0t0otZHnAxXTGkm'],1923,0.514,0.193,...,0.238,0.996,0.199,0.248,0.665,180.674,4,[],[Clarence Williams' Blue Five],[Clarence Williams' Blue Five]
180,5SvyP1ZeJX1jA7AOZD08NA,Tears,10,187227,0,"[""King Oliver's Creole Jazz Band""]",['08Zk65toyJllap1MnzljxZ'],1923,0.359,0.357,...,0.0511,0.994,0.819,0.29,0.753,205.053,4,[],[King Oliver's Creole Jazz Band],[King Oliver's Creole Jazz Band]


In [20]:
spotify_df['artists_song'] = spotify_df.apply(lambda row: row['artists_upd'][0]+str(row['name']),axis = 1)

In [21]:
spotify_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_v1,artists_upd_v2,artists_upd,artists_song
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,...,0.674,0.744,0.151,0.127,104.851,3,[Uli],[],[Uli],UliCarve
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,...,0.797,0.0,0.148,0.655,102.009,1,[Fernando Pessoa],[],[Fernando Pessoa],Fernando PessoaCapítulo 2.16 - Banquero Anarqu...
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,...,0.994,0.0218,0.212,0.457,130.418,5,[Ignacio Corsini],[],[Ignacio Corsini],Ignacio CorsiniVivo para Quererte - Remasterizado
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,...,0.995,0.918,0.104,0.397,169.98,3,[Ignacio Corsini],[],[Ignacio Corsini],Ignacio CorsiniEl Prisionero - Remasterizado
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,...,0.989,0.13,0.311,0.196,103.22,4,[Dick Haymes],[],[Dick Haymes],Dick HaymesLady of the Evening


In [22]:
spotify_df.drop_duplicates('artists_song',inplace = True)

In [23]:
spotify_df[spotify_df['name']=='Adore You']

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_v1,artists_upd_v2,artists_upd,artists_song
86217,5AnCLGg35ziFOloEnXK4uu,Adore You,71,278747,0,['Miley Cyrus'],['5YGY8feqx7naU7z4HrwZM6'],2013-10-04,0.583,0.655,...,0.111,4e-06,0.113,0.201,119.759,4,[Miley Cyrus],[],[Miley Cyrus],Miley CyrusAdore You
91884,3jjujdWJ72nww5eGnfs2E7,Adore You,88,207133,0,['Harry Styles'],['6KImCVD70vtIoJWnq6nGn3'],2019-12-13,0.676,0.771,...,0.0237,7e-06,0.102,0.569,99.048,4,[Harry Styles],[],[Harry Styles],Harry StylesAdore You


In [24]:
artists_exploded = spotify_df[['artists_upd','id']].explode('artists_upd')

In [25]:
spotify_df[['artists_upd','id']]

Unnamed: 0,artists_upd,id
0,[Uli],35iwgR4jXetI318WEWsa1Q
1,[Fernando Pessoa],021ht4sdgPcrDgSk7JTbKY
2,[Ignacio Corsini],07A5yehtSnoedViJAZkNnc
3,[Ignacio Corsini],08FmqUhxtyLTn6pAh6bk45
4,[Dick Haymes],08y9GfoqCWfOGsKdwojr5e
...,...,...
586666,[Emilie Chin],1ZwZsVZUiyFwIHMNpI3ERt
586667,[阿YueYue],5rgu12WBIHQtvej2MdHSH0
586668,[ROLE MODEL],0NuWgxEp51CutD2pJoF4OM
586669,[FINNEAS],27Y1N4Q4U3EfDU5Ubw8ws2


In [26]:
artists_exploded

Unnamed: 0,artists_upd,id
0,Uli,35iwgR4jXetI318WEWsa1Q
1,Fernando Pessoa,021ht4sdgPcrDgSk7JTbKY
2,Ignacio Corsini,07A5yehtSnoedViJAZkNnc
3,Ignacio Corsini,08FmqUhxtyLTn6pAh6bk45
4,Dick Haymes,08y9GfoqCWfOGsKdwojr5e
...,...,...
586667,阿YueYue,5rgu12WBIHQtvej2MdHSH0
586668,ROLE MODEL,0NuWgxEp51CutD2pJoF4OM
586669,FINNEAS,27Y1N4Q4U3EfDU5Ubw8ws2
586670,Gentle Bones,45XJsGpFTyzbzeWK8VzR8S


In [27]:
artists_exploded_enriched = artists_exploded.merge(data_w_genre, how = 'left', left_on = 'artists_upd',right_on = 'name')
artists_exploded_enriched_nonnull = artists_exploded_enriched[~artists_exploded_enriched.genres_upd.isnull()]

In [28]:
artists_exploded_enriched

Unnamed: 0,artists_upd,id_x,id_y,followers,genres,name,popularity,genres_upd
0,Uli,35iwgR4jXetI318WEWsa1Q,45tIt06XoI0Iio4LBEVpls,91.0,[],Uli,4.0,[]
1,Uli,35iwgR4jXetI318WEWsa1Q,2rgWCHFdRPECZBp9s9gU93,28721.0,[],Uli,18.0,[]
2,Fernando Pessoa,021ht4sdgPcrDgSk7JTbKY,14jtPCOoNZwquk5wd9DxrY,3.0,[],Fernando Pessoa,0.0,[]
3,Fernando Pessoa,021ht4sdgPcrDgSk7JTbKY,13RMReWFUS7foJYInOfD6E,527.0,[],Fernando Pessoa,8.0,[]
4,Ignacio Corsini,07A5yehtSnoedViJAZkNnc,5LiOoJbxVSAMkBS2fUm3X2,3528.0,"['tango', 'vintage tango']",Ignacio Corsini,23.0,"[tango, vintage_tango]"
...,...,...,...,...,...,...,...,...
762761,阿YueYue,5rgu12WBIHQtvej2MdHSH0,1QLBXKM5GCpyQQSVMNZqrZ,896.0,['chinese viral pop'],阿YueYue,38.0,[chinese_viral_pop]
762762,ROLE MODEL,0NuWgxEp51CutD2pJoF4OM,1dy5WNgIKQU6ezkpZs4y8z,245944.0,"['alt z', 'alternative r&b', 'bedroom pop', 'i...",ROLE MODEL,67.0,"[alt_z, alternative_r&b, bedroom_pop, indie_ca..."
762763,FINNEAS,27Y1N4Q4U3EfDU5Ubw8ws2,37M5pPGs6V1fchFJSgCguX,1168213.0,"['alt z', 'electropop', 'indie pop', 'la indie...",FINNEAS,77.0,"[alt_z, electropop, indie_pop, la_indie, pop, ..."
762764,Gentle Bones,45XJsGpFTyzbzeWK8VzR8S,4jGPdu95icCKVF31CcFKbS,45309.0,"['chill r&b', 'indie cafe pop', 'singaporean p...",Gentle Bones,58.0,"[chill_r&b, indie_cafe_pop, singaporean_pop]"


In [29]:
artists_exploded_enriched_nonnull[artists_exploded_enriched_nonnull['id_x'] =='6KuQTIu1KoTTkLXKrwlLPV']

Unnamed: 0,artists_upd,id_x,id_y,followers,genres,name,popularity,genres_upd
6153,Robert Schumann,6KuQTIu1KoTTkLXKrwlLPV,2UqjDAXnDxejEyE0CzfUrZ,423826.0,"['classical', 'early romantic era', 'german ro...",Robert Schumann,64.0,"[classical, early_romantic_era, german_romanti..."
6154,Vladimir Horowitz,6KuQTIu1KoTTkLXKrwlLPV,4Ws5hSoABAwvGJ4LhHwHgq,92365.0,"['classical', 'classical performance', 'classi...",Vladimir Horowitz,54.0,"[classical, classical_performance, classical_p..."


In [30]:
artists_genres_consolidated = artists_exploded_enriched_nonnull.groupby('id_x')['genres_upd'].apply(list).reset_index()

In [31]:
artists_genres_consolidated['consolidates_genre_lists'] = artists_genres_consolidated['genres_upd'].apply(lambda x: list(set(list(itertools.chain.from_iterable(x)))))

In [32]:
artists_genres_consolidated.head()

Unnamed: 0,id_x,genres_upd,consolidates_genre_lists
0,0004Uy71ku11n3LMpuyf59,[[polish_rock]],[polish_rock]
1,000CSYu4rvd8cQ7JilfxhZ,"[[country_quebecois, rock_quebecois]]","[country_quebecois, rock_quebecois]"
2,000DsoWJKHdaUmhgcnpr8j,[[barnmusik]],[barnmusik]
3,000G1xMMuwxNHmwVsBdtj1,"[[candy_pop, new_wave, new_wave_pop, permanent...","[permanent_wave, new_wave_pop, new_wave, power..."
4,000KblXP5csWFFFsD6smOy,"[[chamame, folclore_salteno, folklore_argentino]]","[folclore_salteno, folklore_argentino, chamame]"


In [33]:
artists_genres_consolidated = artists_genres_consolidated.rename(columns={'id_x':'id'})

In [34]:
artists_genres_consolidated.head()

Unnamed: 0,id,genres_upd,consolidates_genre_lists
0,0004Uy71ku11n3LMpuyf59,[[polish_rock]],[polish_rock]
1,000CSYu4rvd8cQ7JilfxhZ,"[[country_quebecois, rock_quebecois]]","[country_quebecois, rock_quebecois]"
2,000DsoWJKHdaUmhgcnpr8j,[[barnmusik]],[barnmusik]
3,000G1xMMuwxNHmwVsBdtj1,"[[candy_pop, new_wave, new_wave_pop, permanent...","[permanent_wave, new_wave_pop, new_wave, power..."
4,000KblXP5csWFFFsD6smOy,"[[chamame, folclore_salteno, folklore_argentino]]","[folclore_salteno, folklore_argentino, chamame]"


In [35]:
spotify_df = spotify_df.merge(artists_genres_consolidated[['id','consolidates_genre_lists']], on = 'id',how = 'left')

In [36]:
spotify_df.tail()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists
523470,1ZwZsVZUiyFwIHMNpI3ERt,Skyscraper,4,106002,0,['Emilie Chin'],['4USdOnfLczwUglA3TrdHs2'],2020-02-08,0.626,0.53,...,0.856,0.104,0.215,120.113,4,[Emilie Chin],[],[Emilie Chin],Emilie ChinSkyscraper,
523471,5rgu12WBIHQtvej2MdHSH0,云与海,50,258267,0,['阿YueYue'],['1QLBXKM5GCpyQQSVMNZqrZ'],2020-09-26,0.56,0.518,...,0.0,0.0648,0.211,131.896,4,[阿YueYue],[],[阿YueYue],阿YueYue云与海,[chinese_viral_pop]
523472,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,['ROLE MODEL'],['1dy5WNgIKQU6ezkpZs4y8z'],2020-10-21,0.765,0.663,...,0.000297,0.0924,0.686,150.091,4,[ROLE MODEL],[],[ROLE MODEL],ROLE MODELblind,"[bedroom_pop, alt_z, indie_cafe_pop, pop, indi..."
523473,27Y1N4Q4U3EfDU5Ubw8ws2,What They'll Say About Us,70,187601,0,['FINNEAS'],['37M5pPGs6V1fchFJSgCguX'],2020-09-02,0.535,0.314,...,0.00015,0.0874,0.0663,145.095,4,[FINNEAS],[],[FINNEAS],FINNEASWhat They'll Say About Us,"[alt_z, post-teen_pop, pop, la_indie, electrop..."
523474,45XJsGpFTyzbzeWK8VzR8S,A Day At A Time,58,142003,0,"['Gentle Bones', 'Clara Benin']","['4jGPdu95icCKVF31CcFKbS', '5ebPSE9YI5aLeZ1Z2g...",2021-03-05,0.696,0.615,...,3e-06,0.305,0.438,90.029,4,"[Gentle Bones, Clara Benin]",[],"[Gentle Bones, Clara Benin]",Gentle BonesA Day At A Time,"[opm, indie_cafe_pop, singaporean_pop, chill_r..."


In [37]:
spotify_df['consolidates_genre_lists'] = spotify_df['consolidates_genre_lists'].apply(lambda x:"NaN" if x == [] else x)

In [38]:
spotify_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,...,0.744,0.151,0.127,104.851,3,[Uli],[],[Uli],UliCarve,
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,...,0.0,0.148,0.655,102.009,1,[Fernando Pessoa],[],[Fernando Pessoa],Fernando PessoaCapítulo 2.16 - Banquero Anarqu...,
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,...,0.0218,0.212,0.457,130.418,5,[Ignacio Corsini],[],[Ignacio Corsini],Ignacio CorsiniVivo para Quererte - Remasterizado,"[tango, vintage_tango]"
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,...,0.918,0.104,0.397,169.98,3,[Ignacio Corsini],[],[Ignacio Corsini],Ignacio CorsiniEl Prisionero - Remasterizado,"[tango, vintage_tango]"
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,...,0.13,0.311,0.196,103.22,4,[Dick Haymes],[],[Dick Haymes],Dick HaymesLady of the Evening,"[easy_listening, adult_standards, big_band, lo..."


In [39]:
spotify_df['year'] = spotify_df['release_date'].apply(lambda x: x.split('-')[0])

In [40]:
float_cols = spotify_df.dtypes[spotify_df.dtypes == 'float64'].index.values

In [41]:
spotify_df['popularity'].describe()

count    523475.000000
mean         27.518993
std          18.306436
min           0.000000
25%          13.000000
50%          27.000000
75%          40.000000
max         100.000000
Name: popularity, dtype: float64

In [42]:
# create 5 point buckets for popularity
spotify_df['popularity_red'] = spotify_df['popularity'].apply(lambda x: int(x/5))

In [43]:
# tfidf can't handle nulls so fill any null values with an empty list
spotify_df['consolidates_genre_lists'] = spotify_df['consolidates_genre_lists'].apply(lambda d: d if isinstance(d, list) else [])

In [44]:
spotify_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,valence,tempo,time_signature,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists,year,popularity_red
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,...,0.127,104.851,3,[Uli],[],[Uli],UliCarve,[],1922,1
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,...,0.655,102.009,1,[Fernando Pessoa],[],[Fernando Pessoa],Fernando PessoaCapítulo 2.16 - Banquero Anarqu...,[],1922,0
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,...,0.457,130.418,5,[Ignacio Corsini],[],[Ignacio Corsini],Ignacio CorsiniVivo para Quererte - Remasterizado,"[tango, vintage_tango]",1922,0
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,...,0.397,169.98,3,[Ignacio Corsini],[],[Ignacio Corsini],Ignacio CorsiniEl Prisionero - Remasterizado,"[tango, vintage_tango]",1922,0
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,...,0.196,103.22,4,[Dick Haymes],[],[Dick Haymes],Dick HaymesLady of the Evening,"[easy_listening, adult_standards, big_band, lo...",1922,0


In [45]:
period = [str(i) for i in range(2000,2022)]
draft = spotify_df[spotify_df['year'].isin(period)]

In [46]:
draft.shape

(186524, 27)

In [47]:
draft.to_csv('draft.csv')

In [62]:
#simple function to create OHE features
#this gets passed later on
def ohe_prep(df, column, new_name):
    """
    Create One Hot Encoded features of a specific column

    Parameters:
        df (pandas dataframe): Spotify Dataframe
        column (str): Column to be processed
        new_name (str): new column name to be used

    Returns:
        tf_df: One hot encoded features
    """

    tf_df = pd.get_dummies(df[column])
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)
    return tf_df

In [78]:
#function to build entire feature set
def create_feature_set(df, float_cols):
    """
    Process spotify df to create a final set of features that will be used to generate recommendations

    Parameters:
        df (pandas dataframe): Spotify Dataframe
        float_cols (list(str)): List of float columns that will be scaled

    Returns:
        final: final set of features
    """

    #tfidf genre lists
    tfidf = TfidfVectorizer(stop_words='english',min_df=0.005)
    tfidf_matrix =  tfidf.fit_transform(df['consolidates_genre_lists'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names()]
    genre_df.reset_index(drop = True, inplace=True)

    #explicity_ohe = ohe_prep(df, 'explicit','exp')
    year_ohe = ohe_prep(df, 'year','year') * 0.5
    popularity_ohe = ohe_prep(df, 'popularity_red','pop') * 0.15

    #scale float columns
    floats = df[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) * 0.2

    #concanenate all features
    final = pd.concat([genre_df, floats_scaled, popularity_ohe, year_ohe], axis = 1)

    #add song id
    final['id']=df['id'].values

    return final

In [79]:
complete_feature_set = create_feature_set(draft, float_cols=float_cols)#.mean(axis = 0)

In [80]:
complete_feature_set.head()

Unnamed: 0,genre|adult_standards,genre|album_rock,genre|alternative_metal,genre|alternative_rock,genre|anime,genre|argentine_rock,genre|brazilian_rock,genre|bulgarian_pop,genre|cantautor,genre|cantopop,...,year|2013,year|2014,year|2015,year|2016,year|2017,year|2018,year|2019,year|2020,year|2021,id
0,0.619261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6catF1lDhNTjjGa2GxRQNN
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,6Pkt6qVikqPBt9bEQy8iTz
2,0.619261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4aSw1QJIMwYSoDEgzgdCJL
3,0.619261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0ZMMtH875IR2TfkyC4PolD
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,1hx7X9cMXHWJjknb9O6Ava


In [81]:
complete_feature_set.shape

(186524, 180)

In [82]:
complete_feature_set.to_csv("complete_feature_set.csv")