# Data Collection

This is the 1st part of the analysis that looks into the network of genre tags in spotify. In this part, I use the Spotify API to download the data and shape it as we need for the analysis

In [1]:
import sys
import spotipy
import spotipy.util as util
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials
import itertools
client_credentials_manager = SpotifyClientCredentials('40cd9cd27c7c4689bc36774f5aac188b','10d3a4b01aea4976ac89c831db901a6d')
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, requests_timeout=20)

We need an unbiased and large enough sample of all artists in spotify. Spotify's own account on spotify featuring playlists made my Spotify's staff and algorithms is a good place to start, as they have the most number of public playlists on the platform covering a variety of genres and eras. So first, let's print collect all playlists created by Spotify's team. There's a lot of genre, mood and era specific plalists, along with "This is:" playlists that focus on one particular artists big hits

In [2]:
playlists = sp.user_playlists('spotify')
playlist_ids = []
while playlists:
    for i, playlist in enumerate(playlists['items']):
        playlist_ids.append(playlist['id'])
    if playlists['next']:
        playlists = sp.next(playlists)
    else:
        playlists = None

The next step is to get the data on all the tracks in each of these plalists and collect data on the artists of these tracks as well. we drop all duplicate artists and convert the data into a dataframe with approx. 37000 artists

In [3]:
trackslist = [] 
for j,i in enumerate(playlist_ids):
#     if j%50 == 0:
#         print(j)
    tracks = sp.playlist_tracks(i, limit = 100)
    trackslist.append(tracks['items'])
    while tracks['next']:
        tracks = sp.next(tracks)
        trackslist.append(tracks['items'])

In [4]:
tracks = [j['track'] for i in trackslist for j in i]
artistids = [j['id'] for i in tracks if i!=None for j in i['artists']]
artistids = pd.Series(artistids).drop_duplicates()
artistids = artistids.dropna()
artists = sp.artists(artistids[:50])['artists']
for i in range(50,len(artistids),50):
    artists.append(sp.artists(artistids[i:(i+50)])['artists'])
artists = artists[:50] + [j for i in artists[50:] for j in i]

In [5]:
for j in range(len(artists)):
        artists[j]['followers'] = artists[j]['followers']['total']
artists = pd.DataFrame.from_dict(artists)
artists = artists.drop_duplicates(subset = ['uri'])
len(artists)

36586

To make sure we have even more artists in our sample set, I use the artist_related_artists() function, which returns up to 20 related artists for every artist I pass as input. I also shape the data for a different network in the meantime, where, if 2 artists show up as related, we build an edge between them, this is for a network of artists rather than a network of genres. Now I have over 8-k unique artists in my database with over 4000 unique genre tags. 

In [8]:
related_artists = []
for j, i in enumerate(artistids):
    related_artists.append(sp.artist_related_artists(i))
related_artists = [i for i in related_artists if i!= None]
related_artists = [i['artists'] for i in related_artists]
len(related_artists)

36586

In [9]:
relart = related_artists
for i in range(len(relart)):
    if relart[i]!=[]:
        for j in range(len(relart[i])):
            relart[i][j].update(source_id = artistids.iloc[i])
relart = [j for i in relart for j in i]
for i in range(len(relart)):
    relart[i]['followers'] = relart[i]['followers']['total']
relart = pd.DataFrame.from_dict(relart)

In [10]:
all_artist_data = pd.merge(artists, relart, left_on = 'id', right_on = 'source_id',suffixes=('_source', '_related'))
all_artist_data = all_artist_data.drop_duplicates(subset = ['source_id','id_related'])
all_artist_data = all_artist_data[['followers_source','genres_source','id_source','name_source','popularity_source',
                                  'uri_source','followers_related', 'genres_related','id_related', 'name_related',
                                   'popularity_related','uri_related']]
all_artist_data.to_csv('all_artist_data_final.csv')

In [11]:
artists = artists[[ 'followers', 'genres','id','name','popularity','uri']]
artists.to_csv('artist_data_final.csv')
all_artist_data.head()

Unnamed: 0,followers_source,genres_source,id_source,name_source,popularity_source,uri_source,followers_related,genres_related,id_related,name_related,popularity_related,uri_related
0,2252439,"[afrobeats, nigerian pop]",46pWGuE3dSwY3bMMXGBvVS,Rema,82,spotify:artist:46pWGuE3dSwY3bMMXGBvVS,1247387,[afrobeats],3ZpEKRjHaHANcpk10u6Ntq,Ayra Starr,76,spotify:artist:3ZpEKRjHaHANcpk10u6Ntq
1,2252439,"[afrobeats, nigerian pop]",46pWGuE3dSwY3bMMXGBvVS,Rema,82,spotify:artist:46pWGuE3dSwY3bMMXGBvVS,1189961,"[afrobeats, nigerian pop]",0a1SidMjD8D6EHvJph4n2H,Ruger,65,spotify:artist:0a1SidMjD8D6EHvJph4n2H
2,2252439,"[afrobeats, nigerian pop]",46pWGuE3dSwY3bMMXGBvVS,Rema,82,spotify:artist:46pWGuE3dSwY3bMMXGBvVS,544095,"[afrobeats, afropop, nigerian hip hop, nigeria...",1fYVmAFB7sC7eDoF3mJXla,Wande Coal,60,spotify:artist:1fYVmAFB7sC7eDoF3mJXla
3,2252439,"[afrobeats, nigerian pop]",46pWGuE3dSwY3bMMXGBvVS,Rema,82,spotify:artist:46pWGuE3dSwY3bMMXGBvVS,1325382,"[afrobeats, afropop, azonto, nigerian hip hop,...",4ovtyvs7j1jSmwhkBGHqSr,Olamide,64,spotify:artist:4ovtyvs7j1jSmwhkBGHqSr
4,2252439,"[afrobeats, nigerian pop]",46pWGuE3dSwY3bMMXGBvVS,Rema,82,spotify:artist:46pWGuE3dSwY3bMMXGBvVS,417985,"[afrobeats, afropop, nigerian pop]",2NtQA3PY9chI8l65ejZLTP,SPINALL,59,spotify:artist:2NtQA3PY9chI8l65ejZLTP


In [12]:
relart  = relart[[ 'followers', 'genres','id','name','popularity','uri']]
allart = relart.append(artists)
allart = allart.drop_duplicates(subset = ['id'])


print(allart.head())
allart.columns = ['followers', 'genres','spotify id','Label','popularity','uri']
strgnrs = ['_'.join(i) for i in allart['genres']]
allart['strgnrs'] = strgnrs

  allart = relart.append(artists)


   followers                                             genres  \
0    1247387                                        [afrobeats]   
1    1189961                          [afrobeats, nigerian pop]   
2     544095  [afrobeats, afropop, nigerian hip hop, nigeria...   
3    1325382  [afrobeats, afropop, azonto, nigerian hip hop,...   
4     417985                 [afrobeats, afropop, nigerian pop]   

                       id        name  popularity  \
0  3ZpEKRjHaHANcpk10u6Ntq  Ayra Starr          76   
1  0a1SidMjD8D6EHvJph4n2H       Ruger          65   
2  1fYVmAFB7sC7eDoF3mJXla  Wande Coal          60   
3  4ovtyvs7j1jSmwhkBGHqSr     Olamide          64   
4  2NtQA3PY9chI8l65ejZLTP     SPINALL          59   

                                     uri  
0  spotify:artist:3ZpEKRjHaHANcpk10u6Ntq  
1  spotify:artist:0a1SidMjD8D6EHvJph4n2H  
2  spotify:artist:1fYVmAFB7sC7eDoF3mJXla  
3  spotify:artist:4ovtyvs7j1jSmwhkBGHqSr  
4  spotify:artist:2NtQA3PY9chI8l65ejZLTP  


In [13]:
allart.to_csv('artists_nodes_gephi.csv', index = False)

In [14]:
all_artist_data.columns = ['followers_source','genres_source','id_source','Source','popularity_source',
                                  'uri_source','followers_related', 'genres_related','id_related', 'Target',
                                   'popularity_related','uri_related']
all_artist_data.to_csv('artists_edges_gephi.csv', index = False)

Since I now have more data on artists (including related artists) than just the genres, I also make subsets of the artists data that fall under a few example genres. Any artist that was tagged with 'funk' would be included in this dataset, I will use gephi to understand communities of related artists this way. This isn't particularly scientific, but as a music nerd, it's pretty fun to see! the nodes are sized according to the number of spotify followers they have. The node colors are according to a community detection algorithm implemented by gephi. 



### Funk
![image.png](funk.png)

In [15]:
directed_artist = all_artist_data.reset_index(drop = True)
source_mask = directed_artist['genres_source'].apply(lambda x: 'funk' in x)
funk = directed_artist.loc[source_mask]
funk.to_csv('funk_source.csv',index = False)
directed_artist = allart.reset_index(drop = True)
source_mask = directed_artist['genres'].apply(lambda x: 'funk' in x)
funknodes = directed_artist.loc[source_mask]
funknodes['Id'] = funknodes['Label'].copy()
funknodes.to_csv('funk_source_nodes.csv',index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  funknodes['Id'] = funknodes['Label'].copy()


### Psych Rock
![image.png](psychrock.png)

In [16]:
directed_artist = all_artist_data.reset_index(drop = True)
source_mask = directed_artist['genres_source'].apply(lambda x: 'psychedelic rock' in x)
psychrock = directed_artist.loc[source_mask]
psychrock.to_csv('psychrock_source.csv',index = False)
directed_artist = allart.reset_index(drop = True)
source_mask = directed_artist['genres'].apply(lambda x: 'psychedelic rock' in x)
psychrocknodes = directed_artist.loc[source_mask]
psychrocknodes['Id'] = psychrocknodes['Label'].copy()
psychrocknodes.to_csv('psychrocknodes_source_nodes.csv',index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  psychrocknodes['Id'] = psychrocknodes['Label'].copy()


### Classic Rock

![image.png](classicrock.png)

In [17]:
directed_artist = all_artist_data.reset_index(drop = True)
source_mask = directed_artist['genres_source'].apply(lambda x: 'classic rock' in x)
classicrock = directed_artist.loc[source_mask]
classicrock.to_csv('classicrock_source.csv',index = False)
directed_artist = allart.reset_index(drop = True)
source_mask = directed_artist['genres'].apply(lambda x: 'classic rock' in x)
classicrocknodes = directed_artist.loc[source_mask]
classicrocknodes['Id'] = classicrocknodes['Label'].copy()
classicrocknodes.to_csv('classicrock_source_nodes.csv',index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classicrocknodes['Id'] = classicrocknodes['Label'].copy()


Finally, I seperate the genre tags and create a adjacency matrix that counts how many times each combination of 2 genres has occured in the netwrok, it is a pretty matrix as the density of network is pretty low. Part 2 contains the analysis of the larger Genres network along with important metrics and comparing community detection algorithms

In [18]:
gnrs = [i for i in allart['genres'] if i!= []]
c = [list(itertools.combinations(i,2)) for i in gnrs]
a = list(itertools.chain.from_iterable((i, i[::-1]) for c_ in c for i in c_))
dfa = pd.DataFrame(a)
dfgenrs = pd.pivot_table(dfa, index=0, columns=1, aggfunc='size', fill_value=0)
print(a[0])

('afrobeats', 'nigerian pop')


In [19]:
dfa.to_csv('network_nodes_final.csv',index = False)
dfgenrs.to_csv('network_matrix_final.csv', index = False)