## INST414 | Assignment 2 - Module 2 | Daniel Hernandez Gonzalez

In [42]:
#Network x tools.
import networkx as nx

# Tool to make json responses into viewable tables
import pandas as pd

#spotify tools.
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Tools for making calculations
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Authenticating with Spotify API

In [17]:
# Authentication keys, taken out from my final push.
client_id = '****'
client_secret = '****'
username = '****'

# set up Spotify API credentials
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Gathering data

In [28]:
# First set of 50 songs, search query and parameters
query = 'year:2022'
limit = 50
offset = 0

# Spotify API call
results = sp.search(q=query, type='track', limit=limit, offset=offset)

# Second set of 50 songs, search query and parameters
query2 = 'year:2022'
limit = 50
offset = 50

# Spotify API call
results2 = sp.search(q=query, type='track', limit=limit, offset=offset)

In [29]:
# Combine the two sets of results into a single dictionary
all_results = {'tracks': {'items': []}}
all_results['tracks']['items'].extend(results['tracks']['items'])
all_results['tracks']['items'].extend(results2['tracks']['items'])

In [30]:
# Creating an empty DataFrame with columns.
song_info_df = pd.DataFrame(columns=['track_id', 'track_name', 'artist_name', 'tempo', 'popularity'])

# Iterating over the tracks to add data to song_info_df
for track in all_results['tracks']['items']:
    track_id = track['id']
    artist_names = [artist['name'] for artist in track['album']['artists']]
    artist_name = ', '.join(artist_names)
    track_name = track['name']
    tempo = None
    popularity = track['popularity']
    song_info_df.loc[len(song_info_df)] = [track_id, track_name, artist_name, tempo, popularity]
    
# Print out table
song_info_df


Unnamed: 0,track_id,track_name,artist_name,tempo,popularity
0,2dHHgzDwk4BJdRwy9uXhTO,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,,97
1,1Qrg8KqiBpW07V7PNxwwwL,Kill Bill,SZA,,95
2,4FyesJzVpA39hbYvcseO2d,Just Wanna Rock,Lil Uzi Vert,,91
3,0vjeOZ3Ft5jvAi9SBFJm1j,Superhero (Heroes & Villains) [with Future & C...,Metro Boomin,,91
4,1bDbXMyjaUIooNwFE9wn0N,Rich Flex,"Drake, 21 Savage",,91
...,...,...,...,...,...
95,2tTmW7RDtMQtBk7m2rYeSw,"Quevedo: Bzrp Music Sessions, Vol. 52","Bizarrap, Quevedo",,93
96,37F7E7BKEw2E4O2L7u0IEp,Limbo,Freddie Dredd,,85
97,0lEjxUUlKqjqXrVlIHFduD,P power (feat. Drake),Gunna,,80
98,0IKeDy5bT9G0bA7ZixRT4A,Bebe Dame,Fuerza Regida,,85


### Tempo needs an alternative call to Spotify for song audio data

In [31]:
# Add tempo column to dataframe
def get_tempo(track_id):
    features = sp.audio_features(track_id)

    # This is to avoid some Traceback errors I was getting
    if features is not None and len(features) > 0 and features[0] is not None:
        return features[0]['tempo']
    else:
        return None

song_info_df['tempo'] = song_info_df['track_id'].apply(get_tempo)
song_info_df

Unnamed: 0,track_id,track_name,artist_name,tempo,popularity
0,2dHHgzDwk4BJdRwy9uXhTO,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,97.950,97
1,1Qrg8KqiBpW07V7PNxwwwL,Kill Bill,SZA,88.980,95
2,4FyesJzVpA39hbYvcseO2d,Just Wanna Rock,Lil Uzi Vert,150.187,91
3,0vjeOZ3Ft5jvAi9SBFJm1j,Superhero (Heroes & Villains) [with Future & C...,Metro Boomin,116.622,91
4,1bDbXMyjaUIooNwFE9wn0N,Rich Flex,"Drake, 21 Savage",153.150,91
...,...,...,...,...,...
95,2tTmW7RDtMQtBk7m2rYeSw,"Quevedo: Bzrp Music Sessions, Vol. 52","Bizarrap, Quevedo",128.033,93
96,37F7E7BKEw2E4O2L7u0IEp,Limbo,Freddie Dredd,74.987,85
97,0lEjxUUlKqjqXrVlIHFduD,P power (feat. Drake),Gunna,119.251,80
98,0IKeDy5bT9G0bA7ZixRT4A,Bebe Dame,Fuerza Regida,157.643,85


In [32]:
# Drops a row missing a value, I was getting errors later on about NaN or null values.
print(f"{song_info_df.shape}\n")

print(song_info_df.isna().sum())
song_info_df.dropna(inplace=True)

print(f"\n{song_info_df.shape}")

(100, 5)

track_id       0
track_name     0
artist_name    0
tempo          1
popularity     0
dtype: int64

(99, 5)


# Creating network with NetworkX

In [33]:
# Create an empty graph
g = nx.Graph()

# Adding tracks to the graph as nodes.
for _, row in song_info_df.iterrows():
    g.add_node(row['track_name'])

# Iterating over the tracks in song_info_df to add edges
for i, row1 in song_info_df.iterrows():         #list1
    for i, row2 in song_info_df.iterrows():     #list2
        if row1['track_name'] != row2['track_name']:
            if row1['artist_name'] == row2['artist_name'] or abs(row1['tempo'] - row2['tempo']) <= 5:
                g.add_edge(row1['track_name'], row2['track_name'])
            else:
                # Checking if additional artists are the same. This is in hopes of making edges more meaningful.
                if 'featuring' in row1['track_name'] and 'featuring' in row2['track_name']:
                    artists1 = [a.strip() for a in row1['track_name'].split('featuring')[1].split(',')]
                    artists2 = [a.strip() for a in row2['track_name'].split('featuring')[1].split(',')]
                    if set(artists1) & set(artists2):
                        g.add_edge(row1['track_name'], row2['track_name'])

print(f"Number of nodes in graph: {g.number_of_nodes()}")
print(f"Number of edges in graph: {g.number_of_edges()}")

Number of nodes in graph: 94
Number of edges in graph: 527


In [34]:
# Export graph for use in Gephi
nx.write_gexf(g, "top_100_songs_2022.gexf")


# Similarity calculations

### The question I am trying to answer: Is it possible that tempo influences popularity?

In [43]:
# Finding the highest and lowest tempo songs
highest_tempo_track = song_info_df.loc[song_info_df['tempo'].idxmax()]
lowest_tempo_track = song_info_df.loc[song_info_df['tempo'].idxmin()]

# Create feature df for cosine similarity math
features = song_info_df[['tempo', 'popularity']]

# Standardizing the features to have a accurate baseline to compare songs to each other.
features_std = (features - features.mean()) / features.std()

# Calculating cosine similarity of highest tempo song with other songs
highest_tempo_track_index = song_info_df.index[song_info_df['track_id'] == highest_tempo_track['track_id']][0]
cos_sim_highest_tempo = cosine_similarity(features_std.iloc[highest_tempo_track_index, :].values.reshape(1, -1), features_std)

# Sorting cosine similarity df by row and getting the top 5 most similar tracks
top_5_similar_highest_tempo = song_info_df.loc[np.argsort(-cos_sim_highest_tempo[0])[:6], ['track_name', 'artist_name', 'tempo', 'popularity']]
top_5_similar_highest_tempo['similarity'] = cos_sim_highest_tempo[0][np.argsort(-cos_sim_highest_tempo[0])[:6]]

# Calculating the cosine similarity of lowest tempo song with other songs
lowest_tempo_track_index = song_info_df.index[song_info_df['track_id'] == lowest_tempo_track['track_id']][0]
cos_sim_lowest_tempo = cosine_similarity(features_std.iloc[lowest_tempo_track_index, :].values.reshape(1, -1), features_std)

# Sorting cosine similarity df by row and getting the top 5 most similar tracks
top_5_similar_lowest_tempo = song_info_df.loc[np.argsort(-cos_sim_lowest_tempo[0])[:6], ['track_name', 'artist_name', 'tempo', 'popularity']]
top_5_similar_lowest_tempo['similarity'] = cos_sim_lowest_tempo[0][np.argsort(-cos_sim_lowest_tempo[0])[:6]]

In [44]:
print("Top 5 songs most similar to the song with the highest tempo:")
top_5_similar_highest_tempo

Top 5 songs most similar to the song with the highest tempo:


Unnamed: 0,track_name,artist_name,tempo,popularity,similarity
6,Something in the Orange,Zach Bryan,175.212,88,1.0
54,No Se Va - En Vivo,Grupo Frontera,172.712,88,0.999992
48,Bebe Dame,"Fuerza Regida, Grupo Frontera",157.437,88,0.999174
19,Low,SZA,145.044,88,0.994001
52,Igualito a Mi Apá,Fuerza Regida,144.908,88,0.993868
93,snowfall,"Øneheart, reidenshi",95.014,88,0.99344


In [45]:
print("\nTop 5 songs most similar to the song with the lowest tempo:")
top_5_similar_lowest_tempo


Top 5 songs most similar to the song with the lowest tempo:


Unnamed: 0,track_name,artist_name,tempo,popularity,similarity
76,nuts,Lil Peep,62.007,83,1.0
97,P power (feat. Drake),Gunna,119.251,80,0.997777
44,Bad Habit,Steve Lacy,168.946,85,0.987882
89,Thank God,Kane Brown,99.945,79,0.987402
74,Whiskey On You,Nate Smith,110.024,80,0.959054
59,Midnight Rain,Taylor Swift,139.865,87,0.947241
