## INST414 | Assignment 3 - Module 3 | Daniel Hernandez Gonzalez

In [62]:
# Pandas, we all love them. 
import pandas as pd

#spotify tools.
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Tools for making calculations
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Authenticating with Spotify API

In [63]:
# Authentication keys, taken out from my final push.
client_id = '******'
client_secret = '******'
username = '*****'

# set up Spotify API credentials
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Gathering data

In [90]:
# It is more efficient to make a function to do this set of instructions instead of writing it more than once. 
def get_songs(genre):
    # First set of 50 songs, search query and parameters
    query = f'genre:{genre}, year:2022'
    limit = 50
    offset = 0

    # Spotify API call for first 50 songs
    results1 = sp.search(q=query, type='track', limit=limit, offset=offset)

    # Second set of 50 songs, new offset
    offset = 50

    # Second API request
    results2 = sp.search(q=query, type='track', limit=limit, offset=offset)

    # third set of 50 songs, new offset
    offset = 100

    # Third API call
    results3 = sp.search(q=query, type='track', limit=limit, offset=offset)
    
    # Merging results and creating list of top 150 songs
    top_songs = results1['tracks']['items'] + results2['tracks']['items'] + results3['tracks']['items']
    
    return top_songs


In [91]:
# Gathering top 150 songs from 2022 for the following genres. 
# I chose specific genres based on the genres shown on Spotify's platform and a few that I am interested in. Genre page: https://open.spotify.com/genre/hub-browse-grid

genre_list = ['rap', 'pop', 'house', 'country', 'rock', 'soul', 'inde', 'punk', 'corrido', 'banda']

# Creating an empty DataFrame with columns.
songs_df = pd.DataFrame(columns=['track_id', 'track_name', 'artist_name', 'genre', 'tempo', 'popularity', 'acousticness', 'danceability', 'energy', 'loudness'])


In [92]:
# Iterating over the genres and tracks to add data into songs_df
for genre in genre_list:
    genre_songs = get_songs(genre)
    for track in genre_songs:
        track_id = track['id']
        artist_names = [artist['name'] for artist in track['album']['artists']]
        artist_name = ', '.join(artist_names)
        track_name = track['name']
        popularity = track['popularity']
        genre = genre

        # Spotify API call to get additional audio features
        audio_features = sp.audio_features(track_id)
        if audio_features:
            acousticness = audio_features[0]['acousticness']
            danceability = audio_features[0]['danceability']
            energy = audio_features[0]['energy']
            tempo = audio_features[0]['tempo']
            loudness = audio_features[0]['loudness']
        else:
            # Incase any missing values. Will drop after.
            acousticness, danceability, energy, tempo = None, None, None, None

        # adding all data to pulled into songs_df
        songs_df.loc[len(songs_df)] = [track_id, track_name, artist_name, genre, tempo, popularity, acousticness, danceability, energy, loudness]


In [93]:
# Dataframe size before cleaning
print(f'table size before cleaning: {songs_df.shape}') 

# Dropping duplicate track names if they have the same artist name
songs_df = songs_df.drop_duplicates(subset=['track_name', 'artist_name'])

# Dataframe size after cleaning and displaying table
print(f'\ntable size after cleaning: {songs_df.shape}\n') 
songs_df

table size before cleaning: (1500, 10)

table size after cleaning: (1323, 10)



Unnamed: 0,track_id,track_name,artist_name,genre,tempo,popularity,acousticness,danceability,energy,loudness
0,4FyesJzVpA39hbYvcseO2d,Just Wanna Rock,Lil Uzi Vert,rap,150.187,91,0.0652,0.486,0.545,-7.924
1,1Qrg8KqiBpW07V7PNxwwwL,Kill Bill,SZA,rap,88.980,95,0.0521,0.644,0.735,-5.747
2,2dHHgzDwk4BJdRwy9uXhTO,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,rap,97.950,96,0.4170,0.715,0.620,-6.005
3,1bDbXMyjaUIooNwFE9wn0N,Rich Flex,"Drake, 21 Savage",rap,153.150,91,0.0503,0.561,0.520,-9.342
4,0vjeOZ3Ft5jvAi9SBFJm1j,Superhero (Heroes & Villains) [with Future & C...,Metro Boomin,rap,116.622,90,0.1520,0.526,0.606,-5.300
...,...,...,...,...,...,...,...,...,...,...
1495,03KHME9Npqhqzm8ivhghLX,Nomás Imagina,Banda MS de Sergio Lizárraga,banda,118.231,51,0.4640,0.604,0.623,-6.495
1496,1uNo4WxrVKwGnSzyexALea,Hoy Te Pierdo,Alfredo Olivas,banda,179.975,60,0.0564,0.573,0.707,-6.748
1497,1dtpnvjay5bEhEuui09YWx,Cómo Me Haces Falta,"Luis Angel ""El Flaco""",banda,108.001,54,0.5000,0.924,0.505,-4.342
1498,58HNUZW1ffosnXPPHDwDRz,Abrigame - En Vivo,Carin Leon,banda,114.359,52,0.0523,0.666,0.700,-3.929


In [94]:
# Drops a row if it is missing a value.


print(f"{songs_df.shape}\n")

print(songs_df.isna().sum())
#songs_df.dropna(inplace=True)

#print(f"\n{songs_df.shape}")

# I was getting errors in pervious versions of my code. I kept this incase I wanted to come back and test different genres. 

(1323, 10)

track_id        0
track_name      0
artist_name     0
genre           0
tempo           0
popularity      0
acousticness    0
danceability    0
energy          0
loudness        0
dtype: int64


In [96]:
# pulling the top songs from each genre and sorting by popularity for better view.
top_songs_by_genre = songs_df.sort_values('popularity', ascending=False).groupby('genre').head(1)
top_songs_by_genre

Unnamed: 0,track_id,track_name,artist_name,genre,tempo,popularity,acousticness,danceability,energy,loudness
2,2dHHgzDwk4BJdRwy9uXhTO,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,rap,97.95,96,0.417,0.715,0.62,-6.005
153,0WtM2NBVQNNJLh6scP13H8,Calm Down (with Selena Gomez),"Rema, Selena Gomez",pop,106.999,96,0.382,0.801,0.806,-5.206
1200,1lRtH4FszTrwwlK5gTSbXO,AMG,"Natanael Cano, Peso Pluma, Gabito Ballesteros",corrido,136.175,95,0.152,0.772,0.73,-6.657
308,4zN21mbAuaD0WqtmaTZZeP,Ferrari,"James Hype, Miggy Dela Rosa",house,125.004,86,0.0127,0.847,0.69,-7.877
1351,5kpxVMDvYGRmNqawPqDTYR,JGL,"La Adictiva, Luis R Conriquez",banda,112.852,84,0.55,0.702,0.586,-5.317
612,3CIyK1V4JEJkg02E4EJnDl,Enemy (with JID) - from the series Arcane Leag...,Imagine Dragons,rock,77.011,83,0.237,0.728,0.783,-4.424
452,48UKTR66uUOT9LaUvooTNx,The Kind of Love We Make,Luke Combs,country,102.025,82,0.0226,0.608,0.745,-4.13
760,48CKzGWOC65n8a8kdANJV8,Only Love Can Hurt Like This - Slowed Down Ver...,"Paloma Faith, sped up + slowed",soul,82.012,74,0.102,0.554,0.854,-6.263
1054,2ah5gOCogw00A62XBoepmc,Miracle,A Day To Remember,punk,160.028,68,0.000266,0.463,0.917,-3.776
900,21rpBPAlXAvBavvu5kCQzs,After Midnight,Phoenix,inde,100.01,61,0.00245,0.618,0.811,-6.654


#### As a disclaimer, I am aware that this is alphabetically the top song per genre based on sorting by popularity. 

# Distance calculations

### My goal: 
    - Grab the top songs from each genre. 
    - Calculate Euclidean distance between songs.
    - print top 10 closest songs

In [97]:
def closest_tracks(track_id, songs_df):
    # Extracting audio features for the provided track
    audio_features = songs_df.loc[songs_df['track_id'] == track_id, ['tempo', 'acousticness', 'danceability', 'energy', 'loudness']]

    # Calculating Euclidean distance between the provided track and all other tracks
    songs_df.loc[:,'distance'] = np.sqrt((songs_df['tempo'] - audio_features['tempo'].iloc[0])**2 +
                                    (songs_df['danceability'] - audio_features['danceability'].iloc[0])**2 +
                                    (songs_df['acousticness'] - audio_features['acousticness'].iloc[0])**2 +
                                    (songs_df['energy'] - audio_features['energy'].iloc[0])**2 + 
                                    (songs_df['loudness'] - audio_features['loudness'].iloc[0])**2)

    # Sorting the DataFrame by distance in ascending order
    songs_df = songs_df.sort_values('distance')

    # Adding the input track back to the top of the DataFrame
    input_track = songs_df.loc[songs_df['track_id'] == track_id]

    # Adding closest tracks to new dataframe
    closest_tracks = pd.concat([input_track, songs_df.loc[((songs_df['track_name'] != input_track['track_name'].iloc[0]) |
                                        (songs_df['artist_name'] != input_track['artist_name'].iloc[0])) &
                                        (songs_df['track_id'] != input_track['track_id'].iloc[0])].head(10)])

    # Resetting the index and dropping old index to avoid too many numbers. 
    closest_tracks = closest_tracks.reset_index(drop=True)

    return closest_tracks

### What are the closest songs in comparison to the top rap song?

In [98]:
rap_id = '2dHHgzDwk4BJdRwy9uXhTO' # "'Creepin' (with The Weeknd & 21 Savage)"
closest_rap = closest_tracks(rap_id, songs_df)
closest_rap

Unnamed: 0,track_id,track_name,artist_name,genre,tempo,popularity,acousticness,danceability,energy,loudness,distance
0,2dHHgzDwk4BJdRwy9uXhTO,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,rap,97.95,96,0.417,0.715,0.62,-6.005,0.0
1,4ZUpTMKPYFTfJqamWFU9Zv,IDGAF (with blackbear),BoyWithUke,rap,97.975,76,0.427,0.782,0.728,-5.93,0.15001
2,4uk8GguFehn7djGvFJTUUT,Enloquecido,Grupo Arriesgado,corrido,98.053,71,0.25,0.784,0.734,-6.007,0.23719
3,5Ei8EK2uXNkqqVSe2QYsFp,Que Te Vaya Bien,Julión Álvarez y su Norteño Banda,banda,97.99,76,0.237,0.667,0.467,-5.906,0.263655
4,3bvJftZKZe5QKz433NczyV,No Es Por Acá,Carin Leon,corrido,97.568,82,0.456,0.746,0.37,-6.309,0.550747
5,6Awl7JBDQjfm7xkS3LkhDi,We Got History,Mitchell Tenpenny,country,98.025,74,0.0014,0.628,0.833,-6.327,0.578766
6,1VMBapbhHidf6ALFszT7w1,Lonely,Imagine Dragons,rock,97.983,73,0.0256,0.714,0.73,-5.564,0.60072
7,3G1nE9ZEv4eGEM48IAIv8u,Millennium,Toro y Moi,soul,98.055,51,0.325,0.687,0.691,-6.743,0.754956
8,1zsPaEkglFvxjAhrM8yhpr,Hey Mor,Ozuna,pop,98.002,92,0.00302,0.901,0.589,-6.713,0.843151
9,6Rpbzk2LC8xWZI93RyodAq,Hey Mor,"Ozuna, Feid",pop,98.002,84,0.00302,0.901,0.589,-6.713,0.843151


### What are the closest songs in comparison to the top corrido song?

In [99]:
corrido_id = '1lRtH4FszTrwwlK5gTSbXO' # "AMG"
closest_corrido = closest_tracks(corrido_id, songs_df)
closest_corrido

Unnamed: 0,track_id,track_name,artist_name,genre,tempo,popularity,acousticness,danceability,energy,loudness,distance
0,1lRtH4FszTrwwlK5gTSbXO,AMG,"Natanael Cano, Peso Pluma, Gabito Ballesteros",corrido,136.175,95,0.152,0.772,0.73,-6.657,0.0
1,3gfSH9aYZbyGjdmbeoWliw,Siempre Pendientes,"Peso Pluma, Luis R Conriquez",corrido,135.929,87,0.331,0.774,0.748,-7.086,0.526238
2,5gVCfYmQRPy1QJifP8f5gg,Fall In Love,Bailey Zimmerman,pop,135.962,80,0.451,0.524,0.643,-6.055,0.752494
3,1wLNEMiUzwvRZz9XHCXhAE,Fifth of May,Zach Bryan,country,136.728,72,0.391,0.596,0.544,-7.039,0.757909
4,1ENsFKcyksWFyXNQlWCI1c,Mirate Nomas (En Vivo),Ulices Chaidez,banda,135.836,55,0.582,0.653,0.375,-7.06,0.776155
5,6AhwAWzSlISc5ZvGonkgdN,Playing God,Polyphia,rock,137.06,67,0.0395,0.601,0.789,-6.822,0.92511
6,5VfkfKTTo5BRKqHwfENF0V,Míranos Ahora,Calibre 50,corrido,135.832,70,0.651,0.68,0.439,-5.909,1.009603
7,46FdwGMcgz6gik4RCpWBAW,Los Collares,"Oscar Maydon, El Padrinito Toys",corrido,136.724,86,0.208,0.645,0.712,-5.787,1.038215
8,45bfH0GZvUyujIBiKRhXso,B.O.T.A. (Baddest Of Them All),"Eliza Rose, Interplanetary Criminal",house,136.981,67,0.0671,0.756,0.962,-7.526,1.21082
9,39JofJHEtg8I4fSyo7Imft,B.O.T.A. (Baddest Of Them All) - Edit,"Eliza Rose, Interplanetary Criminal",house,137.001,82,0.164,0.736,0.906,-7.589,1.258299


### What are the closest songs in comparison to the top Banda song?

In [100]:
banda_id = '5kpxVMDvYGRmNqawPqDTYR' # "JGL"
closest_banda = closest_tracks(banda_id, songs_df)
closest_banda

Unnamed: 0,track_id,track_name,artist_name,genre,tempo,popularity,acousticness,danceability,energy,loudness,distance
0,5kpxVMDvYGRmNqawPqDTYR,JGL,"La Adictiva, Luis R Conriquez",banda,112.852,84,0.55,0.702,0.586,-5.317,0.0
1,0RstZCli3DcyY1LJq38P4b,El Chaman,Gabito Ballesteros,corrido,112.998,69,0.26,0.814,0.575,-5.348,0.345025
2,42cVZAhX13JCHwrvlfqLcB,When You Lose Someone,The Dip,soul,112.452,53,0.365,0.751,0.498,-5.766,0.637159
3,1xTYy3K3hiTcoZfoD7culn,Say You Want Me - Single Version,Masego,soul,112.978,63,0.19,0.884,0.658,-5.94,0.756249
4,6DL44bNYvHzr1E58BAx5zx,Horseshit on Route 66,The Garden,punk,112.328,49,0.0283,0.498,0.978,-5.115,0.884777
5,66MvNxKQGPQS0AUeaoYlcn,hair out,Stand Atlantic,punk,112.987,53,0.000488,0.678,0.896,-4.685,0.903487
6,5Tg3TbTftDrjmQORfsqw5m,Enamorado Y Feliz,La Adictiva,banda,113.59,50,0.377,0.653,0.434,-5.895,0.966521
7,5DAkzBJ48N7z6lwY4eZ0PP,Artefact,Phoenix,inde,112.025,53,0.00784,0.705,0.636,-5.734,1.074367
8,11kskoMyNDumaR30CI6rRv,Half Of Me,Thomas Rhett,country,111.995,69,0.00683,0.575,0.842,-5.058,1.085462
9,69w5X6uTrOaWM32IetSzvO,Daydreaming,Harry Styles,pop,113.971,77,0.311,0.707,0.811,-5.298,1.166316
