In [1]:
#!conda install -c conda-forge spotipy -y

In [2]:
import sys
from config import *

In [3]:
import spotipy
import json
from spotipy.oauth2 import SpotifyClientCredentials

# Build the Spotify client with the client-credentials flow: it authenticates
# with the app's Client_ID / Client_Secret (brought in by `from config import *`)
credentials_manager = SpotifyClientCredentials(client_id=Client_ID, client_secret=Client_Secret)
sp = spotipy.Spotify(auth_manager=credentials_manager)

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Allow up to 10 seconds per Spotify request. spotipy passes its own
# `requests_timeout` value on every HTTP call and `requests.Session` has no
# session-wide timeout setting, so assigning `sp._session.timeout` had no effect.
sp.requests_timeout = 10

In [6]:
# We define the function to get the song id, based on the title and artist:
import pandas as pd
def search_song(title:str, artist:str=None, lim: int = 5):
    """
    Search Spotify for a track and return the best matches as a DataFrame.

    Uses the module-level `sp` client for the request.

    Parameters:
    - title: Track title to look for.
    - artist: Optional artist name used to narrow the search. Missing values
      (None, NaN / pd.NA as pandas produces for empty cells, or a blank string)
      fall back to a title-only search instead of searching for "nan".
    - lim: Maximum number of matches to return. The value is also sent to the
      API as the page size, capped at 50 (the largest page the search endpoint
      serves per request).

    Returns:
    - DataFrame with the columns 'title', 'artist' (all credited artists joined
      with ', ') and 'id'. An empty DataFrame is returned when nothing matches,
      when lim is smaller than 1, or when the request fails (the error is
      printed rather than raised so that bulk lookups keep going).
    """
    # Nothing can be returned for a non-positive limit, so skip the request
    if lim < 1:
        return pd.DataFrame()

    has_artist = artist is not None and not pd.isna(artist) and str(artist).strip() != ""
    if has_artist:
        query = f"track:{title} artist:{artist}"
    else:
        query = f"track:{title}"

    try:
        # Ask the API for exactly as many tracks as needed; previously the
        # default page size was requested and then sliced, which silently
        # capped `lim` at that default
        results = sp.search(q=query, type='track', limit=min(lim, 50))
        tracks = results['tracks']['items']
        if not tracks:
            print("Song", title, "from", artist, "not found!")
            return pd.DataFrame()

        # Extract relevant information from each track
        records = [
            {
                'title': track['name'],
                'artist': ', '.join(credited['name'] for credited in track['artists']),
                'id': track['id'],
            }
            for track in tracks[:lim]
        ]
        return pd.DataFrame(records)

    except Exception as e:
        # Deliberately best-effort: report the problem and return no matches
        print(f"Error: {e}")
        return pd.DataFrame()

In [None]:
# Smoke test of search_song: look up one known track and keep only the best match (lim=1)
search_song("imagine",'john lennon', 1)

In [None]:
# Load the DataFrame with the hot 100 songs; the lookup loop below reads its
# 'title' and 'artist' columns
hot_songs_09_jan = pd.read_csv('hot_songs_09_jan.csv')

In [None]:
# Look up the Spotify id of every hot song (best match only) and store it in
# the 'id' column; "" marks the songs for which no match was found
for index, row in hot_songs_09_jan.iterrows():
    result_df = search_song(row['title'], row['artist'], 1)
    hot_songs_09_jan.at[index, 'id'] = "" if result_df.empty else result_df['id'].values[0]

In [None]:
# Inspect the hot songs frame, including the 'id' column filled in by the lookup loop
display(hot_songs_09_jan)

In [None]:
# Keep only the songs whose Spotify id was found ("" marks a failed lookup),
# then preview the result and report how many rows are left
hot_songs_09_jan = hot_songs_09_jan.loc[hot_songs_09_jan["id"] != ""]
display(hot_songs_09_jan.head())
hot_songs_09_jan.shape

In [None]:
# Load the not_hot songs DataFrame; the lookup loop below reads its 'title'
# and 'artist' columns
not_hot= pd.read_csv('final_df_not_hot.csv')

In [None]:
# Show the shape explicitly, then the first rows as the cell's output
display(not_hot.shape)
not_hot.head()

In [None]:
# Apply search_song to every not_hot row (best match only) and store the
# Spotify id in an extra 'id' column; "" marks the songs without a match.
# search_song itself prints the songs for which no id was found.
for index, row in not_hot.iterrows():
    result_df = search_song(row['title'], row['artist'], 1)
    not_hot.at[index, 'id'] = "" if result_df.empty else result_df['id'].values[0]

In [None]:
# Drop the records whose id lookup failed (empty id), then preview the result
# and report how many rows are left
not_hot = not_hot.loc[not_hot['id'] != ""]
display(not_hot.head(10))
not_hot.shape

In [None]:
# We define the function that splits a list of song ids into chunks and retrieves the audio features for each chunk
import pandas as pd
import time

def get_audio_features_for_chunks(sp, list_of_song_ids, chunk_size=50, sleep_time=20):
    """
    Fetch Spotify audio features for a list of song ids, one chunk at a time.

    Parameters:
    - sp: Client object exposing `audio_features(list_of_ids)`.
    - list_of_song_ids: Song ids to look up.
    - chunk_size: Number of ids sent per request (must be at least 1).
    - sleep_time: Seconds to wait before each request, to stay below the API
      rate limit.

    Returns:
    - DataFrame with one row per song for which features were returned and one
      column per audio feature. Empty when no ids are given or nothing is
      returned.

    Raises:
    - ValueError: if chunk_size is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Split the ids into chunks of size chunk_size
    song_ids = list(list_of_song_ids)
    song_id_chunks = [song_ids[i:i + chunk_size] for i in range(0, len(song_ids), chunk_size)]

    # Collect plain records and build the DataFrame once at the end, instead
    # of re-copying a growing DataFrame with concat on every chunk
    feature_rows = []

    for chunk in song_id_chunks:
        print("Collecting audio features for chunk...")
        time.sleep(sleep_time)
        chunk_features = sp.audio_features(chunk)

        # The client may return None for the whole call
        if not chunk_features or not isinstance(chunk_features, list):
            continue

        # Ids the API cannot resolve come back as None entries; skip them so
        # one unknown id no longer breaks the whole chunk
        feature_rows.extend(item for item in chunk_features if item is not None)

    return pd.DataFrame(feature_rows)

In [None]:
# Collect every id of the hot songs frame into a plain Python list and show it
hot_song_id_list = hot_songs_09_jan["id"].values.tolist()
display(hot_song_id_list)

In [None]:
# Get the audio features for the hot songs: ids are sent 50 at a time and the
# function sleeps 20 seconds before each request
hot_songs_feat = get_audio_features_for_chunks(sp, hot_song_id_list, chunk_size=50, sleep_time=20)

In [None]:
# Show the shape explicitly, then the first rows as the cell's output
display(hot_songs_feat.shape)
hot_songs_feat.head()

In [None]:
# Collect every id of the not_hot frame into a plain Python list and show it
not_hot_song_id_list = not_hot["id"].tolist()
not_hot_song_id_list

In [None]:
# Get the audio features for the not_hot songs from the id list built in the
# previous step: ids are sent 50 at a time, with a 20 second sleep before each request
not_hot_songs_feat = get_audio_features_for_chunks(sp, not_hot_song_id_list, chunk_size=50, sleep_time=20)

In [None]:
# Show the shape first and leave head() as the final expression: a notebook
# only renders the value of a cell's last expression, so with head() on the
# first line the preview was computed and then discarded
display(not_hot_songs_feat.shape)
not_hot_songs_feat.head()

In [None]:
def merge_and_remove_duplicates(df, audio_features_df, merge_column='id'):
    """
    Join a DataFrame with its audio features and drop repeated rows.

    Parameters:
    - df: Original DataFrame
    - audio_features_df: DataFrame containing audio features
    - merge_column: Name of the key column present in both frames (default is 'id')

    Returns:
    - DataFrame holding the rows whose key appears in both inputs, with fully
      identical rows collapsed into one
    """
    # Inner join: rows without a partner in the other frame are left out
    joined = pd.merge(
        left=df,
        right=audio_features_df,
        how='inner',
        on=merge_column,
    )

    # A row only counts as a duplicate when every column matches; the first
    # occurrence is kept
    return joined.drop_duplicates()

In [None]:
# Attach the audio features to the hot songs by joining on 'id' and dropping duplicate rows
hot_songs_audio = merge_and_remove_duplicates(hot_songs_09_jan, hot_songs_feat, merge_column='id')

In [None]:
# Show the shape explicitly, then the first rows as the cell's output
display(hot_songs_audio.shape)
hot_songs_audio.head()

In [None]:
# Attach the audio features to the not_hot songs by joining on 'id' and dropping duplicate rows
not_hot_songs_audio = merge_and_remove_duplicates(not_hot, not_hot_songs_feat, merge_column='id')

In [None]:
# Show the shape explicitly, then the first rows as the cell's output
display(not_hot_songs_audio.shape)
not_hot_songs_audio.head()

In [None]:
# Save the hot songs with their audio features as a ';'-separated CSV, without the index column
hot_songs_audio.to_csv('hot_songs_audio_features.csv',sep=';', index=False)

In [None]:
# Save the not_hot songs with their audio features as a ';'-separated CSV, without the index column
not_hot_songs_audio.to_csv('not_hot_songs_audio_features.csv',sep=';', index=False)