In [1]:
# imports
import os
import time
import numpy as np
import pandas as pd
import json
import spotipy
import yt_dlp

from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
# data loading
# Read the JSON file into a DataFrame and label its single column (originally
# named 0) as 'song_id'; the bare name on the last line renders the frame.
data_json = (
    pd.DataFrame(pd.read_json('song-ids.json'))
    .rename(columns={0: 'song_id'})
)
data_json

Unnamed: 0,song_id
0,0pn7SynQPZBgNMlqPitrR6
1,1JsLA3JWc4XE2StbJcRYG6
2,23ssBTLuRpiE8hlCTEZCIk
3,6BPJCWBiFHF7ItUQlyyP6x
4,6GTVtjlDMEtnM2paZZ6VNe
...,...
24994,6Z1fY3aEN6dWMbLFSNCDYV
24995,2ljc7AncHJ7lZ7uTxALWLZ
24996,0B6g379TMw0nYGXA7y5xHz
24997,0VhkA3NAwIFfVW4NFCbAOD


In [3]:
# create a copy of the original dataframe (for appending song names)
# .copy() makes an independent frame; plain assignment would only bind a second
# name to the same object, so adding the 'song_name' column later would also
# modify data_json.
data_json_copy = data_json.copy()
data_json_copy

Unnamed: 0,song_id
0,0pn7SynQPZBgNMlqPitrR6
1,1JsLA3JWc4XE2StbJcRYG6
2,23ssBTLuRpiE8hlCTEZCIk
3,6BPJCWBiFHF7ItUQlyyP6x
4,6GTVtjlDMEtnM2paZZ6VNe
...,...
24994,6Z1fY3aEN6dWMbLFSNCDYV
24995,2ljc7AncHJ7lZ7uTxALWLZ
24996,0B6g379TMw0nYGXA7y5xHz
24997,0VhkA3NAwIFfVW4NFCbAOD


In [4]:
# api creds
# Read the Spotify credentials from the environment so secrets never get
# committed with the notebook. SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are
# the variable names used in spotipy's documentation. The empty-string default
# keeps the previous behaviour when the variables are not set.
client_id = os.environ.get('SPOTIPY_CLIENT_ID', '')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', '')

In [5]:
# initializing api client
# Build the client-credentials auth manager from the id/secret defined above,
# then hand it to the Spotify client used for all track lookups.
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
)

In [6]:
# song ids
# Take the 'song_id' column as a plain Python list and keep the first 5000.
song_ids = data_json['song_id'].tolist()[:5000]
print(song_ids)

['0pn7SynQPZBgNMlqPitrR6', '1JsLA3JWc4XE2StbJcRYG6', '23ssBTLuRpiE8hlCTEZCIk', '6BPJCWBiFHF7ItUQlyyP6x', '6GTVtjlDMEtnM2paZZ6VNe', '0qlCTm0U6Jcgx29HEYIXrg', '4a0sqvCBo7aRIS4GzBQpnp', '35QiTFf7gjbYmZ0Sj7dd3n', '2HEt32GOG4LhdkAcwkVDPD', '4kiwnJJ6ThoXzEean8UI1f', '4PCOSYkwl0aWdal9iZP1Oy', '1R7TwnwF1RuYNf5vXlpQUT', '2V65y3PX4DkRhy1djlxd9p', '6gTbAJOvivFXwkOEhSIOtS', '7gohMujUKNwS2ocu7I7yVW', '7n8lRa37iDNelsWw8JmoX4', '7vrJn5hDSXRmdXoR30KgF1', '3U70BInxXP4XBO7Bhyty7X', '6eCNz55YybdERmd840hKZa', '0IKc0xOUToF1PanrCgDUs1', '21hfiYaPDN8E14Trp4qTF3', '7jJ3EWr0umPsjC6CEupXRK', '4dHdqpGaO9km4TEw9nNQSR', '4DkjZPCTvs3W2VVyDi4MAt', '7pbg3ABlAZv2NiIdKbBBFm', '2wmrHfBFcCCYVdrlrUrJw3', '0OUQhRKgGSb0PPrLUuEQka', '49dFv4gH1SeY90FySDOwCE', '1vN8k0BhCIUrjsWCeAMpOW', '04aAxqtGp5pv12UXAg4pkq', '6Ec5LeRzkisa5KJtwLfOoW', '0FjBNvLsCCNu7vmCh8mZns', '7pG2i1QLokufVdQTbHsM0c', '0USw1cIvmZPOoU6pBGKWLR', '7A9EcBrwTG6bwswUy7beWW', '3CnK206lwL8j2ETlzBC9go', '2Tj9aHtf7g8ydYqsB6S9M5', '55Am8neGJkdj2ADaM3aw5H', '4aVQevcrgV

In [7]:
# create download directory
# exist_ok=True makes this cell safe to re-run when the folder already exists.
download_dir = "song_ids_downloads"
os.makedirs(name=download_dir, exist_ok=True)

In [8]:
# yt track download options
# Files are written into download_dir and named after the video title; the
# FFmpegExtractAudio post-processor is configured for mp3 at quality '192'.
ydl_opts = dict(
    outtmpl=f"{download_dir}/%(title)s.%(ext)s",
    format='bestaudio/best',
    postprocessors=[
        dict(
            key='FFmpegExtractAudio',
            preferredcodec='mp3',
            preferredquality='192',
        ),
    ],
)

In [None]:
# getting track details for entire list and downloading from youtube
#
# For every Spotify track id: look up title/artist on Spotify, search YouTube
# for "<title> <artist>", download the top hit as audio, and record the video
# title (with an .mp3 suffix) in data_json_copy['song_name']. Rows that could
# not be resolved or downloaded get the string "NaN".
total_songs = len(song_ids)

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    # enumerate() gives every list position its own row. list.index() would
    # rescan the list on each call and always return the FIRST match, so a
    # duplicated id would keep overwriting one row and leave the others empty.
    # NOTE(review): assumes data_json_copy has the default 0..n-1 index so the
    # list position equals the row label -- confirm if the frame is re-indexed.
    for row, song_id in enumerate(song_ids):
        try:
            track_info = sp.track(song_id)
        except spotipy.SpotifyException as e:
            # One bad/unavailable track id should not abort a multi-hour run.
            print(f"Error fetching Spotify track {song_id}: {e}")
            data_json_copy.at[row, 'song_name'] = "NaN"
            continue

        title = track_info['name']
        artist = track_info['artists'][0]['name']

        # start indexing from 1
        print(f"Downloading {row + 1} of {total_songs}")
        print(f"Downloading: {title} - {artist}")
        query = f"{title} {artist}"

        try:
            # A single extract_info(download=True) call runs the search,
            # downloads the top hit and returns its metadata. This avoids
            # separate search/metadata requests and guarantees the recorded
            # title belongs to the video that was actually downloaded.
            search_info = ydl.extract_info(f"ytsearch:{query}", download=True)
            entries = (search_info or {}).get('entries') or []
            first_hit = entries[0] if entries else None

            if first_hit is None:
                # Empty search result: nothing was downloaded for this track.
                print(f"No YouTube result for {title} - {artist}")
                data_json_copy.at[row, 'song_name'] = "NaN"
            else:
                video_title = first_hit.get('title')
                print(f"Downloaded: {title} - {artist}")

                # save title to a new column in the dataframe
                # NOTE(review): yt-dlp may sanitize characters when building
                # the on-disk filename, so this may not match the file name
                # exactly for unusual titles -- verify before joining on it.
                data_json_copy.at[row, 'song_name'] = f"{video_title}.mp3"

        except yt_dlp.utils.DownloadError as e:
            print(f"Error downloading {title} - {artist}: {e}")
            data_json_copy.at[row, 'song_name'] = "NaN"

        # 10s timeout to avoid getting rate limited
        time.sleep(10)

In [None]:
# updated dataframe: the song ids plus the 'song_name' column written by the
# download loop (rows the loop did not reach have no value in that column)
data_json_copy

In [None]:
# save the new dataframe as a csv
# index=False leaves the row index out of the file, so only the data columns
# are written.
data_json_copy.to_csv(path_or_buf='data_json_copy1.csv', index=False)