In [1]:
# imports
import os
import time
import numpy as np
import pandas as pd
import json
import spotipy
import yt_dlp

from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
# data loading
data_json = pd.read_json('song-ids.json')
data_json = pd.DataFrame(data_json)
data_json = data_json.rename(columns={0: 'song_id'})
data_json

Unnamed: 0,song_id
0,0pn7SynQPZBgNMlqPitrR6
1,1JsLA3JWc4XE2StbJcRYG6
2,23ssBTLuRpiE8hlCTEZCIk
3,6BPJCWBiFHF7ItUQlyyP6x
4,6GTVtjlDMEtnM2paZZ6VNe
...,...
24994,6Z1fY3aEN6dWMbLFSNCDYV
24995,2ljc7AncHJ7lZ7uTxALWLZ
24996,0B6g379TMw0nYGXA7y5xHz
24997,0VhkA3NAwIFfVW4NFCbAOD


In [3]:
# create a copy of the original dataframe (for appending song names)
data_json_copy = data_json
data_json_copy

Unnamed: 0,song_id
0,0pn7SynQPZBgNMlqPitrR6
1,1JsLA3JWc4XE2StbJcRYG6
2,23ssBTLuRpiE8hlCTEZCIk
3,6BPJCWBiFHF7ItUQlyyP6x
4,6GTVtjlDMEtnM2paZZ6VNe
...,...
24994,6Z1fY3aEN6dWMbLFSNCDYV
24995,2ljc7AncHJ7lZ7uTxALWLZ
24996,0B6g379TMw0nYGXA7y5xHz
24997,0VhkA3NAwIFfVW4NFCbAOD


In [4]:
# api creds
client_id = ''
client_secret = ''

In [5]:
# initializing api client
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [6]:
# song ids
song_ids = data_json['song_id'].tolist()
song_ids = song_ids[5000:10000]
print(song_ids)

['41mzryVlFLlz6b1BCtJAjx', '3uYVRnAtTl46yWcNDh4YSV', '6qQGxKyy6LcyZVsWn93lyS', '4fc8S2UwmZrcXmZ98Aqk5l', '2IZQG2wYoVncO3hWCD4IyM', '4gezEvRfdBCNd9ACoTj8Gv', '0tYtYAjDK6KNk6aoVhzOuK', '6hfuWCUmHTVIcjdnowvvUa', '3MlEryrxCKZkcfX18ZLX96', '4k0VOhFzawpzyopWO1B9bF', '1CBlW74FNbeN3iwFuEtrcC', '0wbnC9AUenxp613TYaJsGK', '1YMRjMUPjaeY7BiEfFrfS1', '0AUyNF6iFxMNQsNx2nhtrw', '4jNewYH1oB7WbyQL0tfxWr', '4YGoYeshftVaDxk4Cl9pYa', '59mMrwHD7rnoSI4YtXh7J1', '3DplsrN6Ik8s5Ueoygtnsu', '0iAtcqF4t6HVT9KGGmXyM7', '2ziRg1123X0nrbUGvWQGwr', '4i76vmvahz5a4MhinU59sI', '28mv40MzspRZn0PBcO2itT', '1Jy43R3ZGQVO6PgWjMV4oE', '5agF3UjBMa5odD5Vn0vWMI', '4CVhywni3QwalKLZgwFi58', '4I58RPHVzDldHfK3t5HiZA', '2xebCnKCjYWBhJ4Ai8hO5E', '3yKWUJSbDaJrgJWSu0ebAs', '7vxR5LgbW7a8meQKe4LJKf', '5p5tCyTnSWUuZgsKkKRcka', '6WbDwAoc5xr0kFflHZW0A4', '3c0CJ0DBgsGZ7I79lyx02Z', '4FZmjzFP31JGVPnpWanMEo', '0z8yrlXSjnI29Rv30RssNI', '1S9EzFjZgwyc5MGWptK4Fa', '03zjj8Ni3NQqInP6FQ25v2', '5xC8uOesnn0udeXAYlAnoY', '3uk4mNkMfOI5lIQC56KcFJ', '2oiixB9QMI

In [7]:
# create download directory
download_dir = f"song_ids_downloads"
os.makedirs(download_dir, exist_ok=True)

In [8]:
# yt track download options
ydl_opts = {
    'outtmpl': f"{download_dir}/%(title)s.%(ext)s",
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }],
}

In [None]:
# getting track details for entire list and downloading from youtube
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    for song_id in song_ids:
        track_info = sp.track(song_id)
        title = track_info['name']
        artist = track_info['artists'][0]['name']
        
        # start indexing from 1
        print(f"Downloading {song_ids.index(song_id)} of {len(song_ids)}")
        print(f"Downloading: {title} - {artist}")
        query = f"{title} {artist}"

        try:
            ydl.download([f"ytsearch:{query}"])
            info_query = ydl.extract_info(f"ytsearch:{query}", download=False)

            # get url of the first search result
            url = info_query['entries'][0]['webpage_url']

            # get title of the downloaded video and store in video_title variable
            info_url = ydl.extract_info(url, download=False)
            video_title = info_url.get('title')

            print(f"Downloaded: {title} - {artist}")

            # save title to a new column in the dataframe
            data_json_copy.at[song_ids.index(song_id)+5000, 'song_name'] = f"{video_title}.mp3"

        except yt_dlp.utils.DownloadError as e:
            print(f"Error downloading {title} - {artist}: {e}")
            data_json_copy.at[song_ids.index(song_id)+5000, 'song_name'] = f"NaN"

        # 10s timeout to avoid getting rate limited
        time.sleep(10)

In [None]:
# updated dataframe
data_json_copy

In [None]:
# save the new dataframe as a csv
data_json_copy.to_csv('data_json_copy2.csv', index=False)