In [1]:
# imports
import os
import time
import numpy as np
import pandas as pd
import json
import spotipy
import yt_dlp

from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
# data loading
data_json = pd.read_json('song-ids.json')
data_json = pd.DataFrame(data_json)
data_json = data_json.rename(columns={0: 'song_id'})
data_json

Unnamed: 0,song_id
0,0pn7SynQPZBgNMlqPitrR6
1,1JsLA3JWc4XE2StbJcRYG6
2,23ssBTLuRpiE8hlCTEZCIk
3,6BPJCWBiFHF7ItUQlyyP6x
4,6GTVtjlDMEtnM2paZZ6VNe
...,...
24994,6Z1fY3aEN6dWMbLFSNCDYV
24995,2ljc7AncHJ7lZ7uTxALWLZ
24996,0B6g379TMw0nYGXA7y5xHz
24997,0VhkA3NAwIFfVW4NFCbAOD


In [3]:
# create a copy of the original dataframe (for appending song names)
data_json_copy = data_json
data_json_copy

Unnamed: 0,song_id
0,0pn7SynQPZBgNMlqPitrR6
1,1JsLA3JWc4XE2StbJcRYG6
2,23ssBTLuRpiE8hlCTEZCIk
3,6BPJCWBiFHF7ItUQlyyP6x
4,6GTVtjlDMEtnM2paZZ6VNe
...,...
24994,6Z1fY3aEN6dWMbLFSNCDYV
24995,2ljc7AncHJ7lZ7uTxALWLZ
24996,0B6g379TMw0nYGXA7y5xHz
24997,0VhkA3NAwIFfVW4NFCbAOD


In [4]:
# api creds
client_id = ''
client_secret = ''

In [5]:
# initializing api client
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [6]:
# song ids
song_ids = data_json['song_id'].tolist()
song_ids = song_ids[10000:15000]
print(song_ids)

['11XLFFUNBk3vMKgblUAy1z', '7yip9pscRYhAwbZ4wfwQJx', '7556LMF1GR5QawF5PHLjI7', '68R32667rUVTo8ldDkOXyR', '4AxG1T6UJJ4xCX4Wz6JDLG', '5BKDqwrywwngFZyLpRdPhq', '0lZZ8pibk6Zu3M9hTthk1a', '6pdDXBRgRpcyGXcXJthNYC', '1PWVebG0tcyLyQYikv8JQS', '174gRALmmLbxeY4eUo2zCc', '0WJPw0kxOuBLjSAfXLLrSZ', '4GxdbaMtzN3jKJdjHpM3ET', '0i1RAruE7q6NJdoV3XKJk8', '6sT9MWlJManry3EQwf4V80', '4jkNGjbn7LZpgpRSOiJqCp', '42iXkSBfgzE7YyvKGZ7NzV', '2fSW54bA6x2iWePU7rpCiP', '2w1xQoej5TcciLKzx3DNzS', '7aCnhg8VU5XgUrgKCf9bET', '4DEY52zyUAQq4D5xeYUn4M', '3lCIV1noFOuHCtOCAeiBtN', '3S5AYyv0RYxNbUotueY32I', '4KjBDZps02XJYgQs9i6aUn', '73RbfOTJIjHzi2pcVHjeHM', '39BOpF8KvdMqUZlJyfnV34', '4OOLOIpYdw9oBSkdiHolgf', '0Pvrgc89C88HqJ8QRuukKh', '6mP31FqR4ZJzyrb4wVYuM5', '4LKfJMumpW7HyBT05uXYUx', '4LopAeo0EFicmQyh9lClkj', '6cstGFtvr32MNiv15foFlF', '1QQgmN383kUqjioRoTSfF3', '4J3MsRx4qGqmgtVCAA7f9R', '2YIDnkBfwbLMkt9rcugLdr', '0cwrUbIAPi1GNLrdIOT5Sd', '5hc71nKsUgtwQ3z52KEKQk', '0dTjGvzVRsvUjM9jJVWEDK', '2JKarN2L9zu2RaMx7gkkZ4', '3h1ipSBbuf

In [7]:
# create download directory
download_dir = f"song_ids_downloads"
os.makedirs(download_dir, exist_ok=True)

In [8]:
# yt track download options
ydl_opts = {
    'outtmpl': f"{download_dir}/%(title)s.%(ext)s",
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }],
}

In [None]:
# getting track details for entire list and downloading from youtube
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    for song_id in song_ids:
        track_info = sp.track(song_id)
        title = track_info['name']
        artist = track_info['artists'][0]['name']
        
        # start indexing from 1
        print(f"Downloading {song_ids.index(song_id)} of {len(song_ids)}")
        print(f"Downloading: {title} - {artist}")
        query = f"{title} {artist}"

        try:
            ydl.download([f"ytsearch:{query}"])
            info_query = ydl.extract_info(f"ytsearch:{query}", download=False)

            # get url of the first search result
            url = info_query['entries'][0]['webpage_url']

            # get title of the downloaded video and store in video_title variable
            info_url = ydl.extract_info(url, download=False)
            video_title = info_url.get('title')

            print(f"Downloaded: {title} - {artist}")

            # save title to a new column in the dataframe
            data_json_copy.at[song_ids.index(song_id)+10000, 'song_name'] = f"{video_title}.mp3"

        except yt_dlp.utils.DownloadError as e:
            print(f"Error downloading {title} - {artist}: {e}")
            data_json_copy.at[song_ids.index(song_id)+10000, 'song_name'] = f"NaN"

        # 10s timeout to avoid getting rate limited
        time.sleep(10)

In [None]:
# updated dataframe
data_json_copy

In [None]:
# save the new dataframe as a csv
data_json_copy.to_csv('data_json_copy3.csv', index=False)