In [1]:
import os
import json
from tqdm.notebook import tqdm
from ast import literal_eval
import pandas as pd
from glob import glob

PATH_YOUTUBE_DATA = "../dataset/Dataset40k/youtube_data"
PATH_SPOTIFY_YOUTUBE_CSV = "../dataset/Dataset40k/spotify_with_youtube.csv"
PATH_SAVE_FINAL_CSV = "../dataset/Dataset40k/spotify_with_youtube_clean.csv"

assert os.path.exists(PATH_YOUTUBE_DATA) and os.path.isdir(PATH_YOUTUBE_DATA), \
    f"Received bad path `{PATH_YOUTUBE_DATA}`"
assert os.path.exists(PATH_SPOTIFY_YOUTUBE_CSV), f"Received bad path `{PATH_SPOTIFY_YOUTUBE_CSV}`"


# Load Spotify and YouTube csv

In [2]:
# Setup
df = pd.read_csv(PATH_SPOTIFY_YOUTUBE_CSV)
assert not df["track_id"].duplicated().any()
assert not df.isna().any().any()
to_list_columns = ["artists", "track_genre"]
for column in to_list_columns:
    df[column] = df[column].apply(literal_eval)

spotify_mapper = {
    'track_id': 'spotify_track_id',
    'track_name': 'spotify_track_name',
    'artists': 'spotify_artists',
    'album_name': 'spotify_album_name',
    'track_genre': 'spotify_genre',
    'popularity': 'spotify_popularity',
    'energy': 'spotify_energy',
    'valence': 'spotify_valence',
    'danceability': 'spotify_danceability',
    'tempo': 'spotify_tempo',
    'duration_ms': 'spotify_duration_ms',
    'key': 'spotify_key',
    'mode': 'spotify_mode',
    'loudness': 'spotify_loudness',
    'time_signature': 'spotify_time_signature',
    'acousticness': 'spotify_acousticness',
    'instrumentalness': 'spotify_instrumentalness',
    'speechiness': 'spotify_speechiness',
    'liveness': 'spotify_liveness',
    'explicit': 'spotify_explicit_content',
    'original_index': 'spotify_original_index',
}

df = df.rename(columns=spotify_mapper)
df.head()

Unnamed: 0,spotify_track_id,spotify_artists,spotify_album_name,spotify_track_name,spotify_popularity,spotify_duration_ms,spotify_explicit_content,spotify_danceability,spotify_energy,spotify_key,...,spotify_liveness,spotify_valence,spotify_tempo,spotify_time_signature,spotify_genre,youtube_search_query,spotify_original_index,youtube_url,youtube_title,youtube_video_skipped
0,3nqQXoyQOWXiESFLlDF1hG,"[Sam Smith, Kim Petras]",Unholy (feat. Kim Petras),Unholy (feat. Kim Petras),100,156943,False,0.714,0.472,2,...,0.266,0.238,131.121,4,[dance],"Unholy (feat. Kim Petras) - Sam Smith, Kim Pet...",20001,https://www.youtube.com/watch?v=Uq9gPaIzbe8&pp...,"Sam Smith, Kim Petras - Unholy (Official Music...",False
1,2tTmW7RDtMQtBk7m2rYeSw,"[Bizarrap, Quevedo]","Quevedo: Bzrp Music Sessions, Vol. 52","Quevedo: Bzrp Music Sessions, Vol. 52",99,198937,False,0.621,0.782,2,...,0.23,0.55,128.033,4,[hip-hop],"Quevedo: Bzrp Music Sessions, Vol. 52 - Bizarr...",51664,https://www.youtube.com/watch?v=A_g3lMcWVy0&pp...,QUEVEDO || BZRP Music Sessions #52,False
2,4h9wh7iOZ0GGn8QVp4RAOB,[OneRepublic],I Ain’t Worried (Music From The Motion Picture...,I Ain't Worried,96,148485,False,0.704,0.797,0,...,0.0546,0.825,139.994,4,[piano],"I Ain't Worried - OneRepublic, Official, music...",79000,https://www.youtube.com/watch?v=mNEUkkoUoIA&pp...,OneRepublic - I Ain’t Worried (From “Top Gun: ...,False
3,4LRPiXqCikLlN15c3yImP7,[Harry Styles],As It Was,As It Was,95,167303,False,0.52,0.731,6,...,0.311,0.662,173.93,4,[pop],"As It Was - Harry Styles, Official, music video",81052,https://www.youtube.com/watch?v=H5v3kku4y6Q&pp...,Harry Styles - As It Was (Official Video),False
4,6xGruZOHLs39ZbVccQTuPZ,[Joji],Glimpse of Us,Glimpse of Us,94,233456,False,0.44,0.317,8,...,0.141,0.268,169.914,3,[pop],"Glimpse of Us - Joji, Official, music video",81102,https://www.youtube.com/watch?v=NgsWGfUlwJI&pp...,Joji - Glimpse of Us (Official Video),False


# Remove rows without YouTube metadata

In [3]:
audio_paths = glob(f"{PATH_YOUTUBE_DATA}/*.mp3")
audio_paths_base_names = [os.path.basename(p)[:-4] for p in audio_paths]
has_youtube_data = df["spotify_track_id"].isin(audio_paths_base_names)
print(len(df))
df = df[has_youtube_data]
print(len(df))

49115
37453


# Append YouTube metadata
NOTE: Will take a couple of minutes

In [None]:
paths = glob(f"{PATH_YOUTUBE_DATA}/*")
for i, row in tqdm(df.iterrows(), total=len(df)):
    track_id = row["spotify_track_id"]
    track_paths = [p.replace("\\", "/").replace("//", "/") for p in paths if (track_id in p)]
    if not all(os.path.exists(p) for p in track_paths) or (len(track_paths) != 3):
        print(i, end=",")
        df = df.drop(index=i)
        continue

    # Json
    json_path = track_paths[0]
    assert json_path.endswith(".json")
    assert os.path.exists(json_path), f"Received bad path `{json_path}`"
    with open(json_path, 'r') as f:
        json_data = json.load(f)
    df.loc[i, "youtube_title_download"] = str(json_data["title"])
    df.loc[i, "youtube_duration_ms"] = int(json_data["length_ms"])
    df.loc[i, "youtube_views"] = int(json_data["views"])
    df.loc[i, "youtube_english_captions"] = str(json_data["captions_english"])

    # Audio
    audio_path = track_paths[1] 
    assert audio_path.endswith(".mp3")
    assert os.path.exists(audio_path), f"Received bad path `{audio_path}`"
    df.loc[i, "path_audio_full"] = audio_path

    # Thumbnail
    thumbnail_path = track_paths[2]
    assert thumbnail_path.endswith(".jpg")
    assert os.path.exists(thumbnail_path), f"Received bad path `{thumbnail_path}`"

# Wrapping up and save new csv

In [5]:
reorder = [
    'spotify_track_id', 'spotify_track_name', 'spotify_artists',
    'spotify_album_name', 'spotify_genre', 'spotify_popularity',
    'spotify_energy', 'spotify_valence', 'spotify_danceability',
    'spotify_tempo', 'spotify_duration_ms', 'spotify_key', 'spotify_mode',
    'spotify_loudness', 'spotify_time_signature', 'spotify_acousticness',
    'spotify_instrumentalness', 'spotify_speechiness', 'spotify_liveness',
    'spotify_explicit_content', 'spotify_original_index', 'youtube_url',
    'youtube_title', 'youtube_views', 'youtube_duration_ms',
    'youtube_video_skipped', 'youtube_search_query',
    'youtube_title_download', 'youtube_english_captions', "path_audio_full"
]
assert set(reorder) - set(df.columns.tolist()) == set(df.columns.tolist()) - set(reorder) == set()
df = df[reorder]
df = df.reset_index(drop=True)
df.to_csv(PATH_SAVE_FINAL_CSV, index=False)
df.head()

Unnamed: 0,spotify_track_id,spotify_track_name,spotify_artists,spotify_album_name,spotify_genre,spotify_popularity,spotify_energy,spotify_valence,spotify_danceability,spotify_tempo,...,spotify_original_index,youtube_url,youtube_title,youtube_views,youtube_duration_ms,youtube_video_skipped,youtube_search_query,youtube_title_download,youtube_english_captions,path_audio_full
0,3nqQXoyQOWXiESFLlDF1hG,Unholy (feat. Kim Petras),"[Sam Smith, Kim Petras]",Unholy (feat. Kim Petras),[dance],100,0.472,0.238,0.714,131.121,...,20001,https://www.youtube.com/watch?v=Uq9gPaIzbe8&pp...,"Sam Smith, Kim Petras - Unholy (Official Music...",258860226.0,275000.0,False,"Unholy (feat. Kim Petras) - Sam Smith, Kim Pet...",Unholy (Official Music Video),"[{'tStartMs': 1950, 'dDurationMs': 1943, 'segs...",../dataset/Dataset40k/youtube_data/3nqQXoyQOWX...
1,2tTmW7RDtMQtBk7m2rYeSw,"Quevedo: Bzrp Music Sessions, Vol. 52","[Bizarrap, Quevedo]","Quevedo: Bzrp Music Sessions, Vol. 52",[hip-hop],99,0.782,0.55,0.621,128.033,...,51664,https://www.youtube.com/watch?v=A_g3lMcWVy0&pp...,QUEVEDO || BZRP Music Sessions #52,665663525.0,204000.0,False,"Quevedo: Bzrp Music Sessions, Vol. 52 - Bizarr...","Quevedo: Bzrp Music Sessions, Vol. 52",[],../dataset/Dataset40k/youtube_data/2tTmW7RDtMQ...
2,4h9wh7iOZ0GGn8QVp4RAOB,I Ain't Worried,[OneRepublic],I Ain’t Worried (Music From The Motion Picture...,[piano],96,0.797,0.825,0.704,139.994,...,79000,https://www.youtube.com/watch?v=mNEUkkoUoIA&pp...,OneRepublic - I Ain’t Worried (From “Top Gun: ...,324694087.0,154000.0,False,"I Ain't Worried - OneRepublic, Official, music...",I Ain’t Worried (From “Top Gun: Maverick”),"[{'tStartMs': 750, 'dDurationMs': 5965, 'segs'...",../dataset/Dataset40k/youtube_data/4h9wh7iOZ0G...
3,4LRPiXqCikLlN15c3yImP7,As It Was,[Harry Styles],As It Was,[pop],95,0.731,0.662,0.52,173.93,...,81052,https://www.youtube.com/watch?v=H5v3kku4y6Q&pp...,Harry Styles - As It Was (Official Video),762824859.0,166000.0,False,"As It Was - Harry Styles, Official, music video",As It Was,"[{'tStartMs': 14647, 'dDurationMs': 1402, 'seg...",../dataset/Dataset40k/youtube_data/4LRPiXqCikL...
4,6xGruZOHLs39ZbVccQTuPZ,Glimpse of Us,[Joji],Glimpse of Us,[pop],94,0.317,0.268,0.44,169.914,...,81102,https://www.youtube.com/watch?v=NgsWGfUlwJI&pp...,Joji - Glimpse of Us (Official Video),64139693.0,234000.0,False,"Glimpse of Us - Joji, Official, music video",Glimpse of Us,[],../dataset/Dataset40k/youtube_data/6xGruZOHLs3...
