# Generate a Random 500-Song Dataset Using the Jamendo API


In [27]:
import time
import requests
import random
import re
import pandas as pd

In [None]:
# define constants
# Jamendo endpoint queried by get_artists(); each returned artist carries a "tracks" list.
api_url = "https://api.jamendo.com/v3.0/artists/tracks"
# Sent as "client_id" with every Jamendo request. Empty here (presumably blanked
# before publishing) -- set a valid client id before running.
api_key = ""
# Number of artists requested per API call (page size).
artist_limit = 5
# Minimum number of valid-named artists get_random_artists() tries to collect.
num_artists = 100
# Total artist count used to derive the number of pages to draw from.
# NOTE(review): hard-coded snapshot; presumably obtained by enabling "fullcount" on an API call -- re-check before reuse.
fullcount_artists = 45419 # enable fullcount on api call

# Track fields that are copied into each song record.
needed_track_fields = ["album_id", "album_name", "id", "name", "duration", "album_image", "audio", "releasedate"]
# Track fields that get a different key in the song record (original key -> new key).
track_field_rename_map = {
    "id": "song_id",
    "name": "song_name",
    "audio": "audio_link"
}

# fetch artists
def get_artists(offset, timeout=10):
  """Fetch one page of artists (with their tracks) from the Jamendo API.

  Args:
    offset: Value sent as the API's "offset" parameter, i.e. how many
      artists to skip before the returned page starts.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    The list stored under the "results" key of the JSON response.

  Raises:
    requests.HTTPError: If the server answers with an HTTP error status.
    requests.Timeout: If the server does not answer within `timeout`.
    RuntimeError: If the response body reports a failed API call.
  """
  params = {
      "client_id": api_key,
      "limit": artist_limit,
      "offset": offset,
      "format": "json"
  }
  # Without a timeout, requests waits indefinitely on a stalled connection.
  res = requests.get(api_url, params=params, timeout=timeout)
  res.raise_for_status()

  payload = res.json()
  # Jamendo describes API-level failures (e.g. a bad client_id) in the
  # "headers" object of the JSON body, which raise_for_status() cannot see.
  # Failing here stops the caller from requesting page after empty page.
  headers = payload.get("headers", {})
  if headers.get("status") == "failed":
    raise RuntimeError(
        "Jamendo API error {}: {}".format(
            headers.get("code"), headers.get("error_message")))

  return payload["results"]


# check for valid name
def has_valid_name(name):
  """Return True when `name` consists only of ASCII letters, spaces and underscores.

  Empty strings and non-string values (for example a JSON null decoded to
  None) are reported as invalid instead of raising a TypeError, so a single
  malformed record cannot abort the whole dataset build.
  """
  if not isinstance(name, str):
    return False
  return re.fullmatch(r"^[a-zA-Z _]+$", name) is not None


# get random artists
def get_random_artists():
  """Collect artists with valid names from randomly chosen result pages.

  Pages are visited in a random order and never twice. Fetching stops as
  soon as at least `num_artists` artists have been collected, or when every
  page has been visited.

  Returns:
    A list of artist dicts as returned by get_artists(). It can hold a few
    more than `num_artists` entries (whole pages are added at once) or fewer
    when the pages run out.
  """
  artist_data = []
  total_pages = fullcount_artists // artist_limit

  # A shuffled list of all page numbers gives random, non-repeating pages and
  # guarantees termination: drawing random pages until "enough" artists are
  # found loops forever once every page has been seen.
  page_order = random.sample(range(total_pages), total_pages)

  for page in page_order:
    if len(artist_data) >= num_artists:
      break
    new_artists = get_artists(page * artist_limit)
    artist_data.extend(
        artist for artist in new_artists if has_valid_name(artist["name"]))
    # Pause between requests to stay gentle on the API.
    time.sleep(0.5)

  return artist_data


# Mood label -> album tags that count as evidence for that label.
# tag_song() visits these entries in random order and returns the label of
# the first entry that shares a tag with the album.
song_tags = {
    "Energetic": ["energetic", "pop", "latinpop", "hiphop"],
    "Peaceful": ["peaceful", "neutral", "sentimental"],
    "Happy": ["happy", "summer", "guitar"],
    "Sad": ["sad", "film", "downtempo"],
    "Groovy": ["groovy", "jazz"],
    "Romantic": ["romantic"],
    "Dark": ["dark", "synthesizer"],
    "World": ["world", "reggae", "latin"],
    "Workout": ["aggressive", "epic", "rock"]
}

def get_album_tags(album_id, timeout=10):
  """Fetch the music-info tags of an album from the Jamendo API.

  Args:
    album_id: Album identifier sent as the API's "id" parameter.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    The "tags" value of the first result's "musicinfo" object, or an empty
    list when the API returns no result for this album.

  Raises:
    requests.HTTPError: If the server answers with an HTTP error status.
    requests.Timeout: If the server does not answer within `timeout`.
  """
  # fetch album info from api
  api_url_tags = "https://api.jamendo.com/v3.0/albums/musicinfo"
  params = {
      "client_id": api_key,
      "id": album_id,
      "limit": 1,
      "format": "json"
  }
  # Without a timeout, requests waits indefinitely on a stalled connection.
  res = requests.get(api_url_tags, params=params, timeout=timeout)
  res.raise_for_status()

  results = res.json()["results"]
  # An unknown album yields an empty result list; indexing [0] would raise
  # IndexError and discard all work done so far. No tags simply means the
  # caller falls back to a random label.
  if not results:
    return []
  return results[0]["musicinfo"]["tags"]

# assign a tag to a song
def tag_song(album_tags):
  """Pick a mood label for a song from the tags of its album.

  The song_tags entries are visited in a random order; the label of the
  first entry with at least one tag present in `album_tags` is returned.
  When no entry matches, a label is chosen at random.
  """
  shuffled_entries = random.sample(list(song_tags.items()), len(song_tags.items()))
  for label, candidate_tags in shuffled_entries:
    if any(candidate in album_tags for candidate in candidate_tags):
      return label
  return random.choice(list(song_tags.keys()))


# create song dataset
def create_song_dataset(limit=500):
  """Build a DataFrame of randomly selected, mood-tagged songs.

  Tracks of randomly chosen artists are kept when both the track name and
  the album name pass has_valid_name(). At most `limit` of them are sampled,
  then each song receives a "tag" derived from the tags of its album.

  Args:
    limit: Maximum number of songs in the returned dataset.

  Returns:
    A pandas DataFrame with one row per song: the fields listed in
    needed_track_fields (renamed via track_field_rename_map), plus
    "artist_id", "artist_name" and "tag". When no track qualifies, an empty
    DataFrame with only a "tag" column is returned.
  """
  song_dataset = []

  for artist in get_random_artists():
    for track in artist["tracks"]:
      if not (has_valid_name(track["name"]) and has_valid_name(track["album_name"])):
        continue
      new_song = {
          track_field_rename_map.get(k, k): v
          for k, v in track.items()
          if k in needed_track_fields
      }
      new_song["artist_id"] = artist["id"]
      new_song["artist_name"] = artist["name"]
      song_dataset.append(new_song)

  song_dataset = random.sample(song_dataset, min(limit, len(song_dataset)))
  df_songs = pd.DataFrame(song_dataset)
  df_songs["tag"] = None

  # With no songs the frame has no "album_id" column and groupby() below
  # would raise KeyError, so return the empty frame as is.
  if df_songs.empty:
    return df_songs

  # One API call per album; every song of the album is then tagged
  # independently from the same album tags.
  for album_id, songs in df_songs.groupby("album_id"):
    album_tags = get_album_tags(album_id)
    for idx in songs.index:
      df_songs.loc[idx, "tag"] = tag_song(album_tags)
    # Pause between requests to stay gentle on the API.
    time.sleep(0.5)

  return df_songs



In [99]:
# Build the dataset with the default limit of 500 songs (performs live Jamendo API calls).
df = create_song_dataset()

In [103]:
# Check column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   album_id      500 non-null    object
 1   album_name    500 non-null    object
 2   song_id       500 non-null    object
 3   song_name     500 non-null    object
 4   duration      500 non-null    object
 5   releasedate   500 non-null    object
 6   album_image   500 non-null    object
 7   audio_link    500 non-null    object
 8   artist_id     500 non-null    object
 9   artist_name   500 non-null    object
 10  artist_image  500 non-null    object
 11  tag           500 non-null    object
dtypes: object(12)
memory usage: 47.0+ KB


In [104]:
# Count how many songs received each tag.
df["tag"].value_counts()

Unnamed: 0_level_0,count
tag,Unnamed: 1_level_1
Groovy,80
Workout,75
Dark,70
Peaceful,64
Happy,61
Energetic,56
Sad,48
World,26
Romantic,20


In [105]:
# Show five random rows as a spot check.
df.sample(5)

Unnamed: 0,album_id,album_name,song_id,song_name,duration,releasedate,album_image,audio_link,artist_id,artist_name,artist_image,tag
236,195100,Working Siesta,1777346,Cinco Rosas de Zarza,240,2020-07-07,https://usercontent.jamendo.com?type=album&id=...,https://prod-1.storage.jamendo.com/?trackid=17...,355151,kombopatras,https://usercontent.jamendo.com?type=artist&id...,Groovy
20,507617,HYBR,1997333,hormigas,396,2023-01-09,https://usercontent.jamendo.com?type=album&id=...,https://prod-1.storage.jamendo.com/?trackid=19...,355151,kombopatras,https://usercontent.jamendo.com?type=artist&id...,Groovy
121,49065,Rural Style,389087,en la vida,336,2009-07-20,https://usercontent.jamendo.com?type=album&id=...,https://prod-1.storage.jamendo.com/?trackid=38...,352085,Ki Sap,https://usercontent.jamendo.com?type=artist&id...,Groovy
108,189930,WELCOME TO MIDEST,1712608,LAS COSECHADORAS TRABAJAN ESTA NOCHE,254,2019-12-15,https://usercontent.jamendo.com?type=album&id=...,https://prod-1.storage.jamendo.com/?trackid=17...,355151,kombopatras,https://usercontent.jamendo.com?type=artist&id...,Groovy
450,396381,Breakbeat Action Sport Extreme,1745729,Breakbeat Action Sport Extreme,125,2020-03-18,https://usercontent.jamendo.com?type=album&id=...,https://prod-1.storage.jamendo.com/?trackid=17...,521514,AudioInfinity,https://usercontent.jamendo.com?type=artist&id...,Energetic


In [106]:
# Write the dataset to moodify_songs.json as a list of per-song records.
df.to_json("moodify_songs.json", orient="records", indent=4)

# Upload JSON to MongoDB (via the local songs API)


In [10]:
import requests
import json

# Endpoint of the locally running backend that receives one song per POST.
internal_api_url = "http://localhost:4000/api/songs"

# Load the records from the JSON export; the explicit encoding keeps the
# read independent of the platform's default.
with open("moodify_songs.json", "r", encoding="utf-8") as file:
  song_data = json.load(file)

# A single session reuses the connection across all uploads.
with requests.Session() as session:
  for song in song_data:
    # The timeout keeps a stalled server from blocking the loop forever.
    res = session.post(internal_api_url, json=song, timeout=10)
    # res.ok is False only for 4xx/5xx answers, so other success codes such
    # as 201 Created are no longer reported as errors.
    if not res.ok:
      print("Error Code: {}\nContent: {}".format(res.status_code, res.content))