Spotify Playlist Extender
This project aims to synthesize new music from a given Spotify playlist, maintaining the genre and mood of the playlist
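Input a Spotify playlist URL (or a search term for a playlist) and the notebook synthesizes a new track in the style of that playlist.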

In [19]:
## Imports
# Detect whether the notebook is running on Google Colab. When it is, mount
# Google Drive; `colab` is read later when choosing the download directory.
colab = False
try:
    # are we running on Google Colab?
    import google.colab
    from google.colab import drive
    drive.mount('/content/drive')
    colab = True
except ImportError:
    # google.colab is not installed: this is a local (non-Colab) run.
    colab = False
except Exception as mount_error:
    # Best effort: keep the notebook usable when mounting Drive fails, but
    # report the reason instead of silently swallowing it. Catching Exception
    # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    print("Google Drive could not be mounted, continuing without Colab storage:", mount_error)
    colab = False
import os
import sys
import torch
import urllib
import requests
from IPython.display import Audio
import json
import random
import spotipy
from spotipy import oauth2
from spotipy.oauth2 import SpotifyClientCredentials
from os import path
from pydub import AudioSegment
from pydub.utils import mediainfo
import uuid
from urllib.request import urlretrieve
import numpy as np
import librosa

from models.ASTModel import AST

In [20]:
# Notebook configuration.
# Example playlist URL, kept for reference.
spotify_playlist_url = "https://open.spotify.com/playlist/0uwqioeN6C1NuXwjNVtY5f"
# Free-text playlist search term typed in by the user.
playlist_search = input("Search for a playlist")
# Current working directory, normalised to forward slashes.
cwd = "/".join(os.getcwd().split('\\'))
# Location of the temporary .wav file inside the working directory.
track_path = f"{cwd}/dlas-temp.wav"
# Default sample rate, stored as a string.
sample_rate = "44100"

In [21]:
## Get the text description from music genre, artist and mood
def get_text_description(genre: str, artist_name: str, mood: str):
    """Build a short text prompt from a genre, an artist name and a mood.

    Empty or missing fields are left out of the prompt.

    Args:
        genre: Music genre, e.g. "classical".
        artist_name: Name of the artist whose style should be evoked.
        mood: Mood of the music, e.g. "spooky".

    Returns:
        A string such as "spooky classical music in the style of Howard Shore",
        or "" when all three fields are empty.
    """
    genre = (genre or "").strip()
    artist_name = (artist_name or "").strip()
    mood = (mood or "").strip()

    # Nothing to describe: keep returning the empty string.
    if not (genre or artist_name or mood):
        return ""

    qualifiers = " ".join(part for part in (mood, genre) if part)
    description = f"{qualifiers} music" if qualifiers else "music"
    if artist_name:
        description += f" in the style of {artist_name}"
    return description

In [22]:
def jukebox_synthesis():
    """Placeholder for the Jukebox synthesis backend; does nothing and returns None."""
    return None

In [23]:
## Synthesize music using Riffusion 
# Based on library here: https://github.com/riffusion/riffusion
def riffusion_synthesis():
    """Load the pretrained Riffusion pipeline onto the available device.

    The pipeline ("riffusion/riffusion-model-v1") is moved to the GPU when
    CUDA is available and to the CPU otherwise, so the function no longer
    fails on machines without a GPU. No audio is generated yet.

    Returns:
        None.

    Raises:
        ImportError: If `diffusers` or `riffusion` is not installed.
    """
    # Heavy third-party dependencies are imported lazily, only when this
    # backend is actually selected.
    import torch
    from diffusers import DiffusionPipeline
    # Riffusion helpers; not used yet in this function.
    from riffusion.spectrogram_image_converter import SpectrogramImageConverter
    from riffusion.spectrogram_params import SpectrogramParams
    from riffusion.spectrogram_converter import SpectrogramConverter #, audio_from_spectrogram
    # from riffusion.audio import wav_bytes_from_spectrogram_image, spectrogram_from_waveform

    # Fall back to the CPU instead of crashing when CUDA is unavailable.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    pipe = DiffusionPipeline.from_pretrained("riffusion/riffusion-model-v1")
    pipe = pipe.to(device)
    print('hello world!')

In [24]:
## synthesize song from text description
def synthesize_song(text_description: str, genre: str, artist_name: str, mood: str, model_name: str="jukebox"):
    """Run the synthesis backend chosen by `model_name`.

    "riffusion" selects the Riffusion backend; every other value selects the
    Jukebox backend. The descriptive arguments are accepted but not forwarded
    to either backend.

    Returns:
        None.
    """
    use_riffusion = model_name == "riffusion"
    if not use_riffusion:
        jukebox_synthesis()
        return None

    riffusion_synthesis()
    return None

In [25]:
## classify the genre/mood of the song
# Based on approach from here: https://github.com/biboamy/music-repro
def _split_into_chunks(audio, seq_length: int):
    """Cut a 1-D waveform into windows of exactly `seq_length` samples.

    The waveform is split into consecutive full windows, followed by one extra
    window holding the last `seq_length` samples. A waveform shorter than one
    window is zero-padded into a single window.

    Raises:
        ValueError: If `audio` contains no samples.
    """
    n_samples = len(audio)
    if n_samples == 0:
        raise ValueError("Cannot classify an empty audio file")

    if n_samples < seq_length:
        # np.split(..., 0) would raise here; pad the short clip instead.
        padded = np.pad(audio, (0, seq_length - n_samples))
        return padded[np.newaxis, :]

    n_chunk = n_samples // seq_length
    chunks = np.split(audio[: n_chunk * seq_length], n_chunk)
    chunks.append(audio[-seq_length:])
    return np.array(chunks)


def get_track_details(wav_file_path: str, sample_rate: str):
    """Predict the genre of an audio file with the pretrained AST model.

    The file is loaded at `sample_rate`, cut into 10-second windows, and
    passed through the model. Sigmoid scores are averaged over the windows
    and the highest-scoring class is returned.

    Args:
        wav_file_path: Path of the audio file to classify.
        sample_rate: Sampling rate to load the audio at, as a numeric string.

    Returns:
        One of "blues", "classical", "country", "disco", "hiphop", "jazz",
        "metal", "pop", "reggae" or "rock".

    Raises:
        ValueError: If the audio file contains no samples.
    """
    # create input audio
    sampling_rate = int(sample_rate) # get sampling rate as an integer
    seq_length = 10 * sampling_rate

    audio, _ = librosa.load(wav_file_path, sr=sampling_rate)
    print('Original audio length (seconds):', len(audio) / sampling_rate)

    audio_chunks = torch.from_numpy(_split_into_chunks(audio, seq_length))
    print('Input shape:', audio_chunks.shape)

    # create AST model
    os.makedirs('../../pretrained_models/', exist_ok=True) # TODO: gives error if pre-trained model isn't 2 parent directories down (some hardcoding in other script)
    model = AST(n_class=10, reprog_front='skip', map_num=5)
    # torch.load returns the saved state dict here; it is mapped to the CPU.
    state_dict = torch.load('./models/best_model.pth', map_location='cpu')
    model.load_state_dict(state_dict)
    model.eval()

    # predict output (inference only, so gradient tracking is switched off)
    with torch.no_grad():
        output, ori_emb, transformed_emb = model(audio_chunks)
    output = torch.sigmoid(output).detach().cpu().numpy()
    ori_emb = ori_emb.detach().cpu().numpy()
    transformed_emb = transformed_emb.detach().cpu().numpy()
    print(output.shape, ori_emb.shape, transformed_emb.shape)

    # result
    mapping = {
                0: "blues",
                1: "classical",
                2: "country",
                3: "disco",
                4: "hiphop",
                5: "jazz",
                6: "metal",
                7: "pop",
                8: "reggae",
                9: "rock",
            }

    # Average the scores over all windows and pick the best class.
    idx = int(np.argmax(output.mean(0), axis=0))
    genre = mapping[idx]
    print('This song is predicted as:', genre)
    return genre
        

In [26]:
## Get a spotify track wav file from a playlist/album 
# Based on approach by: https://github.com/teticio/audio-diffusion

# convert a spotify preview url mp3 to a wav file and save to folder
def mp3_to_wav(preview_url: str):
    """Download a preview mp3 and convert it to a wav file.

    Both files are named "dlas-temp" and are written to the Google Drive
    folder when running on Colab, otherwise to the current working directory.
    Existing files with those names are overwritten.

    Args:
        preview_url: URL of the mp3 preview to download.

    Returns:
        A tuple `(output_file, sample_rate)`: the path of the wav file and the
        sample rate reported by `mediainfo` for it.

    Raises:
        ValueError: If `preview_url` is empty.
    """
    if not preview_url:
        raise ValueError("preview_url must be a non-empty URL")

    track_name = "dlas-temp" #'dlas-track{}'.format(str(uuid.uuid1()))
    if colab:
        directory = "/content/drive/My Drive/DLAS"
    else:
        directory = os.getcwd().replace('\\', '/')
        print("dir:", directory)

    # The Drive folder may not exist yet; create it before downloading.
    os.makedirs(directory, exist_ok=True)

    # assign files
    input_file = "{}/{}{}".format(directory, track_name, ".mp3")
    output_file = "{}/{}{}".format(directory, track_name, ".wav")

    urlretrieve(preview_url, input_file) # download the song preview mp3

    # convert mp3 file to wav file
    sound = AudioSegment.from_mp3(input_file)
    sound.export(output_file, format="wav")
    sound_info = mediainfo(output_file)
    sample_rate = sound_info['sample_rate']

    return output_file, sample_rate


def get_track(playlist_url: str):
    """Search Spotify for a playlist and download one random track preview.

    The user picks one of the playlists found for the search term. A random
    track with a preview URL is then chosen from that playlist, downloaded,
    and converted to wav. The returned artist is the first artist of the
    chosen track.

    Args:
        playlist_url: Search term used to look up playlists.

    Returns:
        A tuple `(audio_path, sample_rate, selected_artist)`. The artist is ""
        when the chosen track lists no artist name.

    Raises:
        ValueError: If the search term is empty or the selection is not a number.
        IndexError: If no playlist is found, the selection is out of range, or
            no track in the playlist has a preview.
        requests.HTTPError: If a Spotify request returns an error status.
    """
    if not playlist_url or not playlist_url.strip():
        raise ValueError("playlist_url must be a non-empty search term")

    request_timeout = 30  # seconds; avoids hanging forever on a stalled connection

    # Get temporary API credentials
    token_response = requests.get(
        "https://open.spotify.com/get_access_token?reason=transport&productType=embed",
        timeout=request_timeout,
    )
    token_response.raise_for_status()
    credentials = token_response.json()
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": "Bearer " + credentials["accessToken"]
    }

    # Search for playlists
    search_string = playlist_url #input("Search: ")
    search_response = requests.get(
        f"https://api.spotify.com/v1/search?q={urllib.parse.quote(search_string)}&type=playlist",
        headers=headers,
        timeout=request_timeout,
    )
    search_response.raise_for_status()
    response = search_response.json()
    print(response) #(json.dumps(response, indent=2))

    # Skip null entries so the numbered list and the index lookup stay in sync.
    found_playlists = [playlist for playlist in response["playlists"]["items"] if playlist]
    if not found_playlists:
        raise IndexError(f"No playlists found for search term {search_string!r}")

    # List results
    playlists = [f"{index + 1}. {playlist['name']}" for index, playlist in enumerate(found_playlists)]

    selection = input('Select a playlist (by number): \n' + ','.join(f"{pl}\n" for pl in playlists)) # Select a playlist from the list (based on the list index)
    selected_index = int(selection) - 1
    # Reject 0 and negatives explicitly; they would otherwise wrap around.
    if not 0 <= selected_index < len(found_playlists):
        raise IndexError(f"Selection must be between 1 and {len(found_playlists)}, got {selection!r}")
    selected_playlist_id = found_playlists[selected_index]["id"] # get the ID

    # get the playlist tracks
    tracks_response = requests.get(
        f"https://api.spotify.com/v1/playlists/{selected_playlist_id}/tracks",
        headers=headers,
        timeout=request_timeout,
    )
    tracks_response.raise_for_status()
    tracks = tracks_response.json()["items"]
    print("tracks", tracks)

    # Keep the preview URL and artist of each track together, so the returned
    # artist always belongs to the returned preview.
    candidates = []
    for item in tracks:
        track = item.get("track")
        if not track or not track.get("preview_url"):
            continue  # null track entry or no preview available
        artists = track.get("artists") or []
        artist_name = (artists[0].get("name") if artists else None) or ""
        candidates.append((track["preview_url"], artist_name))

    if not candidates:
        raise IndexError("None of the tracks in the selected playlist has a preview URL")

    selected_track, selected_artist = random.choice(candidates)

    print("selected track: ", selected_track)
    print("selected artist: ", selected_artist)

    audio_path, sample_rate = mp3_to_wav(preview_url=selected_track)
    print("audio file:", audio_path)
    print("sample rate:", sample_rate)

    return audio_path, sample_rate, selected_artist
    

In [27]:
## Define custom function
def extend_spotify_playlist(playlist_url: str, model_name: str="jukebox"):
    """Entry point of the playlist-extension pipeline.

    Steps 1 and 2 (fetching a track and classifying its genre) are currently
    commented out, so `playlist_url` is not used and fixed placeholder values
    are handed to the synthesis step.

    Args:
        playlist_url: Playlist to extend (currently unused).
        model_name: Synthesis backend, forwarded to `synthesize_song`.

    Returns:
        None.
    """
    # 1. Get a track .wav file from the Spotify playlist
    # track_path, sample_rate, artist = get_track(playlist_url) # .wav filepath?

    # 2. Get the details
    # genre = get_track_details(track_path, sample_rate)

    # 3. Synthesize with fixed placeholder values
    placeholder_request = {
        "text_description": "",
        "genre": "classical",
        "artist_name": "Howard Shore",
        "mood": "spooky",
    }
    synthesize_song(model_name=model_name, **placeholder_request)
    return None

In [28]:
# os.makedirs(f'{cwd}/pretrained_models/', exist_ok=True)
# print(os.path.exists('./models/best_model.pth'))

# Run the pipeline for the playlist search term, using the Riffusion backend.
extend_spotify_playlist(
    model_name="riffusion",
    playlist_url=playlist_search,
)

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
- CompVis/stable-diffusion-v1-4 
- CompVis/stable-diffusion-v1-3 
- CompVis/stable-diffusion-v1-2 
- CompVis/stable-diffusion-v1-1 
- runwayml/stable-diffusion-v1-5 
- runwayml/stable-diffusion-inpainting 
 you should change 'sample_size' to 64 in the configuration file. Please make sure to update the config accordingly as leaving `sample_size=32` in the config might lead to incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for the `unet/config.json` file
  deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
