# riffusion colab demo

Run [riffusion](https://www.riffusion.com/about) in a gradio demo with a colab host

Riffusion project by [Seth Forsgren](https://twitter.com/sethforsgren) and [Hayk Martiros](https://github.com/hmartiro), colab notebook by [Jasper Gilley](https://twitter.com/0xjasper)

Feel free to DM Jasper on Twitter if you have any problems with the notebook

Some cool prompt ideas can be found at https://ai-art-wiki.com/wiki/Riffusion#Prompts

In [1]:
!nvidia-smi

Thu Apr 11 17:17:02 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 532.10                 Driver Version: 532.10       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4050 L...  WDDM | 00000000:01:00.0  On |                  N/A |
| N/A   38C    P5                4W /  N/A|   1537MiB /  6141MiB |      1%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import torch, gc
print(torch.__version__)
print(torch.version.cuda)
gc.collect()
torch.cuda.empty_cache()
torch.backends.cudnn.enabled = False
# !pip install --upgrade torch==2.0.1
# !pip install --upgrade torchaudio==2.0.1
# !pip install --upgrade torchvision==0.15.1

2.0.1+cu118
11.8


In [6]:
# @title Clone the inference repo
# !git clone https://github.com/hmartiro/riffusion-inference
# !pip install pickleshare
%cd riffusion-inference

D:\project\music-generation\riffusion-inference


Cloning into 'riffusion-inference'...


In [40]:
#@title Install requirements (you may need to restart the kernel after this)
!pip install -r requirements.txt
!pip install gradio
!pip install --upgrade pillow









Collecting pillow
  Using cached pillow-10.3.0-cp310-cp310-win_amd64.whl.metadata (9.4 kB)
Using cached pillow-10.3.0-cp310-cp310-win_amd64.whl (2.5 MB)
Installing collected packages: pillow
  Attempting uninstall: pillow
    Found existing installation: pillow 10.2.0
    Uninstalling pillow-10.2.0:
      Successfully uninstalled pillow-10.2.0
Successfully installed pillow-10.3.0




In [18]:
#@title Imports
from diffusers import DiffusionPipeline
from riffusion.spectrogram_image_converter import SpectrogramImageConverter
from riffusion.spectrogram_params import SpectrogramParams
from io import BytesIO
from IPython.display import Audio

pipe = DiffusionPipeline.from_pretrained("riffusion/riffusion-model-v1")
pipe = pipe.to("cuda")

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

In [19]:
#@title Define a `predict` function

params = SpectrogramParams()
converter = SpectrogramImageConverter(params)

def predict(prompt, negative_prompt):
    spec = pipe(
        prompt,
        negative_prompt=negative_prompt,
        width=768,
    ).images[0]

    wav = converter.audio_from_spectrogram_image(image=spec)
    wav.export('output.wav', format='wav')
    return 'output.wav', spec

In [20]:
#@title Run with Colab interface
prompt = "solo piano"#@param {type:"string"}
negative_prompt = "drums"#@param {type:"string"}

path, spec = predict(prompt, negative_prompt)

display(spec)
Audio('output.wav')

  0%|          | 0/51 [00:00<?, ?it/s]

ValueError: Expected an input with 32 mel bins. Found: 512

In [8]:
#@title Run a Gradio demo
import gradio as gr

gr.Interface(
    predict,
    inputs=["text", "text"],
    outputs=[gr.outputs.Audio(type='filepath'), gr.outputs.Image(type='pil')],
    title="Riffusion",
).launch(share=True, debug=True)

AttributeError: module 'gradio' has no attribute 'outputs'

In [None]:
#@title Upload your own files for style transfer
#@markdown #### Drop your audio files (.wav is best) in Colab's file uploader, then type the filename below and run

from google.colab import files
# uploaded = files.upload()

from scipy.io import wavfile
import numpy as np
from PIL import Image

filename = "rondo_alla_turca.wav"#@param {type:"string"}

# read uploaded file to wav
rate, data = wavfile.read(f'../{filename}')

# resample from 48000 to 44100
# from scipy.signal import resample
# data = resample(data, int(data.shape[0] * 44100 / 48000))

# convert to mono
data = np.mean(data, axis=1)

# convert to float32
data = data.astype(np.float32)

# take a random 7 second slice of the audio
data = data[rate*7:rate*14]

spectrogram = spectrogram_from_waveform(
    waveform=data,
    sample_rate=rate,
    # width=768,
    n_fft=8192,
    hop_length=512,
    win_length=8192,
)

def image_from_spectrogram(
    spectrogram: np.ndarray, max_volume: float = 50, power_for_image: float = 0.25
) -> Image.Image:
    """
    Compute a spectrogram image from a spectrogram magnitude array.
    """
    # Apply the power curve
    data = np.power(spectrogram, power_for_image)

    # Rescale to 0-255
    data = data * 255 / max_volume

    # Invert
    data = 255 - data

    # Convert to a PIL image
    image = Image.fromarray(data.astype(np.uint8))

    # Flip Y
    image = image.transpose(Image.FLIP_TOP_BOTTOM)

    # Convert to RGB
    image = image.convert("RGB")

    return image

spec = image_from_spectrogram(spectrogram)
# img.save('../rondo_alla_turca.png')
# display(img)

# Audio(data, rate=rate)

# wav2 = wav_bytes_from_spectrogram_image(img)
# with open("../inverse-spectro.wav", "wb") as f:
#     f.write(wav2[0].getbuffer())

# Audio('../inverse-spectro.wav')

In [None]:
#@title Audio-to-audio based on the previous generated sound: define new pipeline

import torch
from diffusers import StableDiffusionImg2ImgPipeline

# load the pipeline
device = "cuda"
MODEL_ID = "riffusion/riffusion-model-v1"
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.float16)

pipe = pipe.to(device)

#### NOTE: Colab doesn't have enough memory to simultaneously load both the base riffusion pipeline and the audio2audio pipeline. You'll need to re-run the first 'imports' cell if you want to run regular riffusion after instantiating the audio2audio pipeline in in this cell

#### you can run the audio2audio inference cell (below) as many times as you like, though

In [None]:
#@title Audio-to-audio: run inference
prompt = "epic orchestra symphony" #@param {type:"string"}

images = pipe(
    prompt=prompt,
    image=spec,
    strength=0.5,
    guidance_scale=7
).images

wav = converter.audio_from_spectrogram_image(images[0])
wav.export('audio2audio.wav', format='wav')
Audio('audio2audio.wav')