# Msanii Inference


## Setup


In [None]:
# Install msanii quietly (-q) from the main branch of its GitHub repository.
%pip install -q git+https://github.com/Kinyugo/msanii@main

## Sampling


In [None]:
from omegaconf import OmegaConf

from msanii.config import (
    Audio2AudioConfig,
    InpaintingConfig,
    InterpolationConfig,
    OutpaintingConfig,
    SamplingConfig,
)
from msanii.scripts import (
    run_audio2audio,
    run_inpainting,
    run_interpolation,
    run_outpainting,
    run_sampling,
)

In [None]:
# Options handed to run_sampling. Replace the <...> placeholders with real
# paths before running the cells below.
sampling_config = dict(
    # Checkpoint to load and directory that receives the results.
    ckpt_path="<path-to-pipeline-ckpt>",
    output_dir="<path-to-output-directory>",
    batch_size=4,
    # Should be divisible by the downsampling factor of the U-Net.
    num_frames=8_387_584,
    # Other formats such as "ogg" or "mp3" may be used instead.
    output_audio_format="wav",
    seed=0,
    # "cpu" or "cuda".
    device="cuda",
    # Name of a torch.dtype.
    dtype="float16",
    num_inference_steps=20,
    verbose=True,
    use_neural_vocoder=True,
    # Mono or stereo.
    channels=2,
    num_samples=16,
)

In [None]:
# Use the SamplingConfig schema as the merge base and overlay the values set
# above. OmegaConf.merge gives precedence to later arguments, so the user
# dictionary must come last; otherwise the schema defaults would replace the
# values chosen in this notebook.
sampling_config = OmegaConf.merge(
    OmegaConf.structured(SamplingConfig),
    OmegaConf.create(sampling_config),
)

In [None]:
# Invoke msanii's sampling entry point with the merged configuration.
run_sampling(sampling_config)

## Audio2Audio


In [None]:
# Options handed to run_audio2audio. Replace the <...> placeholders with real
# paths before running the cells below.
audio2audio_config = dict(
    # Checkpoint to load and directory that receives the results.
    ckpt_path="<path-to-pipeline-ckpt>",
    output_dir="<path-to-output-directory>",
    batch_size=4,
    # Should be divisible by the downsampling factor of the U-Net.
    num_frames=8_387_584,
    # Other formats such as "ogg" or "mp3" may be used instead.
    output_audio_format="wav",
    seed=0,
    # "cpu" or "cuda".
    device="cuda",
    # Name of a torch.dtype.
    dtype="float16",
    num_inference_steps=20,
    verbose=True,
    use_neural_vocoder=True,
    # Folder containing the input audio files, plus data-loading options.
    data_dir="<path-to-folder-with-audio-files>",
    num_workers=4,
    pin_memory=True,
    # Controls how much noise is added; expected range is [0, 1].
    strength=0.1,
)

In [None]:
# Use the Audio2AudioConfig schema as the merge base and overlay the values
# set above. OmegaConf.merge gives precedence to later arguments, so the user
# dictionary must come last; otherwise the schema defaults would replace the
# values chosen in this notebook.
audio2audio_config = OmegaConf.merge(
    OmegaConf.structured(Audio2AudioConfig),
    OmegaConf.create(audio2audio_config),
)

In [None]:
# Invoke msanii's audio2audio entry point with the merged configuration.
run_audio2audio(audio2audio_config)

## Interpolation


In [None]:
# Options handed to run_interpolation. Replace the <...> placeholders with
# real paths before running the cells below.
interpolation_config = dict(
    # Checkpoint to load and directory that receives the results.
    ckpt_path="<path-to-pipeline-ckpt>",
    output_dir="<path-to-output-directory>",
    batch_size=4,
    # Should be divisible by the downsampling factor of the U-Net.
    num_frames=8_387_584,
    # Other formats such as "ogg" or "mp3" may be used instead.
    output_audio_format="wav",
    seed=0,
    # "cpu" or "cuda".
    device="cuda",
    # Name of a torch.dtype.
    dtype="float16",
    num_inference_steps=20,
    verbose=True,
    use_neural_vocoder=True,
    # The two folders of audio files to interpolate between, plus
    # data-loading options.
    first_data_dir="<path-to-folder-with-audio-files>",
    second_data_dir="<path-to-folder-with-audio-files>",
    num_workers=4,
    pin_memory=True,
    # Controls how much of the first sample is in the interpolation.
    ratio=0.5,
    # Controls how much noise is added; expected range is [0, 1].
    strength=0.1,
)

In [None]:
# Use the InterpolationConfig schema as the merge base and overlay the values
# set above. OmegaConf.merge gives precedence to later arguments, so the user
# dictionary must come last; otherwise the schema defaults would replace the
# values chosen in this notebook.
interpolation_config = OmegaConf.merge(
    OmegaConf.structured(InterpolationConfig),
    OmegaConf.create(interpolation_config),
)

In [None]:
# Invoke msanii's interpolation entry point with the merged configuration.
run_interpolation(interpolation_config)

## Inpainting


In [None]:
# Options handed to run_inpainting. Replace the <...> placeholders with real
# paths before running the cells below.
inpainting_config = dict(
    # Checkpoint to load and directory that receives the results.
    ckpt_path="<path-to-pipeline-ckpt>",
    output_dir="<path-to-output-directory>",
    batch_size=4,
    # Should be divisible by the downsampling factor of the U-Net.
    num_frames=8_387_584,
    # Other formats such as "ogg" or "mp3" may be used instead.
    output_audio_format="wav",
    seed=0,
    # "cpu" or "cuda".
    device="cuda",
    # Name of a torch.dtype.
    dtype="float16",
    num_inference_steps=20,
    verbose=True,
    use_neural_vocoder=True,
    # Folder containing the input audio files, plus data-loading options.
    data_dir="<path-to-folder-with-audio-files>",
    num_workers=4,
    pin_memory=True,
    # One mask string for each sample in the folder,
    # e.g. ["3-5,10-50", "4-10", ...].
    masks=[],
    eta=0.0,
    jump_length=10,
    jump_n_sample=10,
)

In [None]:
# Use the InpaintingConfig schema as the merge base and overlay the values
# set above. OmegaConf.merge gives precedence to later arguments, so the user
# dictionary must come last; otherwise the schema defaults would replace the
# values chosen in this notebook.
inpainting_config = OmegaConf.merge(
    OmegaConf.structured(InpaintingConfig),
    OmegaConf.create(inpainting_config),
)

In [None]:
# Invoke msanii's inpainting entry point with the merged configuration.
run_inpainting(inpainting_config)

## Outpainting


In [None]:
# Options handed to run_outpainting. Replace the <...> placeholders with real
# paths before running the cells below.
outpainting_config = dict(
    # Checkpoint to load and directory that receives the results.
    ckpt_path="<path-to-pipeline-ckpt>",
    output_dir="<path-to-output-directory>",
    batch_size=4,
    # Should be divisible by the downsampling factor of the U-Net.
    num_frames=8_387_584,
    # Other formats such as "ogg" or "mp3" may be used instead.
    output_audio_format="wav",
    seed=0,
    # "cpu" or "cuda".
    device="cuda",
    # Name of a torch.dtype.
    dtype="float16",
    num_inference_steps=20,
    verbose=True,
    use_neural_vocoder=True,
    # Folder containing the input audio files, plus data-loading options.
    data_dir="<path-to-folder-with-audio-files>",
    num_workers=4,
    pin_memory=True,
    # Number of outpainted spans, each half of num_frames.
    num_spans=2,
    eta=0.0,
    jump_length=10,
    jump_n_sample=10,
)

In [None]:
# Use the OutpaintingConfig schema as the merge base and overlay the values
# set above. OmegaConf.merge gives precedence to later arguments, so the user
# dictionary must come last; otherwise the schema defaults would replace the
# values chosen in this notebook.
outpainting_config = OmegaConf.merge(
    OmegaConf.structured(OutpaintingConfig),
    OmegaConf.create(outpainting_config),
)

In [None]:
run_outpainting(outpainting_config)