<a href="https://colab.research.google.com/github/MLo7Ghinsan/DiffSinger_colab_notebook_MLo7/blob/main/NSF_hifigan_finetuning_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Setup**

In [1]:
#@title # Mount Google Drive and Setup
%cd /content

#ill put the imports here too ig
from IPython.display import clear_output
import os
import shutil
import yaml
from google.colab import drive
drive.mount("/content/drive")
!rm -rf /content/sample_data
!git clone https://github.com/openvpi/DiffSinger.git
!git clone https://github.com/openvpi/SingingVocoders
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install click einops h5py librosa lightning matplotlib mido numpy praat-parselmouth preprocessing pyworld PyYAML torchmetrics tqdm
#for onnx export
!pip install onnx onnxruntime onnxsim
!apt-get install aria2
clear_output()
!aria2c https://github.com/openvpi/SingingVocoders/releases/download/v0.0.2/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip
!7z x /content/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip -o/content/SingingVocoders/pretrained/hifigan
!rm /content/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip
#incase theyll add it in the future lmao
#!aria2c https://github.com/openvpi/DiffSinger/releases/download/v2.1.0/rmvpe.zip
#!7z x /content/rmvpe.zip -o/content/SingingVocoders/pretrained
#!rm /content/rmvpe.zip
clear_output()


# **Preprocess data for training**

In [None]:
#@title # Extract Data
#@markdown ___

import re
import soundfile as sf
import librosa
import os
import numpy as np

#@markdown Path to zip file containing your audio data
data_zip_path = "" # @param {type:"string"}

#@markdown Segment interval in seconds
segment_interval = 15 # @param {type:"slider", min:2, max:60, step:1}

train_path = "/content/audio_data/input"
npz_path = "/content/audio_data/output"


!rm -rf /content/audio_data >/dev/null 2>&1

if not os.path.exists(train_path):
    os.makedirs(train_path)
    os.makedirs(npz_path)
!7z e "$data_zip_path" -o{train_path} "*.wav" -r

#audio resample AND segmentation to avoid errors
def resample_and_convert_audio(audio_path, sample_rate=44100):
    """Resample a WAV file to ``sample_rate`` and split it if it is too long.

    The file is loaded at its native rate and resampled when that rate differs
    from ``sample_rate``. A clip longer than the notebook-level
    ``segment_interval`` (in seconds) is cut into consecutive chunks of at most
    that length, written next to the source as ``<stem>_segment_<i>.wav``.
    The source file is then removed, so the over-long, un-resampled original
    does not remain in the training folder alongside its segments. A clip that
    already fits is rewritten in place at ``sample_rate``.

    Args:
        audio_path: Path to the WAV file to process.
        sample_rate: Target sampling rate in Hz.
    """
    audio, sr = librosa.load(audio_path, sr=None)

    if sr != sample_rate:
        audio = librosa.resample(y=audio, orig_sr=sr, target_sr=sample_rate)

    file_name = os.path.basename(audio_path)
    samples_per_segment = int(segment_interval * sample_rate)

    # Decide on the resampled sample count so the segment boundaries match the
    # data that is actually written.
    if len(audio) <= samples_per_segment:
        sf.write(audio_path, audio, sample_rate)
        print(f"Resampled {file_name} to {sample_rate} Hz.")
        return

    stem = os.path.splitext(file_name)[0]
    out_dir = os.path.dirname(audio_path)
    total_segments = 0
    # Stepping over the real sample range never produces an empty segment.
    for start_sample in range(0, len(audio), samples_per_segment):
        segment_audio = audio[start_sample:start_sample + samples_per_segment]
        segment_path = os.path.join(out_dir, f"{stem}_segment_{total_segments}.wav")
        sf.write(segment_path, segment_audio, sample_rate)
        total_segments += 1

    # The segments replace the source; leaving it would keep an over-long file
    # at its original sample rate in the dataset.
    os.remove(audio_path)
    print(f"Resampled {file_name} to {sample_rate} Hz.")
    print(f"Segmented {file_name} into {total_segments} parts.")

# Collect every extracted .wav under train_path, then resample/segment each one.
wav_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(train_path)
    for name in names
    if name.endswith(".wav")
]
for wav_path in wav_paths:
    resample_and_convert_audio(wav_path)


In [10]:
#@title # Edit Config
#@markdown ___

import yaml
#@markdown Model's name and save path
exp_name = "" # @param {type:"string"}
save_path = "" # @param {type:"string"}

#@markdown Pitch extractor algorithm
f0_ext = "parselmouth" # @param ["parselmouth", "harvest"]
f0_min = 120 # @param {type:"slider", min:0, max:250, step:2}
f0_max = 2100 # @param {type:"slider", min:800, max:4180, step:20}


#@markdown Precision option
precision = "16-mixed" # @param ["32-true", "bf16-mixed", "16-mixed"]

#@markdown data aug option
data_aug = False # @param {type:"boolean"}
data_aug_probability = 0.5 # @param {type:"slider", min:0.1, max:3, step:0.1}

#@markdown Amount of validation files you want to use (can't exceed the amount of train files)
val_amount = 8 # @param {type:"slider", min:1, max:18, step:1}

# Both values are used to build output paths here and in the training cell;
# leaving them blank would send the data to "/data" and break the train command.
if not exp_name or not save_path:
    raise ValueError("Please fill in both exp_name and save_path before running this cell.")

config_file = "/content/SingingVocoders/configs/ft_hifigan.yaml"

# The fine-tuning config is edited in place: load it, override the fields
# controlled by the form above, and write it back.
with open(config_file, "r") as config:
    vocoder_config = yaml.safe_load(config)
vocoder_config["data_input_path"] = ["/content/audio_data/input"]
vocoder_config["data_out_path"] = [save_path + "/data"]
vocoder_config["val_num"] = val_amount
vocoder_config["pe"] = f0_ext
vocoder_config["f0_min"] = f0_min
vocoder_config["f0_max"] = f0_max
# Always write the flag: the file is rewritten in place, so a previous run with
# augmentation enabled would otherwise leave key_aug switched on.
vocoder_config["key_aug"] = data_aug
if data_aug:
    vocoder_config["key_aug_prob"] = data_aug_probability
vocoder_config["pl_trainer_accelerator"] = "gpu"
vocoder_config["pl_trainer_precision"] = precision
vocoder_config["finetune_ckpt_path"] = "/content/SingingVocoders/pretrained/hifigan/nsf_hifigan_44.1k_hop512_128bin_2024.02.ckpt"
with open(config_file, "w") as config:
    yaml.dump(vocoder_config, config)

print("\n")





In [None]:
#@title # Preprocess
#@markdown ___

# Run the preprocessing script from inside the SingingVocoders checkout.
# NOTE(review): presumably process.py resolves some paths relative to the repo root — confirm.
%cd /content/SingingVocoders
# Uses the ft_hifigan.yaml that the "Edit Config" cell rewrote.
# NOTE(review): the meaning of `--strx 1` is not visible from this notebook — see process.py.
!python /content/SingingVocoders/process.py --config /content/SingingVocoders/configs/ft_hifigan.yaml --strx 1
# Return to the notebook's default working directory.
%cd /content

# **Training**

In [None]:
import re
import os
import yaml

%cd /content/SingingVocoders
#@title # Training
#@markdown ___
#@markdown Change config_path to path of the config.yaml for resuming | leave blank for training from scratch
config_path = "" # @param {type:"string"}
resume_training = True # @param {type:"boolean"}

#@markdown Model save interval
save_interval = 2000 # @param {type:"slider", min:100, max:10000, step:100}
save_interval = int(save_interval / 2)

if config_path:
    config_path = config_path
else:
    config_path = "/content/SingingVocoders/configs/ft_hifigan.yaml"

training_utils_path = "/content/SingingVocoders/utils/training_utils.py"
with open(training_utils_path, "r") as f:
    edit_relative_path = f.read()
new_relative = "relative_path = filepath.relative_to(Path('/content').resolve())"
pattern = r"relative_path\s*=\s*.*"
edit_relative_path = re.sub(pattern, new_relative, edit_relative_path)
with open(training_utils_path, "w") as f:
    f.write(edit_relative_path)

with open(config_path, "r") as config:
    bitch = yaml.safe_load(config)
bitch["val_check_interval"] = save_interval #questionable
with open(config_path, "w") as config:
    yaml.dump(bitch, config)

if resume_training:
    exp_name = os.path.basename(os.path.dirname(config_path))
    save_path = os.path.dirname(os.path.dirname(config_path))
    log = save_path + "/" + exp_name
else:
    log = save_path + "/" + exp_name

logdir = log
%reload_ext tensorboard
%tensorboard --logdir {logdir}
!python /content/SingingVocoders/train.py --config {config_path} --exp_name {exp_name} --work_dir {save_path}

# **Miscellaneous**

In [None]:
#@title # Export ONNX
#@markdown ___
#test exporting because the export_ckpt.py script is you know dying
#this is a must before using diffsinger export script
import torch
import json
import os

#@markdown path to your vocoder ckpt
ckpt_path = "" # @param {type:"string"}
ckpt_folder = os.path.dirname(ckpt_path)
ckpt_config = ckpt_folder + "/config.yaml"

#@markdown your vocoder onnx save path
export_path = "" # @param {type:"string"}
save_path =  export_path + "/model.ckpt"

#@markdown your vocoder name
name = "" # @param {type:"string"}
aaa2x = torch.load(ckpt_path, map_location=torch.device("cpu"))['state_dict']
ckp = {}
for i in aaa2x:
    i: str
    if 'generator.' in i:
        ckp[i.replace('generator.', '')] = aaa2x[i]
torch.save({'generator': ckp}, save_path)

# copied from openvpi's vocoder
config_data = {
    "resblock": "1",
    "num_gpus": 4,
    "batch_size": 10,
    "learning_rate": 0.0002,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,
    "upsample_rates": [8, 8, 2, 2, 2],
    "upsample_kernel_sizes": [16, 16, 4, 4, 4],
    "upsample_initial_channel": 512,
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "discriminator_periods": [3, 5, 7, 11, 17, 23, 37],
    "segment_size": 16384,
    "num_mels": 128,
    "num_freq": 1025,
    "n_fft": 2048,
    "hop_size": 512,
    "win_size": 2048,
    "sampling_rate": 44100,
    "fmin": 40,
    "fmax": 16000,
    "fmax_for_loss": None,
    "num_workers": 16,
    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "world_size": 1
    }
}
with open(f"{export_path}/config.json", "w") as json_file:
    json.dump(config_data, json_file, indent=4)

with open(ckpt_config, "r") as config:
    add_vocoder_ckpt = yaml.safe_load(config)
add_vocoder_ckpt["vocoder_ckpt"] = save_path
with open(ckpt_config, "w") as config:
    yaml.dump(add_vocoder_ckpt, config)

!cp {ckpt_config} {export_path}

!python /content/DiffSinger/scripts/export.py nsf-hifigan --config {export_path}/config.yaml --out {export_path} --name {name}