# Import the relevant libraries

In [45]:
import pandas as pd
from creapy import creapy
import plotly
from pathlib import Path
import sys
import logging
import os
import time
import librosa
import numpy as np
import torch
from scipy.io.wavfile import write
from tqdm import tqdm
import utils
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
from wavlm import WavLM, WavLMConfig
from datetime import datetime
import IPython.display as ipd 
import json
from praatio import textgrid as tg
from praatio.utilities.constants import Interval

## Load the models and helper functions
This includes setting the location of the downloaded WavLM checkpoint and of the FreeVC model checkpoint to be used.

In [138]:
# Path to the WavLM-Large checkpoint; read by get_cmodel().
wavlm_large_path = 'wavlm/WavLM-Large.pt'
# Path to the FreeVC generator checkpoint that is loaded into net_g further down.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
freevc_chpt_path = '/nfs/deepspeech/home/lameris/libri_cpps/G_750000.pth'
# Configure CreaPy's creak threshold (presumably the probability above which a
# frame is classified as creaky -- confirm against the CreaPy documentation).
creapy.set_config(creak_threshold=0.5)

def arg_creator(source, target, outpath, creak_values, cpps_values):
    """Pack the conversion settings into the positional list used by ``convert``.

    Returns:
        A new list ``[source, target, outpath, creak_values, cpps_values]``.
    """
    packed = (source, target, outpath, creak_values, cpps_values)
    return list(packed)

def get_cmodel():
    """Build the WavLM model from the checkpoint at ``wavlm_large_path``.

    The checkpoint is expected to hold the model configuration under ``'cfg'``
    and the weights under ``'model'``.

    Returns:
        A ``WavLM`` instance with the checkpoint weights loaded, in eval mode.
    """
    state = torch.load(wavlm_large_path)
    wavlm = WavLM(WavLMConfig(state['cfg']))
    wavlm.load_state_dict(state['model'])
    wavlm.eval()
    return wavlm

def generate_timestamp():
    """Return the current local time as a filename-friendly string.

    The format is ``YYYYMMDD_HHMMSS_mmm``, where ``mmm`` is the millisecond
    part of the current time. It is always zero-padded to three digits so that
    every timestamp has the same width and the strings sort chronologically.

    Returns:
        The formatted timestamp string (no file extension is appended).
    """
    now = datetime.now()
    # Integer milliseconds are in 0..999, so three digits are needed for a fixed width.
    millis = now.microsecond // 1000
    return now.strftime("%Y%m%d_%H%M%S_") + str(millis).zfill(3)

def convert(args):
    """Convert the source utterance to the target speaker's voice and save it.

    Relies on the module-level ``hps``, ``smodel``, ``cmodel`` and ``net_g``.

    Args:
        args: Sequence laid out as ``[source, target, outpath, creak_values,
            cpps_values]`` (see ``arg_creator``). ``creak_values`` and
            ``cpps_values`` are added to an all-zero ``(1, 1, n_frames)``
            float32 tensor, so each may be a scalar or a tensor that
            broadcasts against that shape.

    Returns:
        The converted waveform as a NumPy float array. The audio is also
        displayed inline and written to ``outpath`` as a WAV file.
    """
    print("converting...")
    sampling_rate = hps.data.sampling_rate

    # Target speaker: load, down-mix to mono if necessary, trim silence, embed.
    wav_tgt, _ = librosa.load(args[1], sr=sampling_rate)
    if wav_tgt.ndim > 1:
        wav_tgt = wav_tgt.mean(axis=0)
    wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
    g_tgt = torch.from_numpy(smodel.embed_utterance(wav_tgt)).unsqueeze(0)

    # Source utterance: load, down-mix to mono if necessary, add a batch axis.
    wav_src, _ = librosa.load(args[0], sr=sampling_rate)
    if wav_src.ndim > 1:
        wav_src = wav_src.mean(axis=0)
    wav_src = torch.from_numpy(wav_src).unsqueeze(0)

    c = utils.get_content(cmodel, wav_src)

    def infer_with_frames(n_frames):
        # Expand the creak / CPPS controls to one value per frame and run the model.
        base = torch.zeros((1, 1, n_frames), dtype=torch.float32)
        return net_g.infer(c, g=g_tgt, creaks=base + args[3], cpps=base + args[4])

    # 320 samples per frame -- presumably the content encoder's hop size; TODO confirm.
    n_frames = wav_src.size(1) // 320
    try:
        tgt_audio = infer_with_frames(n_frames - 1)
    except Exception:
        # The frame count can be off by one relative to the control tensors;
        # retry with the full count. ``Exception`` (rather than a bare
        # ``except``) keeps KeyboardInterrupt / SystemExit propagating.
        tgt_audio = infer_with_frames(n_frames)
    tgt_audio = tgt_audio[0][0].detach().cpu().float().numpy()

    timestamp = generate_timestamp()
    print(timestamp)
    ipd.display(ipd.Audio(tgt_audio, rate=sampling_rate))
    write(args[2], sampling_rate, tgt_audio)
    return tgt_audio

def create_creak_tensor(wav_len, creak_segments, sr, hop_length=320):
    """Build a per-frame control tensor that is constant over given time segments.

    Args:
        wav_len: Length of the waveform in samples.
        creak_segments: Iterable of ``(start, end, value)`` items, with
            ``start`` and ``end`` given in seconds. Segments are applied in
            order, so a later segment overwrites an earlier one where they
            overlap.
        sr: Sampling rate used to convert the segment times to samples.
        hop_length: Number of samples per frame. Defaults to 320, the value
            that was previously hard-coded.

    Returns:
        A float32 tensor of shape ``(1, 1, wav_len // hop_length)`` that holds
        ``value`` inside each segment and zero everywhere else. Segment bounds
        beyond the end of the tensor are clipped by slicing.
    """
    n_frames = int(wav_len // hop_length)
    creaks = torch.zeros((1, 1, n_frames), dtype=torch.float32)

    for segment in creak_segments:
        # Seconds -> samples -> frame index (floored).
        first_frame = int(segment[0] * sr // hop_length)
        last_frame = int(segment[1] * sr // hop_length)
        creaks[0, 0, first_frame:last_frame] = segment[2]
    return creaks

def json_to_textgrid(json_path, textgrid_path):
    """Write the word timings of the first JSON segment to a Praat TextGrid.

    Only ``data['segments'][0]`` is read. The first word keeps its own start
    time. Every following word starts at the previous word's original end
    time when the two already touch, and 0.01 after that end time otherwise.

    Args:
        json_path: Path of the JSON file holding ``segments`` with ``start``,
            ``end`` and ``words`` entries.
        textgrid_path: Path the TextGrid is saved to.

    Returns:
        A list of ``{'start', 'end', 'word'}`` dicts with the adjusted timings.
    """
    with open(json_path) as handle:
        data = json.load(handle)
    first_segment = data['segments'][0]
    segment_start = first_segment['start']
    segment_end = first_segment['end']
    words = first_segment['words']

    textgrid = tg.Textgrid()
    word_tier = tg.IntervalTier('word', [], segment_start, segment_end)

    adjusted = []
    previous = None
    for word in words:
        if previous:
            touching = previous['end'] == word['start']
            start = previous['end'] if touching else previous['end'] + 0.01
        else:
            start = word['start']
        end = word['end']
        label = word['word']

        adjusted.append({'start': start, 'end': end, 'word': label})
        word_tier.insertEntry(Interval(start, end, label))
        previous = word

    textgrid.addTier(word_tier)
    textgrid.save(textgrid_path, format='long_textgrid', includeBlankSpaces=False)
    return adjusted

# Hyper-parameters (data / train / model sections) for the synthesizer.
hps = utils.get_hparams_from_file('configs/freevc.json')

# Synthesizer network; the two positional sizes are derived from the STFT
# filter length and from the training segment size in frames.
net_g = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model
)

utils.load_checkpoint(freevc_chpt_path, net_g, optimizer=None, strict=True)
# The synthesizer is only used for inference here, so switch it to eval mode,
# just as get_cmodel() does for WavLM (assumes SynthesizerTrn is a torch.nn.Module).
_ = net_g.eval()
# Content model (WavLM) and speaker encoder used by convert().
cmodel = get_cmodel()
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt', device='cpu')

INFO:root:Loaded checkpoint '/nfs/deepspeech/home/lameris/libri_cpps/G_750000.pth' (iteration 1642)
INFO:wavlm.WavLM:WavLM Config: {'extractor_mode': 'layer_norm', 'encoder_layers': 24, 'encoder_embed_dim': 1024, 'encoder_ffn_embed_dim': 4096, 'encoder_attention_heads': 16, 'activation_fn': 'gelu', 'layer_norm_first': True, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'feature_grad_mult': 1.0, 'normalize': True, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.0, 'dropout_input': 0.0, 'dropout_features': 0.0, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': 'static', 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': 'static', 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'relative_position_embedding': True, 'num_

## Set the arguments
Here we set the following arguments for the creaky voice conversion:

1. **source_path** indicates the audio file of which we want the linguistic content.
2. **target_path** indicates the audio file containing speech of the target speaker.
3. **outpath** is the location where the converted audio will be saved.
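4. **average_creak** is the initial creak value over the complete utterance that will be supplied to the model.
5. **average_cpps** is the initial CPPS value over the complete utterance that will be supplied to the model.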

We also create the output folder specified in the arguments


In [127]:
# Audio whose linguistic content is converted.
source_path = "./test_audio/prag.wav"
# Audio of the target speaker.
target_path = "./test_audio/enhanced_extra_voice.wav"
# Where the converted audio is written.
outpath = "data/out/creaky_vctk_male_combined.wav"
# Utterance-wide creak and CPPS values supplied to the model.
average_creak = 0
average_cpps = -1

args = arg_creator(source_path, target_path, outpath, average_creak, average_cpps)
# Create the output folder. Path.parent also handles an outpath without a
# directory component, for which the old string splitting produced '' and
# made os.makedirs raise.
Path(args[2]).parent.mkdir(parents=True, exist_ok=True)
print(args)

['./test_audio/prag.wav', './test_audio/enhanced_extra_voice.wav', 'data/out/creaky_vctk_male_combined.wav', 0, -1]


In [148]:
# Audio whose linguistic content is converted.
# NOTE(review): absolute, machine-specific paths -- adjust for your environment.
source_path = "/home/lameris/CreakVC/test_audio/16000_female_voice_segment.wav"
# Audio of the target speaker.
target_path = "/home/lameris/CreakVC/test_audio/16000_female_target_end.wav"
# Where the converted audio is written.
outpath = "data/out/creaky_vctk_female.wav"
# Utterance-wide creak and CPPS values supplied to the model.
average_creak = -7
average_cpps = -15

args = arg_creator(source_path, target_path, outpath, average_creak, average_cpps)
# Create the output folder. Path.parent also handles an outpath without a
# directory component, for which the old string splitting produced '' and
# made os.makedirs raise.
Path(args[2]).parent.mkdir(parents=True, exist_ok=True)

In [111]:
# Play back the source (args[0]) and target-speaker (args[1]) recordings.
ipd.display(ipd.Audio(args[0]))
ipd.display(ipd.Audio(args[1]))

# Perform the conversion

In [149]:
# Run the conversion with the current args; convert() also plays the result
# and writes it to args[2].
creaky_audio = convert(args)

converting...
20240708_143337_225


In [147]:
# NOTE(review): average_creak / average_cpps are reset here but not used in this cell.
average_creak = 0
average_cpps = 0

sr = hps.data.sampling_rate
# Per-frame control tensors: value -1 from 0 s to 4 s, zero elsewhere.
# NOTE(review): the tensor length is taken from the previously converted audio
# (creaky_audio), not from the source file -- confirm the frame counts match.
new_cpps = create_creak_tensor(creaky_audio.shape[0], [(0, 4, -1)], sr)
new_creak = create_creak_tensor(creaky_audio.shape[0], [(0, 4, -1)], sr)
# Alternative (disabled): use the utterance-wide scalar instead of a tensor.
#new_creak = average_creak
args = arg_creator(source_path, target_path, outpath, new_creak, new_cpps)
new_creaky_audio = convert(args)

converting...
20240708_143326_184


In [16]:
# NOTE(review): average_creak / average_cpps are reset here but not used in this cell.
average_creak = 0
average_cpps = 0

sr = hps.data.sampling_rate
# Per-frame control tensors covering 2.5 s to 5 s: CPPS value -2, creak value -1;
# both are zero outside that span.
new_cpps = create_creak_tensor(creaky_audio.shape[0], [(2.5, 5, -2)], sr)
new_creak = create_creak_tensor(creaky_audio.shape[0], [(2.5, 5, -1)], sr)
args = arg_creator(source_path, target_path, outpath, new_creak, new_cpps)
new_creaky_audio = convert(args)

converting...
20240708_113109_492


In [10]:
# average_cpps (scalar 0) is passed to the model below; average_creak is not
# used in this cell.
average_creak = 0
average_cpps = 0

sr = hps.data.sampling_rate
# Per-frame creak tensor: value 4 from 2.5 s to 5 s, zero elsewhere.
new_creak = create_creak_tensor(creaky_audio.shape[0], [(2.5, 5, 4)], sr)
args = arg_creator(source_path, target_path, outpath, new_creak, average_cpps)
new_creaky_audio = convert(args)

converting...


20240708_112517_120


## Run WhisperX
* We run speech recognition to transcribe the generated utterance
* We convert the timestamps from JSON to TextGrid in order to prepare them for CreaPy

In [None]:
# Run WhisperX on the converted audio (IPython shell escape). The JSON read
# below is expected in the --output_dir folder.
# NOTE(review): the file name used here (creaky_vctk_liam_cpp.wav) differs from
# the outpath values set in the cells above -- confirm it is the intended file.
!whisperx "data/out/creaky_vctk_liam_cpp.wav" --model medium.en --output_dir data/out/creaky_vctk_whisper --language en --verbose=False

# Read the JSON file with the start and end times of the words, write them to
# a TextGrid, and keep the adjusted word timings for plotting.
timestamps = json_to_textgrid('data/out/creaky_vctk_whisper/creaky_vctk_liam_cpp.json', 'data/out/creaky_vctk_whisper/creaky_vctk_liam_cpp.TextGrid')
print(timestamps)

## Run CreaPy
This enables us to quantify and visualize the creak probability

In [None]:
# Run CreaPy on the converted audio together with its word-level TextGrid.
# NOTE(review): this rebinds the global `sr` to the value returned by CreaPy.
X_test, y_pred, sr = creapy.process_file(textgrid_path='./data/out/creaky_vctk_whisper/creaky_vctk_liam_cpp.TextGrid', audio_path='./data/out/creaky_vctk_liam_cpp.wav')

## Smoothen the output

In [None]:
# 10-point moving average of the predictions; mode='same' returns an output as
# long as the longer input (i.e. as long as y_pred when it has at least 10 points).
y_pred_smoothed = np.convolve(y_pred, np.ones(10)/10, mode='same')

## Plot the creak

In [None]:
# Plot the smoothed predictions, passing the word timings returned by
# json_to_textgrid as `words`.
fig = creapy.plot(X_test, y_pred_smoothed, sr, words=timestamps)

In [None]:
# Alternative (disabled): several word-level segments with individual values.
#new_creaks = create_creak_tensor(creaky_audio.shape[0], [(0.589, 0.81, -4), (0.81, 1.04, -10), (1.19, 1.49, -14), (1.87, 2.0, -10), (2.0, 2.98, 5)], sr)
# Per-frame tensor: value 40 from 0.01 s to 6.7 s, zero elsewhere.
# NOTE(review): `sr` may have been rebound by creapy.process_file in an earlier
# cell -- confirm it still equals hps.data.sampling_rate.
new_creaks = create_creak_tensor(creaky_audio.shape[0], [(0.01, 6.7, 40)], sr)
# NOTE(review): args[-1] is the cpps_values slot of arg_creator's list; the
# creak values live at args[3]. Confirm which slot is meant to be replaced.
args[-1] = new_creaks

new_creaky_audio = convert(args)

In [None]:
# Run CreaPy on the output audio, smooth the predictions with a 10-point
# moving average and plot them with the word timings.
# NOTE(review): these file names (creaky_vctk_liam.*) differ from the
# creaky_vctk_liam_cpp.* files used in the earlier cells -- confirm they exist.
X_test, y_pred, sr, = creapy.process_file(textgrid_path='./data/out/creaky_vctk_whisper/creaky_vctk_liam.TextGrid', audio_path='./data/out/creaky_vctk_liam.wav')
y_pred_smoothed = np.convolve(y_pred, np.ones(10)/10, mode='same')
fig2 = creapy.plot(X_test, y_pred_smoothed, sr, words=timestamps)

In [None]:
# Run CreaPy on the output audio, smooth the predictions with a 10-point
# moving average and plot them with the word timings.
# NOTE(review): this cell has the same code as the preceding cell.
X_test, y_pred, sr, = creapy.process_file(textgrid_path='./data/out/creaky_vctk_whisper/creaky_vctk_liam.TextGrid', audio_path='./data/out/creaky_vctk_liam.wav')
y_pred_smoothed = np.convolve(y_pred, np.ones(10)/10, mode='same')
fig2 = creapy.plot(X_test, y_pred_smoothed, sr, words=timestamps)
