# Import the relevant libraries

In [2]:
import pandas as pd
import re
from creapy import creapy
import plotly
from pathlib import Path
import sys
import logging
import os
import time
import librosa
import numpy as np
import torch
from scipy.io.wavfile import write
import utils
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
from wavlm import WavLM, WavLMConfig
from datetime import datetime
import IPython.display as ipd 
import json
import soundfile as sf
from praatio import textgrid as tg
from praatio.utilities.constants import Interval
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")



## Load the models and helper functions
This includes setting location of the downloaded checkpoint off WavLM and the FreeVC model to be used

In [3]:
wavlm_large_path = 'wavlm/WavLM-Large.pt'
freevc_chpt_path = 'logs/libri_train_only/G_2344000.pth'


def arg_creator(source, target, outpath, creak_values, cpps_values, h1h2_values, pitch_values, h1a3_values, pitch_var_values):
    return [source, target, outpath, creak_values, cpps_values, h1h2_values, pitch_values, h1a3_values, pitch_var_values]

def get_cmodel():
    checkpoint = torch.load(wavlm_large_path)
    cfg = WavLMConfig(checkpoint['cfg'])
    cmodel = WavLM(cfg)
    cmodel.load_state_dict(checkpoint['model'])
    cmodel.eval()

    return cmodel.to(device)

def combined_vq(current_args, pitch, pitch_var, audio, sr=16000, device='cpu'):
    tensor = utils.get_content(cmodel, torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device))
    tensor_size = tensor.shape[-1]+1

    def get_time_ranges_and_intensities(voice_quality):
        """
        Get time range input as tuples and corresponding intensity values.
        """
        ranges_input = input(f"Enter the time ranges for {voice_quality} (e.g., (0, 0.4), (0.5, 1.9)), or press enter for full clip:").strip()

        if not ranges_input:
            return [((0, -1), 0)]  # Default full clip range with intensity 0

        # Improved regex: allows flexible spaces and missing final comma
        time_ranges = re.findall(r'\(\s*([\d.-]+)\s*,\s*([\d.-]+)\s*\)', ranges_input)
        
        if not time_ranges:
            print("Invalid format. Please enter ranges like (0, 0.4), (0.5, 1.9).")
            return get_time_ranges_and_intensities(voice_quality)  # Ask again

        # Convert to float and validate order
        parsed_ranges = []
        for start, end in time_ranges:
            start, end = float(start), float(end)
            if start < 0 or end < 0:
                print(f"Invalid time range ({start}, {end}): Negative values are not allowed.")
                return get_time_ranges_and_intensities(voice_quality)
            if start >= end:
                print(f"Invalid time range ({start}, {end}): Start must be less than end.")
                return get_time_ranges_and_intensities(voice_quality)
            parsed_ranges.append((start, end))

        # Get intensities for each range
        intensities = []
        for time_range in parsed_ranges:
            while True:
                try:
                    intensity_input = input(f"Select the intensity of {voice_quality} for range {time_range} (0 to 5): ").strip()
                    intensity = float(intensity_input) if intensity_input else 0
                    if 0 <= intensity <= 5:
                        break
                    else:
                        print("Intensity must be between 0 and 5.")
                except ValueError:
                    print("Invalid input. Please enter a number between 0 and 5.")

            intensities.append(intensity)

        return list(zip(parsed_ranges, intensities))


    breathy_segments = get_time_ranges_and_intensities("breathiness")
    creaky_segments = get_time_ranges_and_intensities("creakiness")
    nasal_segments = get_time_ranges_and_intensities("nasality")

    print(f'Breathiness segments: {breathy_segments}')
    print(f'Creakiness segments: {creaky_segments}')
    print(f'Nasality segments: {nasal_segments}')
    print('-------------------------------------------------')

    # Apply transformations for each voice quality using its time ranges and intensities
    for time_range, intensity in nasal_segments:
        if intensity > 0:
            current_args = vq_nasal(current_args, tensor_size, intensity, time_range, sr, device=device)
    
    for time_range, intensity in breathy_segments:
        if intensity > 0:
            current_args = vq_breathy(current_args, tensor_size, intensity, time_range, sr, device=device)

    for time_range, intensity in creaky_segments:
        if intensity > 0:
            current_args = vq_creaky(current_args, tensor_size, intensity, time_range, sr, device=device)

    # Update pitch and pitch_var in the final transformed args
    current_args[6] = pitch
    current_args[8] = pitch_var

    # Print current averages
    print(f"Current average creak: {torch.mean(current_args[3]).item() if isinstance(current_args[3], torch.Tensor) else current_args[3]} \n"
          f"Current average CPPS: {torch.mean(current_args[4]).item() if isinstance(current_args[4], torch.Tensor) else current_args[4]} \n"
          f"Current average H1-H2: {torch.mean(current_args[5]).item() if isinstance(current_args[5], torch.Tensor) else current_args[5]} \n"
          f"Current average pitch: {torch.mean(current_args[6]).item() if isinstance(current_args[6], torch.Tensor) else current_args[6]} \n"
          f"Current average H1-A3: {torch.mean(current_args[7]).item() if isinstance(current_args[7], torch.Tensor) else current_args[7]} \n"
          f"Current average pitch_var: {torch.mean(current_args[8]).item() if isinstance(current_args[8], torch.Tensor) else current_args[8]} \n"
          '-------------------------------------------------')

    # Convert the final transformed args (if necessary)
    convert(current_args)

    return current_args


def to_tensor(x, audio_len, device='cpu'):
    """Converts the input to a tensor on the given device and prints its device."""
    if isinstance(x, (int, float)):
        tensor = torch.full((audio_len,), x, dtype=torch.float32, device=device)
    else:
        tensor = x.to(device)
    return tensor


def vq_breathy(args, audio_len, scaling_factor, time_range, sr, device='cpu'):
    if scaling_factor == 0:
        return args  # Skip processing if intensity is zero
    start_frame, end_frame = (0, audio_len) if time_range == (0, -1) else (
        int(time_range[0] * sr // 320),
        min(int(time_range[1] * sr // 320), audio_len)
    )

    # Ensure tensors are on the correct device and print device info
    creak, cpps, h1h2, h1a3 = map(lambda x: to_tensor(x, audio_len, device), [args[3], args[4], args[5], args[7]])

    breathy_100 = {
        "creak": torch.full((audio_len,), -2.0, dtype=torch.float32, device=device),
        "cpps": torch.full((audio_len,), -1.0, dtype=torch.float32, device=device),
        "h1h2": torch.full((audio_len,), 3.0, dtype=torch.float32, device=device),
        "h1a3": torch.full((audio_len,), 3.0, dtype=torch.float32, device=device)
    }

    # Ensure that the tensors are properly on the same device and adjust them
    for param, target in zip([creak, cpps, h1h2, h1a3], breathy_100.values()):
        param[start_frame:end_frame] = param[start_frame:end_frame].float() + \
            (target[start_frame:end_frame].float() - param[start_frame:end_frame].float()) * scaling_factor

    return [args[0], args[1], args[2], creak, cpps, h1h2, args[6], h1a3, args[8]]


def vq_creaky(args, audio_len, scaling_factor, time_range, sr, device='cpu'):
    if scaling_factor == 0:
        return args

    audio_len = int(audio_len)

    start_frame, end_frame = (0, audio_len) if time_range == (0, -1) else (
        int(time_range[0] * sr // 320),
        min(int(time_range[1] * sr // 320), audio_len)
    )

    # Ensure tensors are on the correct device and print device info
    creak, cpps, h1h2, h1a3 = map(lambda x: to_tensor(x, audio_len, device), [args[3], args[4], args[5], args[7]])

    creaky_100 = {
        "creak": torch.full((audio_len,), 3.0, dtype=torch.float32, device=device),
        "cpps": torch.full((audio_len,), -1.0, dtype=torch.float32, device=device),
        "h1h2": torch.full((audio_len,), -2.0, dtype=torch.float32, device=device),
        "h1a3": torch.full((audio_len,), -2.0, dtype=torch.float32, device=device)
    }

    # Ensure that the tensors are properly on the same device and adjust them
    for param, target in zip([creak, cpps, h1h2, h1a3], creaky_100.values()):
        param[start_frame:end_frame] = param[start_frame:end_frame].float() + \
            (target[start_frame:end_frame].float() - param[start_frame:end_frame].float()) * scaling_factor

    return [args[0], args[1], args[2], creak, cpps, h1h2, args[6], h1a3, args[8]]


def vq_nasal(args, audio_len, scaling_factor, time_range, sr, device='cpu'):
    if scaling_factor == 0:
        return args

    audio_len = int(audio_len)

    start_frame, end_frame = (0, audio_len) if time_range == (0, -1) else (
        int(time_range[0] * sr // 320),
        min(int(time_range[1] * sr // 320), audio_len)
    )

    # Ensure tensors are on the correct device and print device info
    creak, cpps, h1h2, h1a3 = map(lambda x: to_tensor(x, audio_len, device), [args[3], args[4], args[5], args[7]])

    nasal_100 = {
        "creak": torch.full((audio_len,), 0.0, dtype=torch.float32, device=device),
        "cpps": torch.full((audio_len,), 1.0, dtype=torch.float32, device=device),
        "h1h2": torch.full((audio_len,), -3.0, dtype=torch.float32, device=device),
        "h1a3": torch.full((audio_len,), 3.0, dtype=torch.float32, device=device)
    }

    # Ensure that the tensors are properly on the same device and adjust them
    for param, target in zip([creak, cpps, h1h2, h1a3], nasal_100.values()):
        param[start_frame:end_frame] = param[start_frame:end_frame].float() + \
            (target[start_frame:end_frame].float() - param[start_frame:end_frame].float()) * scaling_factor

    return [args[0], args[1], args[2], creak, cpps, h1h2, args[6], h1a3, args[8]]

def reset_args(args):
    return [args[0], args[1], args[2], 0, 0, 0, 0, 0, 0]


def generate_timestamp():
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S_") + str(int(now.microsecond / 1000)).zfill(2)
    return timestamp

def convert(args):
    print("converting...")
    wav_tgt1, _ = librosa.load(args[1], sr=hps.data.sampling_rate)
    wav_tgt1, _ = librosa.effects.trim(wav_tgt1, top_db=20)
    g_tgt1 = smodel.embed_utterance(wav_tgt1)
    g_tgt = torch.from_numpy(g_tgt1).unsqueeze(0).to(device)

    # src
    wav_src, _ = librosa.load(args[0], sr=hps.data.sampling_rate, mono=True)
    wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)

    c = utils.get_content(cmodel, wav_src)
    basic_tensor = torch.tensor(np.zeros((1, 1, c.shape[-1])), dtype=torch.float32, device=device)
    print(basic_tensor.shape)
    try:
        tgt_audio = net_g.infer(
                        c,
                        g=g_tgt,
                        creaks=basic_tensor + args[3],
                        cpps=basic_tensor + args[4],
                        h1h2s=basic_tensor + args[5],
                        pitches=basic_tensor + args[6],
                        h1a3s=basic_tensor + args[7],
                        pitch_vars = basic_tensor + args[8]
                    )
    except Exception as e:
        print('First dimensionality did not work:', e)
    try:
        basic_tensor = torch.tensor(np.zeros((1, 1, c.shape[-1]-1)), dtype=torch.float32, device=device)
        print(basic_tensor.shape)
        tgt_audio = net_g.infer(
                        c,
                        g=g_tgt,
                        creaks=basic_tensor + args[3],
                        cpps=basic_tensor + args[4],
                        h1h2s=basic_tensor + args[5],
                        pitches=basic_tensor + args[6],
                        h1a3s=basic_tensor + args[7],
                        pitch_vars = basic_tensor + args[8]
                    )
    except Exception as e:
        #print('Second dimensionality did not work:', e)
        pass
    tgt_audio = tgt_audio[0][0].data.cpu().float().numpy()

    timestamp = generate_timestamp()
    print(timestamp)
    ipd.display(ipd.Audio(tgt_audio, rate=hps.data.sampling_rate))
    write(args[2], hps.data.sampling_rate, tgt_audio)
    return tgt_audio

def create_vq_tensor(wav_len, vq_segments, sr):
    #create tensor torch.tensor(np.zeros((1, 1, (wav_src.size(1)//320-1))), dtype=torch.float32)) and then add the creak value to the tensor between the start and end frames
    #creak_start and creak_end are in seconds
    #start and end are in seconds

    #convert start and end to frames
    end_frame = (wav_len // 320)
    vq_frames = [(int(item[0] * sr // 320), int(item[1] * sr // 320), item[2]) for item in vq_segments]
    vqs = torch.tensor(np.zeros((1, 1, (end_frame))), dtype=torch.float32)
    for creak in vq_frames:
        vqs[0, 0, creak[0]:creak[1]] = creak[2]
    return vqs

def json_to_textgrid(json_path, textgrid_path):
    with open(json_path) as f:
        data = json.load(f)
    start_speech = data['segments'][0]['start']
    end_speech = data['segments'][-1]['end']
    timestamps = [data['segments'][i]['words'] for i in range(len(data['segments']))] 
    timestamps = [item for sublist in timestamps for item in sublist]
    textgrid = tg.Textgrid()
    tier = tg.IntervalTier('word', [], start_speech, end_speech)

    new_timestamps = []
    for i, timestamp in enumerate(timestamps):
        prev_timestamp = timestamps[i-1] if i > 0 else None
        if prev_timestamp:
            if prev_timestamp['end'] == timestamp['start']:
                start = prev_timestamp['end']
            else:
                start = prev_timestamp['end'] + 0.01
            end = timestamp['end']
            new_timestamps.append({'start': start, 'end': end, 'word': timestamp['word']})
            tier.insertEntry(Interval(start, end, timestamp['word']))
        else:
            tier.insertEntry(Interval(timestamp['start'], timestamp['end'], timestamp['word']))
            new_timestamps.append({'start': timestamp['start'], 'end': timestamp['end'], 'word': timestamp['word']})
    textgrid.addTier(tier)
    textgrid.save(textgrid_path, format='long_textgrid', includeBlankSpaces=False)
    return new_timestamps

hps = utils.get_hparams_from_file('configs/freevc.json')

net_g = SynthesizerTrn(
   hps.data.filter_length // 2 + 1,
   hps.train.segment_size // hps.data.hop_length,
   **hps.model
)

utils.load_checkpoint(freevc_chpt_path, net_g, optimizer=None, strict=True)
cmodel = get_cmodel()
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt', device=device)



INFO:root:Loaded checkpoint 'logs/libri_train_only/G_2344000.pth' (iteration 1444)
INFO:wavlm.WavLM:WavLM Config: {'extractor_mode': 'layer_norm', 'encoder_layers': 24, 'encoder_embed_dim': 1024, 'encoder_ffn_embed_dim': 4096, 'encoder_attention_heads': 16, 'activation_fn': 'gelu', 'layer_norm_first': True, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'feature_grad_mult': 1.0, 'normalize': True, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.0, 'dropout_input': 0.0, 'dropout_features': 0.0, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': 'static', 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': 'static', 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'relative_position_embedding': True, 'num_buckets': 320, 'm

## Set the arguments
Here we set the following arguments for the creaky voice conversion:

1. **source_path** indicates the audio file of which we want the linguistic content.
2. **target_path** indicates the audio file containing speech of the target speaker.
3. **outpath** is the location where the converted audio will be saved.
4. **average_feature** is the initial feature value over the complete utterance that will be supplied to the model 

We also create the output folder specified in the arguments


## Instructions
1. Select the source path that you want to convert by uncommenting it. 
2. Select your desired feature values. I recommend starting with all zeros, except for pitch where I recommend starting at -1.
3. Listen to the original audio in the cell below the manipulation cell.
4. Perform the conversion.
5. Decide if you like the prosody.
6. If not, change the feature values in the coarse-grained editing cell.



## Manipulation cell

In [4]:
#source_path = "../game_outputs/game_before_1_5.wav"
#source_path = "../game_outputs/lava_boy_2_4.wav"
#source_path = "../game_outputs/arrow_keys_3_3.wav"
#source_path = "../game_outputs/opposite_4_4.wav"
#source_path = "../game_outputs/into_fire_5_4.wav"
#source_path = "../game_outputs/walk_down_6_5.wav"
#source_path = "../game_outputs/both_die_7_5.wav"
#source_path = "../game_outputs/jump_8_alt_3.wav" 
#source_path = "../game_outputs/there_you_go_9_3.wav"
#source_path = "../game_outputs/introductory_10_alt.wav"
#source_path = "../game_outputs/button_ledge_11_alt.wav"
source_path = "../game_outputs/stay_let_go_12_alt_2.wav"
#source_path = "../game_outputs/touch_get_up_13_3.wav"
#source_path = "../game_outputs/come_on_down_14_2.wav"
#source_path = "../game_outputs/hang_15_3.wav"
source_path = "../game_outputs/nah_mario_16.wav"
#source_path = "../game_outputs/mario_up_17_5.wav"
#source_path = "../game_outputs/right_next_level_18.wav"


target_path = "game_20_target-enhanced-v2.wav"
outpath = f"./data/experiment/{source_path.split('/')[-1]}"

average_creak = 0
average_cpps = 0
average_h1h2 = 0
average_pitch = -1.5
average_h1a3 = 0
average_pitch_var = 0

sr=16000
args = arg_creator(source_path, target_path, outpath, average_creak, average_cpps, average_h1h2, average_pitch, average_h1a3, average_pitch_var)
os.makedirs('/'.join(args[2].split('/')[:-1]), exist_ok=True)



src_audio = sf.read(source_path)[0]
#display the audio
ipd.display(ipd.Audio(src_audio, rate=24000))


# Perform the conversion

In [None]:
filepaths = [
    "../game_outputs/game_before_1_5.wav",
    "../game_outputs/lava_boy_2_4.wav",
    "../game_outputs/arrow_keys_3_3.wav",
    "../game_outputs/opposite_4_4.wav",
    "../game_outputs/into_fire_5_4.wav",
    "../game_outputs/walk_down_6_5.wav",
    "../game_outputs/both_die_7_5.wav",
    "../game_outputs/jump_8_alt_3.wav",
    "../game_outputs/there_you_go_9_3.wav",
    "../game_outputs/introductory_10_alt.wav",
    "../game_outputs/button_ledge_11_alt.wav",
    "../game_outputs/stay_let_go_12_alt_2.wav",
    "../game_outputs/touch_get_up_13_3.wav",
    "../game_outputs/come_on_down_14_2.wav",
    "../game_outputs/hang_15_3.wav",
    "../game_outputs/nah_mario_16.wav",
    "../game_outputs/mario_up_17_5.wav",
    "../game_outputs/right_next_level_18.wav"
]

for item in filepaths:
    outpath = item.replace('.wav', '_nasal.wav')
    outpath = outpath.replace('/game_outputs/', '/modifications/')
    print(outpath)
    average_creak = 0
    average_cpps = 1
    average_h1h2 = -3
    average_h1a3 = 3
    average_pitch = -1.5
    average_pitch_var= 0
    target_path = "game_20_target-enhanced-v2.wav"
    args = arg_creator(item, target_path, outpath, average_creak, average_cpps, average_h1h2, average_pitch, average_h1a3, average_pitch_var)
    convert(args)

../modifications/game_before_1_5_nasal.wav
converting...
torch.Size([1, 1, 134])
torch.Size([1, 1, 133])
20250416_163701_601


../modifications/lava_boy_2_4_nasal.wav
converting...
torch.Size([1, 1, 175])
torch.Size([1, 1, 174])
20250416_163702_415


../modifications/arrow_keys_3_3_nasal.wav
converting...
torch.Size([1, 1, 115])
torch.Size([1, 1, 114])
20250416_163703_62


../modifications/opposite_4_4_nasal.wav
converting...


In [76]:
converted_audio = convert(args)


converting...
torch.Size([1, 1, 120])
torch.Size([1, 1, 119])
20250325_200112_323


## Coarse-grained editing for finding pitch and pitch variation


In [77]:
average_creak = 0
average_cpps = 0
average_h1h2 = 0
average_h1a3 = 0
average_pitch = -1.5
average_pitch_var = 3

orig_args = arg_creator(source_path, target_path, outpath, average_creak, average_cpps, average_h1h2, average_pitch, average_h1a3, average_pitch_var)
converted_audio = convert(orig_args)

converting...
torch.Size([1, 1, 120])
torch.Size([1, 1, 119])
20250325_200112_953


## Run WhisperX
* We run speech recognition to transcribe the generated utterance
* We convert the timestamps from json to TextGrid in order to use prepare them for CreaPy 

In [78]:
#run whisper
import json
import os
from praatio import textgrid as tg
from praatio.utilities.constants import Interval

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

!whisperx "../game_outputs/{args[0]}" --model distil-medium.en --output_dir data/experiment --language en 

#read json file with the start and end times of the words
json_path = "data/experiment/" + args[0].split('/')[-1].replace(".wav", ".json")
tg_path = "data/experiment/" + source_path.split('/')[-1][:-4] + '.TextGrid'
timestamps = json_to_textgrid(json_path, tg_path)


INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../miniconda3/envs/VQVC/lib/python3.10/site-packages/whisperx/assets/pytorch_model.bin`
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.0+cu121. Bad things might happen unless you revert torch to 1.x.
>>Performing transcription...
It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues

## Run CreapPy
This enables us to quantify and visualize the creak probability

In [79]:
X_test, y_pred, sr = creapy.process_file(textgrid_path=tg_path, audio_path=f"./data/experiment/{source_path.split('/')[-1]}")
X_test['h1h2'] = np.nan_to_num(X_test['h1h2'])
X_test['h1h2'] = np.convolve(X_test['h1h2'], np.ones(20)/20, mode='same')
y_pred_smoothed = np.convolve(y_pred, np.ones(20)/20, mode='same')


Mean of empty slice.


invalid value encountered in double_scalars


Series.ravel is deprecated. The underlying array is already 1D, so ravel is not necessary.  Use `to_numpy()` for conversion to a numpy array instead.



Wrote textgrid at /home/hfkml/VQVC/data/experiment/stay_let_go_12_alt_2.TextGrid


## Plot the audio
Get the durations from here for the fine-grained editing

In [73]:
fig = creapy.plot(X_test, y_pred_smoothed, sr, words=timestamps)

0.05099999999999999


## Fine-grained editing

Edit the features according to the following syntax:


You only need to change the times and feature values!

In [74]:
#combined_vq needs: args, pitch, pitch_var, audio

combined = combined_vq(args, -1.5, 0, converted_audio)


Breathiness segments: [((0.0, 0.3), 1.0)]
Creakiness segments: [((0, -1), 0)]
Nasality segments: [((0, -1), 0)]
-------------------------------------------------
Current average creak: -0.20000000298023224 
Current average CPPS: -0.10000000149011612 
Current average H1-H2: 0.30000001192092896 
Current average pitch: -1.5 
Current average H1-A3: 0.30000001192092896 
Current average pitch_var: 0 
-------------------------------------------------
converting...
torch.Size([1, 1, 150])
torch.Size([1, 1, 149])
20250325_200056_871


In [57]:
combined = combined_vq(args, -1.5, 0, converted_audio)


Breathiness segments: [((1.0, 1.5), 0.8), ((1.5, 2.0), 1.4)]
Creakiness segments: [((0.0, 0.4), 0.9)]
Nasality segments: [((1.6, 2.0), 0.7)]
-------------------------------------------------
Current average creak: -0.41791045665740967 
Current average CPPS: -0.5865671634674072 
Current average H1-H2: 1.0880597829818726 
Current average pitch: -1.5 
Current average H1-A3: 0.8373135328292847 
Current average pitch_var: 0 
-------------------------------------------------
converting...
torch.Size([1, 1, 134])
torch.Size([1, 1, 133])
Second dimensionality did not work: The size of tensor a (133) must match the size of tensor b (134) at non-singleton dimension 2
20250308_001047_201


In [56]:
average_creak = 0
average_cpps = 0
average_h1h2 = 0
average_h1a3 = 0
average_pitch = -1.5
average_pitch_var = 3

orig_args = arg_creator(source_path, target_path, outpath, average_creak, average_cpps, average_h1h2, average_pitch, average_h1a3, average_pitch_var)
converted_audio = convert(orig_args)

converting...
torch.Size([1, 1, 134])
torch.Size([1, 1, 133])
Second dimensionality did not work: The size of tensor a (134) must match the size of tensor b (133) at non-singleton dimension 2
20250308_000811_592



## Final values

1. 
average_creak = 0
average_cpps = 0
average_h1h2 = 0
average_h1a3 = 0
average_pitch = -1.5
average_pitch_var = 3

orig_args = arg_creator(source_path, target_path, outpath, average_creak, average_cpps, average_h1h2, average_pitch, average_h1a3, average_pitch_var)
converted_audio = convert(orig_args)

2. 
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 

3.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


4.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


5.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


6.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


7.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


8.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


9.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


10.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


11.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


12.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


13.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


14.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


15.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


16.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 


17.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var =  


18.
average_creak = 

average_cpps =  

average_h1h2 =  

average_h1a3 = 

average_pitch = 

average_pitch_var = 




In [None]:
breathy = [source_path, target_path, outpath, -1, -1, 2, -1.5, 2, 1]
creaky = [source_path, target_path, outpath, 2, -.5, -1, -2, -1, -2]
high_pitch_var = [source_path, target_path, outpath, -3, 0, -.5, -1.5, -1, 4]
tense = [source_path, target_path, outpath, -1, 1, 2, -1.5, -2, 0]
nasal = [source_path, target_path, outpath, -1, 1, -3, -1.5, 3, 0]

print('--------------Breathy----------------')
converted_breathy_audio = convert(breathy)
print('--------------Creaky----------------')
converted_creaky_audio = convert(creaky)
print('--------- Pitch-variation----------------')
converted_high_pitch_var_audio = convert(high_pitch_var)
print('--------------Tense----------------')
converted_tense_audio = convert(tense)
print('--------------Nasal----------------')
converted_nasal_audio = convert(nasal)
