# Import the relevant libraries

In [1]:
import pandas as pd
from creapy import creapy
import plotly
from pathlib import Path
import sys
import logging
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
import time
import librosa
import numpy as np
import torch
from scipy.io.wavfile import write
from tqdm import tqdm
import utils
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
from wavlm import WavLM, WavLMConfig
from datetime import datetime
import IPython.display as ipd 
import json
from praatio import textgrid as tg
from praatio.utilities.constants import Interval
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## Load the models and helper functions
This includes setting the location of the downloaded WavLM checkpoint and of the FreeVC model to be used.

In [2]:
# Path to the pretrained WavLM-Large checkpoint read by get_cmodel().
wavlm_large_path = 'wavlm/WavLM-Large.pt'
# Earlier FreeVC generator checkpoints, kept for reference:
#freevc_chpt_path = '/nfs/deepspeech/home/lameris/libri_cpps/G_750000.pth'
#freevc_chpt_path = 'logs/libri_train_only/G_2322000.pth'
# FreeVC generator checkpoint loaded into net_g further down in this cell.
freevc_chpt_path = 'logs/libri_train_only/G_2344000.pth'
# Optional CreaPy configuration override (currently disabled):
#creapy.set_config(creak_threshold=0.5)

def arg_creator(source, target, outpath, creak_values, cpps_values, h1h2_values, pitch_values, h1a3_values, pitch_var_values):
    """Pack the conversion settings into the positional list that convert() reads.

    Returns a new list laid out as
    [source, target, outpath, creak, cpps, h1h2, pitch, h1a3, pitch_var].
    """
    packed = [source, target, outpath]
    packed.extend((creak_values, cpps_values, h1h2_values))
    packed.extend((pitch_values, h1a3_values, pitch_var_values))
    return packed

def get_cmodel():
    """Load the WavLM content encoder from `wavlm_large_path`.

    The checkpoint is deserialised onto the CPU first so that a file saved
    from a GPU session can still be opened on a CPU-only machine; the model
    is then moved to the notebook-wide `device`.

    Returns:
        The WavLM model in eval mode, placed on `device`.
    """
    checkpoint = torch.load(wavlm_large_path, map_location='cpu')
    cfg = WavLMConfig(checkpoint['cfg'])
    cmodel = WavLM(cfg)
    cmodel.load_state_dict(checkpoint['model'])
    cmodel.eval()

    return cmodel.to(device)

def generate_timestamp():
    """Return the current local time as a filename-safe string.

    The format is ``YYYYMMDD_HHMMSS_mmm``. The millisecond field is always
    zero-padded to three digits, so 36 ms ("036") cannot be confused with
    360 ms and the strings sort chronologically.
    """
    now = datetime.now()
    milliseconds = now.microsecond // 1000
    return now.strftime("%Y%m%d_%H%M%S_") + f"{milliseconds:03d}"

def _infer_with_controls(content, speaker_embedding, controls, n_frames):
    """Run net_g.infer with each control added onto an n_frames-long zero track."""
    base_track = torch.zeros((1, 1, n_frames), dtype=torch.float32, device=device)
    creaks, cpps, h1h2s, pitches, h1a3s, pitch_vars = (base_track + value for value in controls)
    return net_g.infer(
        content,
        g=speaker_embedding,
        creaks=creaks,
        cpps=cpps,
        h1h2s=h1h2s,
        pitches=pitches,
        h1a3s=h1a3s,
        pitch_vars=pitch_vars,
    )


def convert(args):
    """Convert the source utterance to the target speaker and save the result.

    Args:
        args: list built by arg_creator():
            [source, target, outpath, creak, cpps, h1h2, pitch, h1a3, pitch_var].
            Each of the six controls is added to a zero track, so it may be a
            plain number or anything that broadcasts against a (1, 1, frames)
            tensor on `device`.

    Returns:
        The converted waveform as a 1-D float numpy array. It is also played
        inline and written to `outpath` at hps.data.sampling_rate.

    Raises:
        RuntimeError: if inference fails for both candidate frame counts.
    """
    source_path, target_path, out_path = args[0], args[1], args[2]
    controls = args[3:9]

    print("converting...")
    # Target speaker: trim silence, then embed the reference utterance.
    wav_tgt, _ = librosa.load(target_path, sr=hps.data.sampling_rate)
    wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
    g_tgt = torch.from_numpy(smodel.embed_utterance(wav_tgt)).unsqueeze(0).to(device)

    # Source utterance: provides the linguistic content.
    wav_src, _ = librosa.load(source_path, sr=hps.data.sampling_rate, mono=True)
    wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)

    # One control value per 320 input samples
    # (presumably the content-encoder hop size; TODO confirm).
    n_frames = wav_src.size(1) // 320

    # Inference only: do not build an autograd graph.
    with torch.no_grad():
        c = utils.get_content(cmodel, wav_src)
        try:
            # First try one frame fewer than the sample count suggests.
            tgt_audio = _infer_with_controls(c, g_tgt, controls, n_frames - 1)
        except RuntimeError:
            # Length mismatch: retry with the full frame count. A second
            # failure propagates to the caller.
            tgt_audio = _infer_with_controls(c, g_tgt, controls, n_frames)
    tgt_audio = tgt_audio[0][0].data.cpu().float().numpy()

    timestamp = generate_timestamp()
    print(timestamp)
    ipd.display(ipd.Audio(tgt_audio, rate=hps.data.sampling_rate))
    write(out_path, hps.data.sampling_rate, tgt_audio)
    # A speaker-similarity check (cosine similarity between the target embedding
    # and smodel.embed_utterance(tgt_audio)) used to be sketched here; its result
    # was never used, so the extra embedding pass has been dropped.

    return tgt_audio

def create_creak_tensor(wav_len, creak_segments, sr):
    """Build a per-frame control track that is zero outside the given segments.

    Args:
        wav_len: length of the waveform in samples; the track has
            wav_len // 320 frames.
        creak_segments: iterable of (start_seconds, end_seconds, value).
            Later segments overwrite earlier ones where they overlap.
        sr: sample rate used to convert seconds to frames.

    Returns:
        A float32 tensor of shape (1, 1, wav_len // 320).
    """
    n_frames = wav_len // 320
    track = torch.zeros((1, 1, n_frames), dtype=torch.float32)
    for segment in creak_segments:
        # Seconds -> samples -> frames, with a frame every 320 samples.
        first_frame = int(segment[0] * sr // 320)
        stop_frame = int(segment[1] * sr // 320)
        track[0, 0, first_frame:stop_frame] = segment[2]
    return track

def json_to_textgrid(json_path, textgrid_path):
    """Convert a word-level transcript JSON into a Praat TextGrid.

    The JSON must contain a 'segments' list whose items have 'start', 'end'
    and a 'words' list; every word needs 'start', 'end' and 'word'.

    The first word keeps its own start time. Every later word starts at the
    previous word's end when the two already touch, and at that end plus
    0.01 s otherwise.

    Args:
        json_path: transcript file to read.
        textgrid_path: destination of the TextGrid (long format, one 'word' tier).

    Returns:
        The adjusted words as a list of {'start', 'end', 'word'} dicts.
    """
    with open(json_path) as handle:
        transcript = json.load(handle)
    segments = transcript['segments']
    speech_start = segments[0]['start']
    speech_end = segments[-1]['end']
    words = [word for segment in segments for word in segment['words']]

    grid = tg.Textgrid()
    word_tier = tg.IntervalTier('word', [], speech_start, speech_end)

    adjusted_words = []
    previous = None
    for word in words:
        if previous:
            touching = previous['end'] == word['start']
            start = previous['end'] if touching else previous['end'] + 0.01
        else:
            start = word['start']
        end = word['end']
        label = word['word']
        word_tier.insertEntry(Interval(start, end, label))
        adjusted_words.append({'start': start, 'end': end, 'word': label})
        previous = word

    grid.addTier(word_tier)
    grid.save(textgrid_path, format='long_textgrid', includeBlankSpaces=False)
    return adjusted_words

# Hyper-parameters of the FreeVC model (sampling rate, hop length, model sizes, ...).
hps = utils.get_hparams_from_file('configs/freevc.json')

# FreeVC generator. It is placed on the same device as the content encoder and
# the speaker encoder, because convert() feeds it tensors that live on `device`.
net_g = SynthesizerTrn(
   hps.data.filter_length // 2 + 1,
   hps.train.segment_size // hps.data.hop_length,
   **hps.model
).to(device)

utils.load_checkpoint(freevc_chpt_path, net_g, optimizer=None, strict=True)
# Inference only: switch off training-time behaviour such as dropout.
net_g.eval()

# WavLM content encoder and speaker encoder used by convert().
cmodel = get_cmodel()
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt', device=device)



INFO:root:Loaded checkpoint 'logs/libri_train_only/G_2344000.pth' (iteration 1444)
INFO:wavlm.WavLM:WavLM Config: {'extractor_mode': 'layer_norm', 'encoder_layers': 24, 'encoder_embed_dim': 1024, 'encoder_ffn_embed_dim': 4096, 'encoder_attention_heads': 16, 'activation_fn': 'gelu', 'layer_norm_first': True, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'feature_grad_mult': 1.0, 'normalize': True, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.0, 'dropout_input': 0.0, 'dropout_features': 0.0, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': 'static', 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': 'static', 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'relative_position_embedding': True, 'num_buckets': 320, 'm

## Set the arguments
Here we set the following arguments for the creaky voice conversion:

1. **source_path** indicates the audio file of which we want the linguistic content.
2. **target_path** indicates the audio file containing speech of the target speaker.
3. **outpath** is the location where the converted audio will be saved.
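4. **average_creak** is the initial creak value over the complete utterance that will be supplied to the model. **average_cpps**, **average_h1h2**, **average_pitch**, **average_h1a3** and **average_pitch_var** set the corresponding utterance-level values for the other controls.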

We also create the output folder specified in the arguments


In [21]:
# Source utterance (linguistic content). Only one assignment is kept so the
# active file is unambiguous. Other clips used in earlier runs:
#   "direct_part_2/left_direct.wav"
#   "neutral_sentences/neutral_budget.wav"
#   "source_audio_game-enhanced-v2.wav"
#   "../game_outputs/game_before_1_5.wav"
#   "../game_outputs/lava_boy_2_4.wav"
#   "../game_outputs/arrow_keys_3_3.wav"
#   "../game_outputs/opposite_4_4.wav"
#   "../game_outputs/walk_down_6_5.wav"
source_path = "../game_outputs/into_fire_5_4.wav"

# Target speaker reference. Other references used in earlier runs:
#   "denoised/16000_female_target_end_enhanced.wav"
#   "denoised/male_low_creak_enhanced.wav"
#   "denoised/16000_joe_example_compare_enhanced.wav"
#   "denoised/16000_male_voice_enhanced.wav"
target_path = "game_20_target-enhanced-v2.wav"

# Where the converted audio is written (earlier runs: "audio_obj_creak/left_direct.wav").
outpath = './data/out/creaky_vctk_liam_cpp.wav'

# Utterance-level value of each control.
average_creak = 0
average_cpps = -.5
average_h1h2 = 1
average_pitch = -1.5
average_h1a3 = 1
average_pitch_var = 1
# Alternative setting tried earlier:
#   creak 0, cpps .5, h1h2 0, pitch -1.5, h1a3 0, pitch_var -1.5

sr = 16000
args = arg_creator(source_path, target_path, outpath, average_creak, average_cpps, average_h1h2, average_pitch, average_h1a3, average_pitch_var)

# Create the output folder; this also works when outpath has no directory part.
Path(outpath).parent.mkdir(parents=True, exist_ok=True)

#tense = [0, 10, -10, -1]

In [22]:
# Objective-evaluation sweep (disabled): renders one output file per CPPS offset.
# NOTE(review): the commented call to arg_creator below passes eight arguments,
# but arg_creator takes nine (the pitch-variation value is missing). Add it
# before re-enabling this loop.
# increments_cpps = [-2, -1, 0, 1, 2]

# for cpps in increments_cpps:
#     outpath = "audio_obj_creak/left_direct.wav"
#     new_filename = args[2].replace('.wav', f'_cpps_{cpps}.wav')
#     args = arg_creator(source_path, target_path, new_filename, average_creak, cpps, average_h1h2, average_pitch, average_h1a3)
#     convert(args)
# Placeholder output while the sweep above is disabled.
print('temp')
    



temp


In [23]:
# Listen to the target-speaker reference first, then to the source utterance.
for clip_path in (args[1], args[0]):
    ipd.display(ipd.Audio(clip_path, rate=16000))


# Perform the conversion

In [24]:
# Convert with the utterance-level values configured above; convert() also
# plays the result inline and writes it to the configured output path.
creaky_audio = convert(args)
# Reminder from an earlier session: select cuda:3 for the GPU.


converting...
20250225_172410_120


In [15]:
# Play a reference clip from disk for comparison with the converted audio.
ipd.display(ipd.Audio('source_audio_game-enhanced-v2.wav', rate=16000))

## Run WhisperX
* We run speech recognition to transcribe the generated utterance
* We convert the timestamps from JSON to TextGrid in order to prepare them for CreaPy

In [8]:
# Replace four of the utterance-level controls with per-frame tracks that are
# zero everywhere except between 5 s and roughly 6.5 s, then convert again.
n_samples = creaky_audio.shape[0]

new_creak = create_creak_tensor(n_samples, [(5, 6.5, 2)], sr)
new_cpps = create_creak_tensor(n_samples, [(5, 6.3, 0)], sr)
new_h1a3 = create_creak_tensor(n_samples, [(5, 6.3, 0)], sr)
new_pitch_var = create_creak_tensor(n_samples, [(5, 6.5, -2)], sr)
# An h1h2 track, [(5, 6.3, -.5)], was tried here as well and is currently disabled.

# Counting from the end of args: -6 creak, -5 cpps, -2 h1a3, -1 pitch_var.
args[-6], args[-5], args[-2], args[-1] = (
    track.to(device) for track in (new_creak, new_cpps, new_h1a3, new_pitch_var)
)

creaky_audio = convert(args)

converting...
20250225_171516_36


In [25]:
#run whisper
import json
import os
from praatio import textgrid as tg
from praatio.utilities.constants import Interval

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

!whisperx "../game_outputs/into_fire_5_4.wav" --model distil-medium.en --output_dir data/out/creaky_vctk_whisper --language en 

#read json file with the start and end times of the words
timestamps = json_to_textgrid('data/out/creaky_vctk_whisper/into_fire_5_4.json', 'data/out/creaky_vctk_whisper/creaky_vctk_liam_cpp.TextGrid')


INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../miniconda3/envs/VQVC/lib/python3.10/site-packages/whisperx/assets/pytorch_model.bin`
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.0+cu121. Bad things might happen unless you revert torch to 1.x.
>>Performing transcription...
It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues

## Run CreaPy
This enables us to quantify and visualize the creak probability

In [26]:
# Run CreaPy on the converted audio, using the word-level TextGrid written above.
X_test, y_pred, sr = creapy.process_file(
    textgrid_path='./data/out/creaky_vctk_whisper/creaky_vctk_liam_cpp.TextGrid',
    audio_path='./data/out/creaky_vctk_liam_cpp.wav',
)

# Replace NaNs in the h1h2 feature with 0, then smooth it with a 20-point moving average.
h1h2_without_nan = np.nan_to_num(X_test['h1h2'])
moving_average = np.ones(20) / 20
X_test['h1h2'] = np.convolve(h1h2_without_nan, moving_average, mode='same')

Wrote textgrid at /home/hfkml/VQVC/data/out/creaky_vctk_whisper/creaky_vctk_liam_cpp.TextGrid


## Smoothen the output

In [27]:
# Smooth the creak predictions with a 20-point moving average.
smoothing_window = np.ones(20) / 20
y_pred_smoothed = np.convolve(y_pred, smoothing_window, mode='same')
print(y_pred.shape)

# Play the converted audio next to the numbers.
ipd.display(ipd.Audio(creaky_audio, rate=sr))


(459,)


## Plot the creak

In [None]:
# Plot the CreaPy features and the smoothed predictions, annotated with the word timestamps.
fig = creapy.plot(
    X_test,
    y_pred_smoothed,
    sr,
    words=timestamps,
)

108
398
1.097


In [None]:
# Preset: summary neutral
# args layout from arg_creator():
#   [source, target, outpath, creak, cpps, h1h2, pitch, h1a3, pitch_var]
# Each track is written to its own slot by absolute index. The earlier
# args[-5] .. args[-1] indexing predates the pitch_var entry and put every
# control one slot too far to the right.
n_samples = creaky_audio.shape[0]

# Earlier creak attempt: [(0.6, 1, -10), (2.2, 2.6, -5), (3, 3.9, -20)]
new_creak = create_creak_tensor(n_samples, [(0, 2.5, 15), (2.5, 4.4, 5)], sr)
args[3] = new_creak.to(device)

# Note kept from tuning ("room"): (0.8, 2, -10), (3.8, 4.4, -20)
new_cpps = create_creak_tensor(n_samples, [(0, 2.5, -4), (2.5, 4.4, -2)], sr)
args[4] = new_cpps.to(device)

new_h1h2 = create_creak_tensor(n_samples, [(0, 4.4, -2)], sr)
args[5] = new_h1h2.to(device)

new_pitch = create_creak_tensor(n_samples, [(0, 4.4, -1)], sr)
args[6] = new_pitch.to(device)

new_h1a3 = create_creak_tensor(n_samples, [(0, 4.4, -1)], sr)
args[7] = new_h1a3.to(device)

creaky_audio = convert(args)

In [None]:
# Segment-level presets collected while tuning individual utterances.
#
# Each preset maps a control name to a list of (start_s, end_s, value) segments
# (turned into a per-frame track by create_creak_tensor) or to a plain number
# (used as a constant for the whole utterance).
#
# Position of every control inside `args`, as laid out by arg_creator():
#   [source, target, outpath, creak, cpps, h1h2, pitch, h1a3, pitch_var]
# The earlier version of this cell wrote to args[-5] .. args[-1], which predates
# the pitch_var entry and put every control one slot too far to the right.
CONTROL_SLOTS = {'creak': 3, 'cpps': 4, 'h1h2': 5, 'pitch': 6, 'h1a3': 7}

VOICE_PRESETS = {
    # Earlier creak attempt: [(0.6, 1, -10), (2.2, 2.6, -5), (3, 3.9, -20)]
    # Note kept from tuning ("room"): (0.8, 2, -10), (3.8, 4.4, -20)
    'summary neutral': {
        'creak': [(0, 3.9, 3)],
        'cpps': [(0, 3.9, -1)],
        'h1h2': [(0, 3.9, -2)],
        'pitch': [(0, 3.9, -1)],
        'h1a3': [(0, 3.9, -2)],
    },
    'name breathy': {
        'creak': [(1, 2.4, -5), (2, 2.4, -15), (3, 4.7, -3), (4.1, 4.8, -20)],
        'cpps': [(0, 4.7, -1)],
        'h1h2': [(0, 4.7, 3)],
        'pitch': [(0, 4.7, -1)],
        'h1a3': [(0, 4.7, 2)],
    },
    'budget creaky': {
        'creak': [(0, 3.4, 3), (0, .9, 5)],
        'cpps': [(0, 3.4, -1)],
        'h1h2': [(0, 3.4, -1)],
        'pitch': [(0, 2.0, -1), (1.5, 3.4, -1)],
        'h1a3': [(0, 3.4, -1)],
    },
    'budget breathy': {
        'creak': [(0.8, 1.2, -10), (1.8, 3.4, -5)],
        'cpps': [(0, 3.4, -1)],
        'h1h2': [(0, 3.4, 4)],
        'pitch': [(0, 2.0, -1), (1.5, 3.4, -1)],
        'h1a3': [(0, 3.4, 2)],
    },
    'budget modal': {
        'creak': [(0.8, 1.2, -10), (1.8, 3.4, -5)],
        'cpps': [(0.8, 1.2, 0.5), (1.3, 3.4, .5)],
        'h1h2': [(0.8, 1.2, 0.5)],
        'pitch': [(0, 2.0, -1), (1.5, 3.4, -1)],
        'h1a3': [(0.8, 1.2, 0.5)],
    },
    'neutral second one I think': {
        'creak': [(0.0, 1.2, -10), (1.4, 4.2, 0), (3.9, 4.2, 0)],
        'cpps': [(0.0, 1.2, 0.5), (1.5, 4.2, -1)],
        'h1h2': [(0.0, 1.2, 0), (1.5, 4.2, 2)],
        'pitch': [(0, 2.0, -1), (1.5, 4.1, -2)],
        'h1a3': [(0.0, 1.2, 0), (1.5, 4.2, 2.1)],
    },
    'value negative creaky': {
        'creak': [(1.0, 1.3, -10), (1.9, 4.1, -5), (3.1, 4.1, -10)],
        'cpps': [(1.0, 1.3, 1), (1.9, 4.1, .5)],
        'h1h2': [(1.0, 1.3, .5), (1.9, 4.1, .25)],
        'pitch': [(0, 5.1, -1), (1.9, 4.1, -1)],
        'h1a3': [(1.0, 1.3, .5), (1.9, 4.1, .25)],
    },
    'other': {
        'creak': [(.5, 1, -10), (1.8, 2.2, -10), (3.1, 5.9, 1)],
        'cpps': [(.5, 1, .5), (1.8, 2.2, .5), (3.1, 5.9, -.5)],
        'h1h2': [(.5, 1, .5), (1.8, 2.2, .5), (3.1, 5.9, -1)],
        'pitch': [(0, 5.1, -1), (3.1, 5.9, -1)],
        'h1a3': [(.5, 1, .5), (1.8, 2.2, .5), (3.1, 5.9, -1)],
    },
    'disappointed negative creaky': {
        'creak': [(1.0, 1.3, -10), (1.9, 4.1, 1), (3.1, 4.1, 0)],
        'cpps': [(1.0, 1.3, 1), (1.9, 4.1, -1)],
        'h1h2': [(1.0, 1.3, .5), (1.9, 4.1, -1)],
        'pitch': [(0, 5.1, -1), (1.9, 4.1, -1)],
        'h1a3': [(1.0, 1.3, .5), (1.9, 4.1, -1)],
    },
    'creaky second try myself': {
        'creak': [(1.0, 1.3, -10), (1.9, 5.1, 5), (4.6, 5.1, 2)],
        'cpps': [(1.0, 1.3, 1), (1.9, 5.1, -2), (4.6, 5.1, -1)],
        'h1h2': [(1.0, 1.3, .5), (1.9, 5.1, -2), (4.6, 5.1, .5)],
        'pitch': [(0, 5.1, -1), (1.9, 5.1, -1.5), (2.5, 3, -2)],
        'h1a3': [(1.0, 1.3, .5), (1.9, 5.1, -1.5), (4.6, 5.1, .5)],
    },
    'myself positive creaky': {
        'creak': [(1.0, 1.3, -10), (1.9, 5.1, -1)],
        'cpps': [(1.0, 1.3, 1), (1.9, 5.1, -1)],
        'h1h2': [(1.0, 1.3, .5), (1.9, 5.1, 2.5), (1.9, 3, 3)],
        'pitch': [(0, 5.1, -1)],
        'h1a3': [(1.0, 1.3, .5), (1.9, 5.1, 2.5), (1.9, 3, 3.5)],
    },
    'care negative modal': {
        'creak': [(1.0, 1.3, -10), (3.1, 5.1, -5), (4.6, 5.1, -10)],
        'cpps': [(1.0, 1.3, 1), (3.1, 5.1, .5), (4.6, 5.1, 2)],
        'h1h2': [(1.0, 1.3, .5), (3.1, 5.1, .5), (4.6, 5.1, 1)],
        'pitch': [(0, 5.1, -1)],
        'h1a3': [(1.0, 1.3, .5), (3.1, 5.1, .5), (4.6, 5.1, .5)],
    },
    'care negative creaky': {
        'creak': [(1.6, 2.9, -10), (3.1, 3.9, 6), (3.9, 4.9, 5)],
        'cpps': [(1.6, 2.9, 1), (3.1, 4.9, -1)],
        'h1h2': [(1.6, 2.8, 0.5), (3.1, 4.9, -2)],
        'pitch': [(2, 2.9, 0), (3.1, 4.9, -2), (4.3, 4.6, -2.5)],
        'h1a3': [(1.6, 2.8, .5), (3.1, 4.9, -2)],
    },
    'care negative breathy': {
        'creak': [(1.6, 2.9, -10), (4.5, 4.9, 0)],
        'cpps': [(1.6, 2.9, 1), (3.1, 4.9, -1)],
        'h1h2': [(1.6, 2.8, 0.5), (3.1, 4.9, 2)],
        'pitch': [(2, 2.9, 0), (3.1, 4.9, 0)],
        'h1a3': [(1.6, 2.8, .5), (3.1, 4.9, 2)],
    },
    # Second preset that was labelled "care negative modal" in the original cell.
    'care negative modal (second version)': {
        'creak': [(1.6, 2.9, -10), (4.5, 4.9, -15)],
        'cpps': [(1.6, 2.9, 1), (4.5, 4.9, 1)],
        'h1h2': [(1.6, 2.8, 0.5), (4.5, 4.9, .5)],
        'pitch': [(2, 2.9, 0), (4.5, 4.9, 0)],
        'h1a3': [(1.6, 2.8, .5), (4.5, 4.9, .5)],
    },
    'handled positive breathy extra and variation': {
        'creak': [(1.5, 2.1, -15), (2.4, 3.8, 0)],
        'cpps': [(0, 2.5, 0), (2.5, 3.9, -1)],
        'h1h2': [(0, 2.5, 0), (2.5, 3.9, 2)],
        'pitch': [(0, 2.5, -1), (2.5, 3.9, .5)],
        'h1a3': [(0, 2.5, 0), (2.5, 3.9, 2)],
    },
    'handled positive creaky': {
        'creak': [(1.5, 2.1, -15), (2.4, 3.8, 5)],
        'cpps': [(0, 2.5, 0), (2.5, 3.9, -1.5)],
        'h1h2': [(0, 2.5, 0), (2.5, 3.9, -1)],
        'pitch': [(0, 3.9, -1)],
        'h1a3': [(0, 2.5, 0), (2.5, 3.9, -1)],
    },
    'burnt negative creaky': {
        'creak': [(0.6, 1.1, -10), (3.2, 4.1, 10), (4.1, 5.3, 5)],
        'cpps': [(3.3, 5.3, -3)],
        'h1h2': [(3.3, 5.3, -1)],
        'pitch': [(3.3, 5.3, -2)],
        'h1a3': [(3.3, 5.3, -1)],
    },
    'burnt negative breathy': {
        'creak': [(0.6, 1.1, -10), (4.8, 5.1, -5)],
        'cpps': [(3.3, 5.3, -.5)],
        'h1h2': [(3.3, 5.3, 2)],
        # Constant pitch offset; a [(3.3, 5.3, -.5)] track was tried and disabled.
        'pitch': 0,
        'h1a3': [(3.3, 5.3, 3)],
    },
    'burnt negative modal': {
        'creak': [(0.6, 1.1, -10), (4.3, 4.9, -10), (4.9, 5.1, -15), (5.1, 5.3, -25)],
        'cpps': [(0, 5.3, .5)],
        'h1h2': [(0, 5.3, 0)],
        'pitch': [(3.3, 5.3, -1)],
        'h1a3': [(0, 5.3, 0)],
    },
}


def apply_voice_preset(preset_name):
    """Write every control of VOICE_PRESETS[preset_name] into its slot of `args`.

    Segment lists become per-frame tracks sized from the current `creaky_audio`
    and are moved to `device`; plain numbers are stored unchanged.
    """
    for control, spec in VOICE_PRESETS[preset_name].items():
        if isinstance(spec, list):
            spec = create_creak_tensor(creaky_audio.shape[0], spec, sr).to(device)
        args[CONTROL_SLOTS[control]] = spec


# The preset that is actually rendered by this cell.
apply_voice_preset('handled positive breathy extra and variation')
creaky_audio = convert(args)

# The original cell finished with this preset left in `args`; keep that state
# for the cells that follow.
apply_voice_preset('burnt negative modal')

In [None]:
new_creak = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.6, 2.5)], sr)

args[-5] = new_creak

new_cpps = create_creak_tensor(creaky_audio.shape[0], [(0.8, 1, 1), (1.5, 2.6, -1)], sr)
args[-4] = new_cpps

new_h1h2 = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.6, -1)], sr)
args[-3] = new_h1h2

#new_pitch = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.85, -.5)], sr)
#args[-2] = new_pitch

new_h1a3 = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.6, -1)], sr)
args[-1] = new_h1a3


creaky_audio = convert(args)
#relax
new_creak = create_creak_tensor(creaky_audio.shape[0], [(2.1, 3, 4), (3, 4, 1)], sr)

args[-5] = new_creak

new_cpps = create_creak_tensor(creaky_audio.shape[0], [(2.1, 3, -2), (3, 4, -1)], sr)
args[-4] = new_cpps

new_h1h2 = create_creak_tensor(creaky_audio.shape[0], [(2.1, 3, -2), (3, 4 , 0)], sr)
args[-3] = new_h1h2

# new_pitch = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.85, -.5)], sr)
# args[-2] = new_pitch

new_h1a3 = create_creak_tensor(creaky_audio.shape[0], [(2.1, 3, -2), (3, 4 , 0)], sr)
args[-1] = new_h1a3
#time
new_creak = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.85, 2.5)], sr)

args[-5] = new_creak

new_cpps = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.85, -1), (2.2, 2.85, 0)], sr)
args[-4] = new_cpps

new_h1h2 = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.85, -1), (2.2, 2.85, 0)], sr)
args[-3] = new_h1h2

new_pitch = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.85, -.5)], sr)
args[-2] = new_pitch

new_h1a3 = create_creak_tensor(creaky_audio.shape[0], [(0, 1.4, .5), (1.5, 2.85, -1), (2.2, 2.85, 0)], sr)
args[-1] = new_h1a3

creaky_audio = convert(args)
#thumbs
new_creak = create_creak_tensor(creaky_audio.shape[0], [(0, 1.4, -5), (1.5, 2.2, 5),(2.2, 2.6, 0)], sr)

args[-5] = new_creak

new_cpps = create_creak_tensor(creaky_audio.shape[0], [(0, 1.4, 2), (1.5, 2.4, -1)], sr)
args[-4] = new_cpps

new_h1h2 = create_creak_tensor(creaky_audio.shape[0], [(0, 1.4, .5), (1.5, 2.4, -1)], sr)
args[-3] = new_h1h2

new_pitch = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.4, -1)], sr)
args[-2] = new_pitch

new_h1a3 = create_creak_tensor(creaky_audio.shape[0], [(0, 1.4, .5), (1.5, 2.4, -1)], sr)
args[-1] = new_h1a3

creaky_audio = convert(args)
#steps
new_creak = create_creak_tensor(creaky_audio.shape[0], [(0.5, 1.2, 0), (1.4, 3.1, 2),(1.4, 2.4, 5)], sr)

args[-5] = new_creak

new_cpps = create_creak_tensor(creaky_audio.shape[0], [(1.4, 3.1, -2)], sr)
args[-4] = new_cpps

new_h1h2 = create_creak_tensor(creaky_audio.shape[0], [(1.4, 3.1, -.5)], sr)
args[-3] = new_h1h2

new_pitch = create_creak_tensor(creaky_audio.shape[0], [(1.3, 3.1, -1)], sr)
args[-2] = new_pitch

new_h1a3 = create_creak_tensor(creaky_audio.shape[0], [(1.4, 3.1, -.5)], sr)
args[-1] = new_h1a3

creaky_audio = convert(args)
#planned
new_creak = create_creak_tensor(creaky_audio.shape[0], [(2.0, 3.6, 3)], sr)

args[-5] = new_creak

new_cpps = create_creak_tensor(creaky_audio.shape[0], [(2.0, 3.6, -1.5)], sr)
args[-4] = new_cpps

new_h1h2 = create_creak_tensor(creaky_audio.shape[0], [(2.0, 3.6, -1)], sr)
args[-3] = new_h1h2

new_pitch = create_creak_tensor(creaky_audio.shape[0], [(2, 3.6, 0)], sr)
args[-2] = new_pitch

new_h1a3 = create_creak_tensor(creaky_audio.shape[0], [(2, 3.6, -1)], sr)
args[-1] = new_h1a3

creaky_audio = convert(args)
#help
new_creak = create_creak_tensor(creaky_audio.shape[0], [(0.4, 1.2, -20), (1.4, 1.5, 5), (1.5, 2.7, 3)], sr)
#(2.5, 3.3, -15)
args[-5] = new_creak

new_cpps = create_creak_tensor(creaky_audio.shape[0], [(0.4, 1.2, 1), (1.4, 2.7, -2)], sr)
args[-4] = new_cpps

new_h1h2 = create_creak_tensor(creaky_audio.shape[0], [(0.4, 1.2, 1), (1.4, 2.7, -1)], sr)
args[-3] = new_h1h2

new_pitch = create_creak_tensor(creaky_audio.shape[0], [(0.4, 1.2, .2),(1.4, 1.6, -.5)], sr)
args[-2] = new_pitch

new_h1a3 = create_creak_tensor(creaky_audio.shape[0], [(0.4, 1.2, 1), (1.4, 2.7, -1)], sr)
args[-1] = new_h1a3

creaky_audio = convert(args)
#happened
new_creak = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.6, 3),(2.1, 2.6, 3), (2.4, 2.6, 1)], sr)
#(2.5, 3.3, -15)
args[-5] = new_creak

new_cpps = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.6, -1.5),(1.5, 1.8, -1.5), (2.4, 2.6, -1)], sr)
args[-4] = new_cpps

new_h1h2 = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.6, -1)], sr)
args[-3] = new_h1h2

new_pitch = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2, -1)], sr)
args[-2] = new_pitch

new_h1a3 = create_creak_tensor(creaky_audio.shape[0], [(1.5, 2.6, -1)], sr)
args[-1] = new_h1a3

creaky_audio = convert(args)
# "clearing": refill the five trailing slots of `args`, then convert again.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
# Disabled extra creak interval kept for reference: (2.5, 3.3, -15)
new_creak = args[-5] = create_creak_tensor(
    creaky_audio.shape[0], [(0.8, 1.2, -10), (1.2, 2.65, 5), (2.4, 2.65, 2)], sr
)
new_cpps = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(1.2, 2.65, -3), (2.4, 2.65, -2)], sr)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(1.2, 2.65, -2), (2.4, 2.65, -1)], sr)
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(1.2, 2.65, -1.5)], sr)
new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(1.2, 2.65, -2), (2.4, 2.65, -1)], sr)

# The result replaces creaky_audio, so later `.shape[0]` lookups read the converted signal.
creaky_audio = convert(args)
# "all": refill the five trailing slots of `args`, then convert again.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
# Disabled extra creak interval kept for reference: (2.5, 3.3, -15)
new_creak = args[-5] = create_creak_tensor(creaky_audio.shape[0], [(1.6, 2.9, 3)], sr)
new_cpps = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(1.6, 2.9, -0.5)], sr)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(1.6, 2.9, -1)], sr)
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(1.6, 2.9, -1)], sr)
new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(1.6, 2.9, -1)], sr)

# The result replaces creaky_audio, so later `.shape[0]` lookups read the converted signal.
creaky_audio = convert(args)
# "after": refill the five trailing slots of `args`, then convert again.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
# Disabled extra creak interval kept for reference: (2.5, 3.3, -15)
new_creak = args[-5] = create_creak_tensor(
    creaky_audio.shape[0], [(0.6, 1.1, -10), (1.7, 2.2, 3), (2.2, 3.3, 2)], sr
)
new_cpps = args[-4] = create_creak_tensor(
    creaky_audio.shape[0], [(0.6, 1.1, 1), (1.7, 2.2, -2), (2.2, 3.3, -1)], sr
)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(1.7, 3.3, -0.5)], sr)
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(1.7, 2.2, -2)], sr)
new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(1.7, 3.3, -0.5)], sr)

# The result replaces creaky_audio, so later `.shape[0]` lookups read the converted signal.
creaky_audio = convert(args)

In [None]:
# Unlabelled setting: refill the five trailing slots of `args`, then convert again.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(creaky_audio.shape[0], [(2.1, 3.6, 2)], sr)
new_cpps = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(2.1, 3.6, -1)], sr)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(2.1, 3.6, -2)], sr)
# Pitch uses a different span (1.8-4.4) from the other four tensors in this group.
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(1.8, 4.4, 0)], sr)
new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(2.1, 3.6, -2)], sr)

# The result replaces creaky_audio, so later `.shape[0]` lookups read the converted signal.
creaky_audio = convert(args)
# Unlabelled setting: refill the five trailing slots of `args`, then convert again.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(
    creaky_audio.shape[0], [(1.8, 4.4, -10), (3.8, 4.1, -20)], sr
)
new_cpps = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(1.8, 4.4, 2)], sr)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(1.8, 4.4, 0)], sr)
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(1.8, 4.4, 0)], sr)
new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(1.8, 4.4, 0)], sr)

# The result replaces creaky_audio, so later `.shape[0]` lookups read the converted signal.
creaky_audio = convert(args)

In [None]:
# "suddenly direct": refill the five trailing slots of `args`, then convert again.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(
    creaky_audio.shape[0], [(0, 1.2, -10), (0.7, 1, -15), (1.8, 4, 1), (3.9, 4.2, 0)], sr
)
new_cpps = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(0, 1.2, 1), (1.8, 4, -0.5)], sr)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(0, 1.2, 0.5), (1.8, 4, -2)], sr)
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(0, 1.2, 0.5), (1.8, 4, -0.5)], sr)
new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(0, 1.2, 0.5), (1.8, 4, -2)], sr)

# The result replaces creaky_audio, so later `.shape[0]` lookups read the converted signal.
creaky_audio = convert(args)
# "suddenly direct" (second setting under the same label): refill the five
# trailing slots of `args`, then convert again.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(
    creaky_audio.shape[0], [(1.7, 4.2, 5), (2.2, 2.7, 5), (3, 4.2, 5)], sr
)
new_cpps = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(1.7, 4.2, -1.5), (3, 4.2, -1)], sr)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(1.7, 4.2, -2), (3, 4.2, -1)], sr)
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(1.7, 4.2, -1)], sr)
new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(1.7, 4.2, -2), (3, 4.2, -1)], sr)

# The result replaces creaky_audio, so later `.shape[0]` lookups read the converted signal.
creaky_audio = convert(args)
# "started direct": refill the five trailing slots of `args`, then convert again.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(
    creaky_audio.shape[0], [(1.7, 2.4, 5), (1.7, 2.2, 12), (2.3, 2.7, 5)], sr
)
new_cpps = args[-4] = create_creak_tensor(
    creaky_audio.shape[0], [(1.7, 2.4, -1), (1.7, 2.0, -3), (2.3, 2.7, -1)], sr
)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(1.7, 2.4, -1), (2.3, 2.7, -1)], sr)
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(1.7, 2.7, -1)], sr)
new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(1.7, 2.4, -1), (2.3, 2.7, -1)], sr)

# The result replaces creaky_audio, so later `.shape[0]` lookups read the converted signal.
creaky_audio = convert(args)
# "started zero": refill the five trailing slots of `args`, then convert again.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(
    creaky_audio.shape[0], [(1.1, 1.24, 0), (1.25, 1.9, 10), (1.9, 2.3, 3)], sr
)
new_cpps = args[-4] = create_creak_tensor(
    creaky_audio.shape[0], [(1.1, 1.24, 1), (1.25, 1.9, -2), (1.9, 2.3, -1)], sr
)
new_h1h2 = args[-3] = create_creak_tensor(
    creaky_audio.shape[0], [(1.1, 1.24, 0), (1.25, 1.9, -1), (1.9, 2.3, -1)], sr
)
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(1.1, 1.9, -0.5), (1.9, 2.3, 0)], sr)
new_h1a3 = args[-1] = create_creak_tensor(
    creaky_audio.shape[0], [(1.1, 1.24, 0), (1.25, 1.9, -1), (1.9, 2.3, -1)], sr
)

# The result replaces creaky_audio, so later `.shape[0]` lookups read the converted signal.
creaky_audio = convert(args)

# "right direct / room zero": refill the five trailing slots of `args` and convert.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = create_creak_tensor(creaky_audio.shape[0], [(1.8, 4.4, -10)], sr)
args[-5] = new_creak

new_cpps = create_creak_tensor(creaky_audio.shape[0], [(1.8, 4.4, -2)], sr)
args[-4] = new_cpps

new_h1h2 = create_creak_tensor(creaky_audio.shape[0], [(1.8, 4.4, 2)], sr)
args[-3] = new_h1h2

new_pitch = create_creak_tensor(creaky_audio.shape[0], [(1.8, 4.4, 0.2)], sr)
args[-2] = new_pitch

new_h1a3 = create_creak_tensor(creaky_audio.shape[0], [(1.8, 4.4, 2)], sr)
args[-1] = new_h1a3

# This group previously had no convert() call: all five slots were overwritten by
# the following setting before anything was rendered, so these values were never used.
# NOTE(review): if the setting was meant to stay disabled, comment the whole group
# out instead of keeping this call.
new_creaky_audio = convert(args)

# "right zero": refill the five trailing slots of `args`, then convert.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(creaky_audio.shape[0], [(1.4, 3.5, -10)], sr)
new_cpps = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(1.4, 3.5, 2)], sr)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(1.4, 3.5, 2)], sr)
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(1.4, 3.5, 0.5)], sr)
new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(1.4, 3.5, 2)], sr)

# Stored under a separate name: creaky_audio itself is not replaced here.
new_creaky_audio = convert(args)

# "phone zero": refill the five trailing slots of `args`, then convert.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(
    creaky_audio.shape[0],
    [(0.4, 0.8, -10), (1.4, 1.7, -5), (2.1, 2.9, -15), (2.9, 3.3, -20)],
    sr,
)
new_cpps = args[-4] = create_creak_tensor(
    creaky_audio.shape[0], [(0.4, 0.8, 2), (1.4, 1.7, 1), (2.1, 2.9, -3)], sr
)
new_h1h2 = args[-3] = create_creak_tensor(
    creaky_audio.shape[0], [(0.4, 0.8, 1), (1.4, 1.7, 0), (2.1, 2.9, 3)], sr
)
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(0.4, 0.8, 0.2), (2.1, 3.1, 0.2)], sr)
new_h1a3 = args[-1] = create_creak_tensor(
    creaky_audio.shape[0], [(0.4, 0.8, 1), (1.4, 1.7, 1), (2.1, 2.9, 3)], sr
)

# Stored under a separate name: creaky_audio itself is not replaced here.
new_creaky_audio = convert(args)
# "late direct": refill the creak, cpps, h1h2 and h1a3 slots of `args`, then convert.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(creaky_audio.shape[0], [(1.0, 1.3, -5), (1.5, 3.3, -5)], sr)
new_cpps = args[-4] = create_creak_tensor(
    creaky_audio.shape[0], [(1.0, 1.3, 1), (1.5, 3.3, -2), (1.7, 2.4, -3)], sr
)
new_h1h2 = args[-3] = create_creak_tensor(
    creaky_audio.shape[0], [(1.0, 1.3, 0.5), (1.5, 3.3, -2), (1.7, 2.4, -3)], sr
)

# The pitch slot (args[-2]) is not touched here, so it keeps whatever was stored last.
#new_pitch = create_creak_tensor(creaky_audio.shape[0], [(1.7, 3.5, 0.0)], sr)
#args[-2] = new_pitch

new_h1a3 = args[-1] = create_creak_tensor(
    creaky_audio.shape[0], [(1.0, 1.3, 0.5), (1.5, 3.3, -2), (1.7, 2.4, -3)], sr
)

# Stored under a separate name: creaky_audio itself is not replaced here.
new_creaky_audio = convert(args)
# "late zero": refill the creak, cpps, h1h2 and h1a3 slots of `args`, then convert.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(creaky_audio.shape[0], [(0.3, 1.0, -2), (1.3, 2.1, 0)], sr)
new_cpps = args[-4] = create_creak_tensor(
    creaky_audio.shape[0], [(0.3, 1.0, 1), (1.3, 2.1, -2), (1.4, 1.6, -2)], sr
)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(0.3, 1.0, 1), (1.3, 2.1, 1)], sr)

# The pitch slot (args[-2]) is not touched here, so it keeps whatever was stored last.
#new_pitch = create_creak_tensor(creaky_audio.shape[0], [(1.7, 3.5, 0.0)], sr)
#args[-2] = new_pitch

new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(0.3, 1.0, 1), (1.3, 2.1, 3)], sr)

# Stored under a separate name: creaky_audio itself is not replaced here.
new_creaky_audio = convert(args)
# "late direct" (second setting under this label): refill the creak, cpps, h1h2
# and h1a3 slots of `args`, then convert.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(creaky_audio.shape[0], [(0.9, 1.3, -8), (1.8, 2.4, -5)], sr)
new_cpps = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(0.9, 1.3, 1), (1.8, 2.4, -1)], sr)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(0.9, 1.3, 1), (1.8, 2.4, 2)], sr)

# The pitch slot (args[-2]) is not touched here, so it keeps whatever was stored last.
#new_pitch = create_creak_tensor(creaky_audio.shape[0], [(1.7, 3.5, 0.0)], sr)
#args[-2] = new_pitch

new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(0.9, 1.3, 1), (1.8, 2.4, 2)], sr)

# Stored under a separate name: creaky_audio itself is not replaced here.
new_creaky_audio = convert(args)
# "food direct": refill the creak, cpps, h1h2 and h1a3 slots of `args`, then convert.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(
    creaky_audio.shape[0], [(0.3, 0.5, -8), (0.6, 2, -8), (2.4, 3.3, -5)], sr
)
new_cpps = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(0.6, 2, 1), (2.4, 3.3, 0)], sr)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(0.6, 2, 0), (2.4, 3.3, 2.5)], sr)

# The pitch slot (args[-2]) is not touched here, so it keeps whatever was stored last.
#new_pitch = create_creak_tensor(creaky_audio.shape[0], [(1.7, 3.5, 0.0)], sr)
#args[-2] = new_pitch

new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(0.6, 2, 0), (2.4, 3.3, 3)], sr)

# Stored under a separate name: creaky_audio itself is not replaced here.
new_creaky_audio = convert(args)
# "Food zero creaky": refill the five trailing slots of `args`, then convert.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(
    creaky_audio.shape[0], [(1.2, 1.6, -10), (1.7, 3.6, 4), (2.1, 2.7, 8)], sr
)
new_cpps = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(0, 1.6, 1), (1.7, 3.5, -1)], sr)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(0, 1.6, 1), (1.7, 3.5, -1)], sr)
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(0, 3.5, 0.0)], sr)
new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(0, 1.6, 1), (1.7, 3.5, -1)], sr)

# Stored under a separate name: creaky_audio itself is not replaced here.
new_creaky_audio = convert(args)

# "modal": refill the five trailing slots of `args`, then convert.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(creaky_audio.shape[0], [(0, 2.8, -5), (0.7, 1.5, -10)], sr)
new_cpps = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(0, 2.8, 1), (0.7, 1.5, 2)], sr)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(0, 2.8, 1), (0.7, 1.5, 2)], sr)
# Pitch uses a different span (1.7-3.5) from the other four tensors in this group.
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(1.7, 3.5, 0.0)], sr)
new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(0, 2.8, 1), (0.7, 1.5, 2)], sr)

# Stored under a separate name: creaky_audio itself is not replaced here.
new_creaky_audio = convert(args)

# "breathy": refill the five trailing slots of `args`, then convert.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(creaky_audio.shape[0], [(0.7, 1.4, -10), (1.4, 2.9, -5)], sr)
new_cpps = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(0, 1.4, 1), (1.4, 2.9, -2)], sr)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(0, 1.4, 1), (1.4, 2.9, 3.2)], sr)
# Pitch uses a different span (1.7-3.5) from the other four tensors in this group.
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(1.7, 3.5, 0)], sr)
new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(0, 1.4, 1), (1.4, 2.9, 3.2)], sr)

# Stored under a separate name: creaky_audio itself is not replaced here.
new_creaky_audio = convert(args)

In [None]:

# "creaky": refill the five trailing slots of `args`, then convert.
# Each triple is presumably (start, end, value) — confirm against create_creak_tensor.
new_creak = args[-5] = create_creak_tensor(
    creaky_audio.shape[0], [(1.2, 1.6, -5), (1.5, 2.4, 0), (2.4, 3.2, -5)], sr
)
new_cpps = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(0, 1.6, 1), (1.5, 3.2, -0.5)], sr)
new_h1h2 = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(1.5, 3.2, 2)], sr)
new_pitch = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(1.6, 3.2, 0)], sr)
new_h1a3 = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(1.5, 3.2, 2)], sr)

# Stored under a separate name: creaky_audio itself is not replaced here.
new_creaky_audio = convert(args)

In [None]:
# Four-tensor setting: creak -> args[-4], cpps -> args[-3], h1h2 -> args[-2], pitch -> args[-1].
# NOTE(review): other cells in this notebook place creak/cpps/h1h2/pitch/h1a3 at
# args[-5] .. args[-1]; this cell uses a different layout and sets no h1a3.
# Confirm which layout convert() expects before running it.
# Disabled earlier setting kept for reference:
#new_creaks = create_creak_tensor(creaky_audio.shape[0], [(0.589, 0.81, -4), (0.81, 1.04, -10), (1.19, 1.49, -14), (1.87, 2.0, -10), (2.0, 2.98, 5)], sr)
new_h1h2 = args[-2] = create_creak_tensor(creaky_audio.shape[0], [(2.3, 4.8, -2)], sr)
new_cpps = args[-3] = create_creak_tensor(creaky_audio.shape[0], [(2.3, 4.8, -2)], sr)
new_creak = args[-4] = create_creak_tensor(creaky_audio.shape[0], [(2.3, 4.8, 15)], sr)
new_pitch = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(2.3, 4.8, 0)], sr)

# Stored under a separate name: creaky_audio itself is not replaced here.
new_creaky_audio = convert(args)

#breathy = [5, -10, 10, 0]


In [None]:
# Four-tensor setting: creak -> args[-4], h1h2 -> args[-2], pitch -> args[-1].
# NOTE(review): other cells in this notebook place creak/cpps/h1h2/pitch/h1a3 at
# args[-5] .. args[-1]; this cell uses a different layout and sets no h1a3.
# Confirm which layout convert() expects before running it.
# Disabled earlier setting kept for reference:
#new_creaks = create_creak_tensor(creaky_audio.shape[0], [(0.589, 0.81, -4), (0.81, 1.04, -10), (1.19, 1.49, -14), (1.87, 2.0, -10), (2.0, 2.98, 5)], sr)
new_h1h2 = args[-2] = create_creak_tensor(
    creaky_audio.shape[0], [(5.3, 7.5, 10), (7.5, 9, 5), (9, 10.3, 10)], sr
)

# new_cpps is built but not written into args: the assignment below is disabled,
# so args[-3] keeps its previous content.
new_cpps = create_creak_tensor(
    creaky_audio.shape[0], [(5.3, 7.5, -15), (7.5, 9, -10), (9, 10.0, -16)], sr
)
#args[-3] = new_cpps

new_creak = args[-4] = create_creak_tensor(
    creaky_audio.shape[0], [(5.3, 7.5, 10), (7.5, 9, 10), (9, 10, 10)], sr
)
new_pitch = args[-1] = create_creak_tensor(creaky_audio.shape[0], [(5.7, 10, -0.5)], sr)

# Stored under a separate name: creaky_audio itself is not replaced here.
new_creaky_audio = convert(args)
#print(args)

In [None]:
# Run creapy on the output audio (the "_cpp" file) and plot the smoothed result.
# Note: this rebinds the notebook-level `sr` to the value returned by creapy.
X_test, y_pred, sr = creapy.process_file(
    textgrid_path='./data/out/creaky_vctk_whisper/creaky_vctk_liam_cpp.TextGrid',
    audio_path='./data/out/creaky_vctk_liam_cpp.wav',
)

# H1-H2 column: replace NaN with 0, then apply a 20-point moving average
# (mode='same' keeps the original length).
X_test['h1h2'] = np.nan_to_num(X_test['h1h2'])
X_test['h1h2'] = np.convolve(X_test['h1h2'], np.ones(20) / 20, mode='same')

# Same 20-point moving average on the predictions before plotting.
y_pred_smoothed = np.convolve(y_pred, np.ones(20) / 20, mode='same')
fig2 = creapy.plot(X_test, y_pred_smoothed, sr, words=timestamps)

In [None]:
# Run creapy on the output audio and plot the smoothed predictions.
# Note: this rebinds the notebook-level `sr` to the value returned by creapy.
X_test, y_pred, sr = creapy.process_file(
    textgrid_path='./data/out/creaky_vctk_whisper/creaky_vctk_liam.TextGrid',
    audio_path='./data/out/creaky_vctk_liam.wav',
)

# 20-point moving average; mode='same' keeps the original length.
y_pred_smoothed = np.convolve(y_pred, np.ones(20) / 20, mode='same')
fig2 = creapy.plot(X_test, y_pred_smoothed, sr, words=timestamps)
