In [None]:
%%capture
!pip install gdown
!pip install TTS
!sudo apt-get install espeak-ng -y
!sudo apt-get install unzip
!gdown "https://drive.google.com/uc?id=your_id"
!unzip /teamspace/studios/this_studio/data.zip -d /teamspace/studios/this_studio/data

In [None]:
from huggingface_hub import snapshot_download
local_data_dir = "/teamspace/studios/this_studio/model/male_vits_23_dec_2024-December-23-2024_09+14AM-0000000"
dataset_repo = "sifat1221/vits_bn_tts_checkpoint_45000"
file_path = snapshot_download(repo_id=dataset_repo, local_dir=local_data_dir)

In [1]:
import os
folder_path = "/teamspace/studios/this_studio/data/data/wav"
file_count = len([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])
print(f"Number of files in '{folder_path}': {file_count}")

Number of files in '/teamspace/studios/this_studio/data/data/wav': 17670


In [2]:
import numpy as np
import pandas as pd
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.utils.audio import AudioProcessor
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.models.vits import Vits, VitsAudioConfig, CharactersConfig

In [3]:
if "COLAB_GPU" in os.environ:
    from google.colab import drive
    drive.mount('/content/drive')
    data_root = '/content/drive/MyDrive/IIT/4-2/ML/ML-Project_Team_minions/data'
    model_root = '/content/drive/MyDrive/IIT/4-2/ML/ML-Project_Team_minions/Models'
    
elif os.path.exists("/kaggle"):
    print("Running in Kaggle")
    data_root = '/kaggle/input/bangla-audio/data'
    model_root = '/kaggle/working/'

else:
    print("Running in Lightening")
    data_root = '/teamspace/studios/this_studio/data/data'
    model_root = '/teamspace/studios/this_studio/model'


Running in Lightening


In [4]:
male = True
pretrained = True
pretrained_path = ""
if(pretrained):
    pretrained_path = '/teamspace/studios/this_studio/model/male_vits_23_dec_2024-December-23-2024_09+14AM-0000000'
    
if(male):
    meta_file = f'{data_root}/metadata.csv'
    root_path = f'{data_root}'
else:
    meta_file = f'{data_root}/female/mono/metadata_female.txt'
    root_path = f'{data_root}/female/mono'

In [5]:
def formatter(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
    txt_file = meta_file
    items = []
    speaker_name = "ljspeech"
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            cols = line.split("|")
            wav_file = os.path.join(root_path, "wav", cols[0] + ".wav")
            print("wav_file:",wav_file)
            text = ''
            try:
                text = cols[2]
                print("text:",text)
            except:
                print("not found")
            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
    return items

In [6]:
dataset_config = BaseDatasetConfig(meta_file_train=meta_file, path=os.path.join(root_path, ""))

train_samples, eval_samples = load_tts_samples(dataset_config,formatter=formatter, eval_split=True, eval_split_size=0.2)
# print(help(load_tts_samples))

print(len(train_samples),len(eval_samples))

wav_file: /teamspace/studios/this_studio/data/data/wav/01001.wav
text: উনিশ মে দুপুর তিনটায় হোটেল সুন্দরবনে আপনার একটি রিজার্ভেশন আছে।

wav_file: /teamspace/studios/this_studio/data/data/wav/01002.wav
text: শেখ হাসিনা বাংলাদেশ আওয়ামী লীগের সভানেত্রী।

wav_file: /teamspace/studios/this_studio/data/data/wav/01003.wav
text: আপনার বাড়ি এবং আপনার কাজের জায়গার মধ্যে ট্রাফিক দেখানো হচ্ছে ।

wav_file: /teamspace/studios/this_studio/data/data/wav/01004.wav
text: জাহানারা ইমামের জন্ম উনিশ শত উনত্রিশ সালের তিন মে বর্তমান পশ্চিমবঙ্গের মুর্শিদাবাদ জেলায়।

wav_file: /teamspace/studios/this_studio/data/data/wav/01005.wav
text: এইগুলো সিক্স ফ্ল্যাগস এর কয়েকটি রাইড।

wav_file: /teamspace/studios/this_studio/data/data/wav/01006.wav
text: আপনি শেষ হবার সময় বলতে পারেন উদাহরণস্বরূপ ছয় পি এম।

wav_file: /teamspace/studios/this_studio/data/data/wav/01007.wav
text: মুহম্মদ জাফর ইকবাল একজন বাংলাদেশী লেখক, পদার্থবিদ ও শিক্ষাবিদ।

wav_file: /teamspace/studios/this_studio/data/data/wav/01008.wav
text: এই 

In [7]:
output_path = model_root
phoneme_cache_path = os.path.join(output_path, "phoneme_cache")
os.makedirs(output_path,exist_ok=True)


audio_config = VitsAudioConfig(sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None)
config = VitsConfig(
    audio=audio_config,
    run_name="male_vits_23_dec_2024",
    batch_size=48,
    eval_batch_size=32,
    epochs=1000,
    save_step=5000,
    print_step=500,
    batch_group_size=0,
    num_loader_workers=8,
    num_eval_loader_workers=8,
    run_eval=True,
    test_delay_epochs=-1,
    phonemizer="espeak",
    text_cleaner='phoneme_cleaners', #'multilingual_cleaners', #"collapse_whitespace" phoneme_cleaners multilingual_cleaners
    use_phonemes=True,
    phoneme_language="bn",
    phoneme_cache_path=phoneme_cache_path,
    compute_input_seq_cache=True,
    add_blank=True,
    use_language_weighted_sampler = True,
    print_eval=False,
    mixed_precision=False,
    output_path=output_path,
    datasets=[dataset_config],
    # characters = characters_config,
    cudnn_benchmark=True,
    test_sentences = [
        'আমার   সোনার বাংলা, আমি তোমায় ভালোবাসি।',
        'চিরদিন   তোমার আকাশ, তোমার বাতাস, আমার প্রাণে বাজায় বাঁশি',
        'ও মা,   ফাগুনে তোর আমের বনে ঘ্রাণে পাগল করে,মরি হায়, হায় রে।'
    ]
)

ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)
model = Vits(config, ap, tokenizer, speaker_manager=None)
trainer = Trainer(TrainerArgs(continue_path = pretrained_path), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples)

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 16
 | > Num. of Torch Threads: 8
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: True
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/teamspace/studios/this_studio/model/male_vits_23_dec_2024-December-23-2024_09+14AM-0000000
 > Restoring from best_model_17967.pth ...
  return torch.load(f, map_location=map_location, **kwargs)
 > Restoring Model...
 > Restoring Optimizer...
 > Model restored from step 17967
  self.scaler = torch.cuda.amp.GradScaler()

 > Model has 83059180 parameters


In [8]:
%%time
trainer.fit()

 > Restoring best loss from best_model_17967.pth ...
 > Starting with loaded last best loss {'train_loss': 15.75434464175685, 'eval_loss': 15.74705525745045}

[4m[1m > EPOCH: 0/1000[0m
 --> /teamspace/studios/this_studio/model/male_vits_23_dec_2024-December-23-2024_09+14AM-0000000


[*] Pre-computing phonemes...


  0%|          | 8/14136 [00:00<03:10, 74.18it/s]

ˌei nˈiɾbatʃˌɔn ˈupɔɟˌela bˈa ɟˈela bʰˈittik hˈɔbe
 [!] Character 'ʰ' not found in the vocabulary. Discarding it.


  0%|          | 23/14136 [00:00<04:23, 53.58it/s]

ˈũki dˈie dˈekʰɔlˌam mˈɔjɔlˌaɾ bˈakʃeɾ pˈaʃeɾ tʃʰˈoʈʈɔ ɟˈajɔɡˌaʈajˌo ˈaɟ ke kˈi fˈele ɡˈel! ˈɔbʰæʃ, dʒˈodiˌoː ˌaɾ kˈitʃʰu lˈaɡɔbˌe nˈa
 [!] Character '̃' not found in the vocabulary. Discarding it.


  3%|▎         | 459/14136 [00:09<05:12, 43.80it/s]

nˈoɟibˌɔullˌaho mˈatʰa tʃˈulɔkˌe bˈɔlɔl, hˈɔjo ʃˈotːi kˈɔtʰa bˈɔlɔtˌe hˈɔbe”dˈɔɳɖɔ
 [!] Character '”' not found in the vocabulary. Discarding it.


 47%|████▋     | 6625/14136 [02:08<02:50, 44.18it/s]

tˌaɾ kˈatʃʰe pɾˈɔʃnɔ tʃʰˌil — ˈek ʈɾˈak kˈãtʃa tˈɔɾɔkˌaɾi ɡɾˈameɾ bˈaɟaɾ tʰˈeke ɖʰˈakajˌo ˈanɔtˌe kˈɔto ʈˈaka tʃˈãda dˈite hˈɔjo?
 [!] Character '—' not found in the vocabulary. Discarding it.


100%|██████████| 14136/14136 [04:44<00:00, 49.77it/s]

[1m > TRAINING (2024-12-24 06:17:35) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: bn
		| > phoneme backend: espeak
	| > 4 not found characters:
	| > ʰ
	| > ̃
	| > ”
	| > —
| > Number of instances : 14136
 | > Preprocessing samples
 | > Max text length: 182
 | > Min text length: 10
 | > Avg text length: 68.06840690435767
 | 
 | > Max audio length: 374431.0
 | > Min audio length: 10539.0
 | > Avg audio length: 134871.0290039615
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.


Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:873.)
  return _VF.stft(  # type: ignore[attr-defined]
  with autocast(enabled=False):  # use float32 for the criterion
  with autocast(enabled=False):
  with autocast(enabled=False):  # use float32 for the criterion

[1m   --> TIME: 2024-12-24 06:19:00 -- STEP: 32/295 -- GLOBAL_STEP: 18000[0m
     | > loss_disc: 2.638986825942993  (2.688685715198517)
     | > loss_disc_real_0: 0.14179903268814087  (0.19451228203251958)
     | > loss_disc_real_1: 0.20070795714855194  (0.2364151389338076)
     | > loss_disc_real_2: 0.21623851358890533  (0.23388373339548707)
     | > loss_disc_real_3: 0.22530348598957062  (0.23234305949881673)
     | > loss_disc_real_4: 0.22967204451560974  (0.22832672949880362)
     | > loss_disc_real_5: 0.2262163907289505  (0.22992638079449534)
     | > loss_0: 2.638986825942993  (2.688685715198517)
     



> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: bn
		| > phoneme backend: espeak
	| > 4 not found characters:
	| > ʰ
	| > ̃
	| > ”
	| > —
| > Number of instances : 3534
 | > Preprocessing samples
 | > Max text length: 167
 | > Min text length: 10
 | > Avg text length: 67.07923033389926
 | 
 | > Max audio length: 369866.0
 | > Min audio length: 16934.0
 | > Avg audio length: 132509.87973967177
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.


  test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)

  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.010606702891263092 [0m(+0)
     | > avg_loss_disc: 2.6783349882472645 [0m(+0)
     | > avg_loss_disc_real_0: 0.12841068709438494 [0m(+0)
     | > avg_loss_disc_real_1: 0.22357474132017655 [0m(+0)
     | > avg_loss_disc_real_2: 0.21483484642072156 [0m(+0)
     | > avg_loss_disc_real_3: 0.2492537899450824 [0m(+0)
     | > avg_loss_disc_real_4: 0.2530605597929521 [0m(+0)
     | > avg_loss_disc_real_5: 0.25577399297194037 [0m(+0)
     | > avg_loss_0: 2.6783349882472645 [0m(+0)
     | > avg_loss_gen: 1.8443484317172658 [0m(+0)
     | > avg_loss_kl: 1.8072612263939598 [0m(+0)
     | > avg_loss_feat: 4.106909225203776 [0m(+0)
     | > avg_loss_mel: 19.586368352716622 [0m(+0)
     | > avg_loss_duration: 1.622936460104855 [0m(+0)
     | > avg_loss_1: 28.967823808843438 [0m(+0)


[4m[1m > EPOCH: 1/1000[0m
 --> /teamspace/studios/

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.008908718282526188 [0m(-0.0016979846087369042)
     | > avg_loss_disc:[91m 2.715711784362793 [0m(+0.03737679611552869)
     | > avg_loss_disc_real_0:[92m 0.1239762550727888 [0m(-0.004434432021596144)
     | > avg_loss_disc_real_1:[91m 0.2923028593713587 [0m(+0.06872811805118215)
     | > avg_loss_disc_real_2:[91m 0.24742040227759968 [0m(+0.03258555585687811)
     | > avg_loss_disc_real_3:[92m 0.2113635241985321 [0m(-0.0378902657465503)
     | > avg_loss_disc_real_4:[91m 0.26320134171030746 [0m(+0.010140781917355346)
     | > avg_loss_disc_real_5:[92m 0.2207076905803247 [0m(-0.03506630239161568)
     | > avg_loss_0:[91m 2.715711784362793 [0m(+0.03737679611552869)
     | > avg_loss_gen:[91m 1.8672880606217819 [0m(+0.022939628904516063)
     | > avg_loss_kl:[91m 1.8207757180387323 [0m(+0.013514491644772475)
     | > avg_loss_feat:[91m 4.259733098203482 [0m(+0.15282387299970512)
     | > avg_loss_mel:[

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.008983165567571463 [0m(+7.444728504527563e-05)
     | > avg_loss_disc:[92m 2.655404145067388 [0m(-0.06030763929540495)
     | > avg_loss_disc_real_0:[91m 0.1683616762811487 [0m(+0.04438542120835991)
     | > avg_loss_disc_real_1:[92m 0.23201455880295146 [0m(-0.06028830056840723)
     | > avg_loss_disc_real_2:[91m 0.25125553905963893 [0m(+0.0038351367820392557)
     | > avg_loss_disc_real_3:[91m 0.258006124604832 [0m(+0.04664260040629992)
     | > avg_loss_disc_real_4:[92m 0.2166108097542416 [0m(-0.046590531956065856)
     | > avg_loss_disc_real_5:[91m 0.23500143899158998 [0m(+0.014293748411265295)
     | > avg_loss_0:[92m 2.655404145067388 [0m(-0.06030763929540495)
     | > avg_loss_gen:[91m 1.909628532149575 [0m(+0.04234047152779308)
     | > avg_loss_kl:[92m 1.7621268727562645 [0m(-0.05864884528246783)
     | > avg_loss_feat:[92m 4.200165579535745 [0m(-0.059567518667736685)
     | > avg_loss_mel:

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.008949750119989568 [0m(-3.341544758189534e-05)
     | > avg_loss_disc:[91m 2.698574920134112 [0m(+0.04317077506672362)
     | > avg_loss_disc_real_0:[92m 0.16159157427874477 [0m(-0.006770102002403938)
     | > avg_loss_disc_real_1:[91m 0.24313294399868357 [0m(+0.011118385195732106)
     | > avg_loss_disc_real_2:[91m 0.278613265265118 [0m(+0.02735772620547905)
     | > avg_loss_disc_real_3:[92m 0.2462712775577199 [0m(-0.011734847047112112)
     | > avg_loss_disc_real_4:[92m 0.20899342637170445 [0m(-0.007617383382537146)
     | > avg_loss_disc_real_5:[91m 0.2411687432364984 [0m(+0.0061673042449084114)
     | > avg_loss_0:[91m 2.698574920134112 [0m(+0.04317077506672362)
     | > avg_loss_gen:[91m 1.9100138826803728 [0m(+0.00038535053079780113)
     | > avg_loss_kl:[91m 1.824922427264127 [0m(+0.06279555450786245)
     | > avg_loss_feat:[92m 4.162331340529704 [0m(-0.03783423900604088)
     | > avg_loss_m

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.008839889006181198 [0m(-0.00010986111380837)
     | > avg_loss_disc:[92m 2.632109479470687 [0m(-0.06646544066342486)
     | > avg_loss_disc_real_0:[92m 0.1386160284958103 [0m(-0.022975545782934453)
     | > avg_loss_disc_real_1:[91m 0.2671734448183665 [0m(+0.024040500819682953)
     | > avg_loss_disc_real_2:[92m 0.23627768849784678 [0m(-0.042335576767271205)
     | > avg_loss_disc_real_3:[91m 0.26490896398370933 [0m(+0.018637686425989425)
     | > avg_loss_disc_real_4:[91m 0.24933594614267357 [0m(+0.040342519770969115)
     | > avg_loss_disc_real_5:[91m 0.24914496879685996 [0m(+0.007976225560361566)
     | > avg_loss_0:[92m 2.632109479470687 [0m(-0.06646544066342486)
     | > avg_loss_gen:[91m 2.0088981455022648 [0m(+0.098884262821892)
     | > avg_loss_kl:[92m 1.8039923277768222 [0m(-0.020930099487304732)
     | > avg_loss_feat:[91m 4.172747109153051 [0m(+0.010415768623347255)
     | > avg_loss_mel

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.008993768692016598 [0m(+0.00015387968583539975)
     | > avg_loss_disc:[91m 2.6467043356461963 [0m(+0.014594856175509374)
     | > avg_loss_disc_real_0:[91m 0.25373346019874904 [0m(+0.11511743170293873)
     | > avg_loss_disc_real_1:[92m 0.2143338906494054 [0m(-0.052839554168961134)
     | > avg_loss_disc_real_2:[91m 0.23764083182269877 [0m(+0.0013631433248519897)
     | > avg_loss_disc_real_3:[92m 0.22894084575501356 [0m(-0.03596811822869578)
     | > avg_loss_disc_real_4:[92m 0.2353962230411443 [0m(-0.013939723101529272)
     | > avg_loss_disc_real_5:[92m 0.24713220596313487 [0m(-0.0020127628337250902)
     | > avg_loss_0:[91m 2.6467043356461963 [0m(+0.014594856175509374)
     | > avg_loss_gen:[91m 2.0160228859294547 [0m(+0.007124740427189913)
     | > avg_loss_kl:[91m 1.8328337821093472 [0m(+0.02884145433252505)
     | > avg_loss_feat:[92m 4.0845135472037555 [0m(-0.08823356194929577)
     | > avg

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.009020534428683192 [0m(+2.676573666659382e-05)
     | > avg_loss_disc:[91m 2.652299577539616 [0m(+0.005595241893419534)
     | > avg_loss_disc_real_0:[92m 0.1757214363325726 [0m(-0.07801202386617645)
     | > avg_loss_disc_real_1:[91m 0.22654537639834665 [0m(+0.012211485748941264)
     | > avg_loss_disc_real_2:[91m 0.2665772580287673 [0m(+0.02893642620606854)
     | > avg_loss_disc_real_3:[91m 0.2322038791396401 [0m(+0.0032630333846265513)
     | > avg_loss_disc_real_4:[91m 0.23573052395473826 [0m(+0.0003343009135939601)
     | > avg_loss_disc_real_5:[92m 0.22745067423040216 [0m(-0.01968153173273271)
     | > avg_loss_0:[91m 2.652299577539616 [0m(+0.005595241893419534)
     | > avg_loss_gen:[92m 1.9532636707479303 [0m(-0.06275921518152439)
     | > avg_loss_kl:[92m 1.7869010816920887 [0m(-0.0459327004172585)
     | > avg_loss_feat:[91m 4.4705576658248924 [0m(+0.38604411862113697)
     | > avg_loss_m

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.009035929766568267 [0m(+1.5395337885075466e-05)
     | > avg_loss_disc:[91m 2.727060012383894 [0m(+0.0747604348442783)
     | > avg_loss_disc_real_0:[92m 0.1652330455454913 [0m(-0.010488390787081275)
     | > avg_loss_disc_real_1:[91m 0.2778320694511584 [0m(+0.05128669305281175)
     | > avg_loss_disc_real_2:[92m 0.2550421568480405 [0m(-0.011535101180726814)
     | > avg_loss_disc_real_3:[91m 0.29406354644081795 [0m(+0.06185966730117784)
     | > avg_loss_disc_real_4:[91m 0.2646413954821498 [0m(+0.028910871527411564)
     | > avg_loss_disc_real_5:[91m 0.23295050046660684 [0m(+0.00549982623620468)
     | > avg_loss_0:[91m 2.727060012383894 [0m(+0.0747604348442783)
     | > avg_loss_gen:[91m 2.061070566827602 [0m(+0.10780689607967187)
     | > avg_loss_kl:[91m 1.8051738413897427 [0m(+0.018272759697653962)
     | > avg_loss_feat:[92m 4.178379013321618 [0m(-0.29217865250327435)
     | > avg_loss_mel:[9

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.00925726890563965 [0m(+0.00022133913907138383)
     | > avg_loss_disc:[92m 2.6973066069863063 [0m(-0.029753405397587862)
     | > avg_loss_disc_real_0:[91m 0.19898059964179993 [0m(+0.03374755409630861)
     | > avg_loss_disc_real_1:[92m 0.24852426905523656 [0m(-0.029307800395921846)
     | > avg_loss_disc_real_2:[91m 0.27600720551880925 [0m(+0.02096504867076876)
     | > avg_loss_disc_real_3:[92m 0.22760853740301998 [0m(-0.06645500903779797)
     | > avg_loss_disc_real_4:[92m 0.2428733306852254 [0m(-0.02176806479692442)
     | > avg_loss_disc_real_5:[91m 0.2355040654540062 [0m(+0.002553564987399365)
     | > avg_loss_0:[92m 2.6973066069863063 [0m(-0.029753405397587862)
     | > avg_loss_gen:[92m 2.0175450227477345 [0m(-0.04352554407986764)
     | > avg_loss_kl:[92m 1.7992412773045627 [0m(-0.005932564085179948)
     | > avg_loss_feat:[91m 4.433280179717322 [0m(+0.2549011663957037)
     | > avg_loss_m

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.009094736792824477 [0m(-0.00016253211281517382)
     | > avg_loss_disc:[91m 2.7476161566647614 [0m(+0.050309549678455134)
     | > avg_loss_disc_real_0:[91m 0.207763327116316 [0m(+0.008782727474516083)
     | > avg_loss_disc_real_1:[91m 0.26139856766570707 [0m(+0.012874298610470514)
     | > avg_loss_disc_real_2:[92m 0.2558159786191852 [0m(-0.02019122689962405)
     | > avg_loss_disc_real_3:[91m 0.2927618765018204 [0m(+0.06515333909880042)
     | > avg_loss_disc_real_4:[91m 0.2836745510047131 [0m(+0.04080122031948771)
     | > avg_loss_disc_real_5:[91m 0.300874180549925 [0m(+0.06537011509591878)
     | > avg_loss_0:[91m 2.7476161566647614 [0m(+0.050309549678455134)
     | > avg_loss_gen:[91m 2.1973374507643952 [0m(+0.17979242801666073)
     | > avg_loss_kl:[91m 1.8064810980450023 [0m(+0.0072398207404396064)
     | > avg_loss_feat:[92m 4.375297641754149 [0m(-0.05798253796317265)
     | > avg_loss_mel

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.009040468389337716 [0m(-5.426840348676104e-05)
     | > avg_loss_disc:[92m 2.6749726187099094 [0m(-0.07264353795485201)
     | > avg_loss_disc_real_0:[92m 0.18518026714975183 [0m(-0.02258305996656418)
     | > avg_loss_disc_real_1:[92m 0.25814908851276747 [0m(-0.003249479152939605)
     | > avg_loss_disc_real_2:[92m 0.2254534044049003 [0m(-0.0303625742142849)
     | > avg_loss_disc_real_3:[92m 0.23703226853500714 [0m(-0.055729607966813255)
     | > avg_loss_disc_real_4:[92m 0.22930611900307915 [0m(-0.054368432001633965)
     | > avg_loss_disc_real_5:[92m 0.19871076399629767 [0m(-0.10216341655362732)
     | > avg_loss_0:[92m 2.6749726187099094 [0m(-0.07264353795485201)
     | > avg_loss_gen:[92m 1.8904438138008117 [0m(-0.3068936369635835)
     | > avg_loss_kl:[92m 1.7338227878917347 [0m(-0.07265831015326762)
     | > avg_loss_feat:[92m 4.326395728371359 [0m(-0.04890191338278971)
     | > avg_loss_mel

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.008824788440357554 [0m(-0.00021567994898016168)
     | > avg_loss_disc:[91m 2.77340053428303 [0m(+0.09842791557312047)
     | > avg_loss_disc_real_0:[92m 0.14265569130128086 [0m(-0.04252457584847097)
     | > avg_loss_disc_real_1:[91m 0.263440850783478 [0m(+0.005291762270710543)
     | > avg_loss_disc_real_2:[91m 0.3103587299585341 [0m(+0.08490532555363378)
     | > avg_loss_disc_real_3:[91m 0.2748345419764518 [0m(+0.037802273441444656)
     | > avg_loss_disc_real_4:[91m 0.2747954130172728 [0m(+0.04548929401419363)
     | > avg_loss_disc_real_5:[91m 0.22367938255721873 [0m(+0.024968618560921058)
     | > avg_loss_0:[91m 2.77340053428303 [0m(+0.09842791557312047)
     | > avg_loss_gen:[91m 1.9579716086387635 [0m(+0.0675277948379518)
     | > avg_loss_kl:[91m 1.82333382693204 [0m(+0.08951103904030533)
     | > avg_loss_feat:[92m 4.3213253194635515 [0m(-0.005070408907807966)
     | > avg_loss_mel:[92m

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.00916966091502797 [0m(+0.00034487247467041536)
     | > avg_loss_disc:[92m 2.7019639730453493 [0m(-0.07143656123768061)
     | > avg_loss_disc_real_0:[91m 0.2306851248849522 [0m(+0.08802943358367132)
     | > avg_loss_disc_real_1:[92m 0.2583386319604787 [0m(-0.005102218822999327)
     | > avg_loss_disc_real_2:[92m 0.29787488363005893 [0m(-0.012483846328475157)
     | > avg_loss_disc_real_3:[92m 0.21337938809936696 [0m(-0.061455153877084834)
     | > avg_loss_disc_real_4:[91m 0.2876606244932522 [0m(+0.01286521147597941)
     | > avg_loss_disc_real_5:[91m 0.22770651226693933 [0m(+0.004027129709720606)
     | > avg_loss_0:[92m 2.7019639730453493 [0m(-0.07143656123768061)
     | > avg_loss_gen:[91m 2.121966665441337 [0m(+0.16399505680257365)
     | > avg_loss_kl:[92m 1.7988848263567143 [0m(-0.02444900057532573)
     | > avg_loss_feat:[91m 4.344122086871755 [0m(+0.022796767408203245)
     | > avg_loss_me

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.009117141636935144 [0m(-5.251927809282568e-05)
     | > avg_loss_disc:[92m 2.6425885547291146 [0m(-0.05937541831623472)
     | > avg_loss_disc_real_0:[92m 0.2101972743868828 [0m(-0.020487850498069393)
     | > avg_loss_disc_real_1:[92m 0.20650046833536842 [0m(-0.05183816362511026)
     | > avg_loss_disc_real_2:[92m 0.19960841197859158 [0m(-0.09826647165146735)
     | > avg_loss_disc_real_3:[92m 0.2104550452394919 [0m(-0.0029243428598750565)
     | > avg_loss_disc_real_4:[92m 0.22724082402207635 [0m(-0.060419800471175844)
     | > avg_loss_disc_real_5:[91m 0.2579163879156113 [0m(+0.030209875648671947)
     | > avg_loss_0:[92m 2.6425885547291146 [0m(-0.05937541831623472)
     | > avg_loss_gen:[92m 1.9139619447968224 [0m(-0.2080047206445148)
     | > avg_loss_kl:[92m 1.769799980250272 [0m(-0.029084846106442352)
     | > avg_loss_feat:[91m 4.38481865796176 [0m(+0.04069657109000513)
     | > avg_loss_mel

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.009059906005859375 [0m(-5.7235631075769e-05)
     | > avg_loss_disc:[92m 2.5869272622195156 [0m(-0.05566129250959895)
     | > avg_loss_disc_real_0:[92m 0.13372942168604235 [0m(-0.07646785270084044)
     | > avg_loss_disc_real_1:[91m 0.27124777056954125 [0m(+0.06474730223417283)
     | > avg_loss_disc_real_2:[91m 0.23149757547812028 [0m(+0.031889163499528694)
     | > avg_loss_disc_real_3:[91m 0.22210465425794776 [0m(+0.011649609018455853)
     | > avg_loss_disc_real_4:[91m 0.2650617203929208 [0m(+0.03782089637084446)
     | > avg_loss_disc_real_5:[92m 0.2541473368352109 [0m(-0.0037690510804003763)
     | > avg_loss_0:[92m 2.5869272622195156 [0m(-0.05566129250959895)
     | > avg_loss_gen:[91m 2.037994466044684 [0m(+0.12403252124786168)
     | > avg_loss_kl:[91m 1.7716611060229215 [0m(+0.001861125772649519)
     | > avg_loss_feat:[91m 4.445691980015144 [0m(+0.060873322053383916)
     | > avg_loss_me

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.00893500501459295 [0m(-0.00012490099126642555)
     | > avg_loss_disc:[91m 2.7655869548971004 [0m(+0.17865969267758475)
     | > avg_loss_disc_real_0:[91m 0.25037179941480814 [0m(+0.11664237772876579)
     | > avg_loss_disc_real_1:[92m 0.25680616768923664 [0m(-0.014441602880304605)
     | > avg_loss_disc_real_2:[91m 0.2660060683434659 [0m(+0.03450849286534563)
     | > avg_loss_disc_real_3:[91m 0.22221117046746341 [0m(+0.00010651620951565577)
     | > avg_loss_disc_real_4:[92m 0.23306647417220203 [0m(-0.03199524622071878)
     | > avg_loss_disc_real_5:[92m 0.2108676636760885 [0m(-0.0432796731591224)
     | > avg_loss_0:[91m 2.7655869548971004 [0m(+0.17865969267758475)
     | > avg_loss_gen:[92m 1.9355171290310946 [0m(-0.10247733701358941)
     | > avg_loss_kl:[91m 1.816548844900998 [0m(+0.04488773887807662)
     | > avg_loss_feat:[91m 4.499958101185887 [0m(+0.05426612117074292)
     | > avg_loss_mel

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.008870374072681772 [0m(-6.463094191117708e-05)
     | > avg_loss_disc:[92m 2.684449653192 [0m(-0.08113730170510047)
     | > avg_loss_disc_real_0:[92m 0.21113568517294798 [0m(-0.03923611424186016)
     | > avg_loss_disc_real_1:[92m 0.23464962216940793 [0m(-0.022156545519828708)
     | > avg_loss_disc_real_2:[91m 0.2760057313875719 [0m(+0.009999663044105989)
     | > avg_loss_disc_real_3:[91m 0.25116598958318875 [0m(+0.02895481911572534)
     | > avg_loss_disc_real_4:[91m 0.24095539396459406 [0m(+0.007888919792392035)
     | > avg_loss_disc_real_5:[91m 0.22967196662317624 [0m(+0.018804302947087737)
     | > avg_loss_0:[92m 2.684449653192 [0m(-0.08113730170510047)
     | > avg_loss_gen:[91m 1.9980759328061897 [0m(+0.06255880377509504)
     | > avg_loss_kl:[92m 1.8078401207923889 [0m(-0.008708724108609234)
     | > avg_loss_feat:[92m 4.292635794119405 [0m(-0.20732230706648203)
     | > avg_loss_mel:[9

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.009383335980502042 [0m(+0.0005129619078202696)
     | > avg_loss_disc:[92m 2.6702348253943713 [0m(-0.014214827797628615)
     | > avg_loss_disc_real_0:[92m 0.1788944234902208 [0m(-0.032241261682727174)
     | > avg_loss_disc_real_1:[92m 0.21911580549045043 [0m(-0.015533816678957507)
     | > avg_loss_disc_real_2:[92m 0.2461485189470378 [0m(-0.029857212440534103)
     | > avg_loss_disc_real_3:[91m 0.29513139697638424 [0m(+0.04396540739319549)
     | > avg_loss_disc_real_4:[91m 0.28064728785644877 [0m(+0.03969189389185471)
     | > avg_loss_disc_real_5:[91m 0.29437563487074603 [0m(+0.06470366824756979)
     | > avg_loss_0:[92m 2.6702348253943713 [0m(-0.014214827797628615)
     | > avg_loss_gen:[91m 2.202730663256212 [0m(+0.20465473045002236)
     | > avg_loss_kl:[92m 1.79115948677063 [0m(-0.0166806340217589)
     | > avg_loss_feat:[91m 4.595949855717744 [0m(+0.3033140615983392)
     | > avg_loss_mel:

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.008923537080938166 [0m(-0.0004597988995638756)
     | > avg_loss_disc:[92m 2.5812769998203624 [0m(-0.08895782557400889)
     | > avg_loss_disc_real_0:[91m 0.24783498834479925 [0m(+0.06894056485457845)
     | > avg_loss_disc_real_1:[91m 0.2824352768334475 [0m(+0.06331947134299706)
     | > avg_loss_disc_real_2:[92m 0.2282577186822891 [0m(-0.01789080026474868)
     | > avg_loss_disc_real_3:[92m 0.23443653664805672 [0m(-0.06069486032832752)
     | > avg_loss_disc_real_4:[92m 0.26541069664738404 [0m(-0.015236591209064732)
     | > avg_loss_disc_real_5:[92m 0.24298986833203923 [0m(-0.0513857665387068)
     | > avg_loss_0:[92m 2.5812769998203624 [0m(-0.08895782557400889)
     | > avg_loss_gen:[91m 2.291307395154778 [0m(+0.08857673189856596)
     | > avg_loss_kl:[91m 1.813691666993228 [0m(+0.022532180222597953)
     | > avg_loss_feat:[92m 4.573971143635835 [0m(-0.021978712081908824)
     | > avg_loss_mel:

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.008812874013727367 [0m(-0.00011066306721079985)
     | > avg_loss_disc:[91m 2.731245166605169 [0m(+0.14996816678480673)
     | > avg_loss_disc_real_0:[92m 0.1352551654319872 [0m(-0.11257982291281204)
     | > avg_loss_disc_real_1:[92m 0.23036986589431763 [0m(-0.052065410939129864)
     | > avg_loss_disc_real_2:[91m 0.28374610055576677 [0m(+0.055488381873477655)
     | > avg_loss_disc_real_3:[91m 0.2922613829374313 [0m(+0.05782484628937459)
     | > avg_loss_disc_real_4:[91m 0.3153804459355094 [0m(+0.049969749288125365)
     | > avg_loss_disc_real_5:[91m 0.2999427428299731 [0m(+0.05695287449793385)
     | > avg_loss_0:[91m 2.731245166605169 [0m(+0.14996816678480673)
     | > avg_loss_gen:[92m 2.285644846612757 [0m(-0.005662548542021195)
     | > avg_loss_kl:[91m 1.8184578288685191 [0m(+0.004766161875291219)
     | > avg_loss_feat:[91m 4.850425720214842 [0m(+0.27645457657900696)
     | > avg_loss_mel:

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.009019977396184748 [0m(+0.00020710338245738188)
     | > avg_loss_disc:[92m 2.685571752895008 [0m(-0.04567341371016109)
     | > avg_loss_disc_real_0:[91m 0.1774330477822911 [0m(+0.0421778823503039)
     | > avg_loss_disc_real_1:[91m 0.24471767070618541 [0m(+0.014347804811867787)
     | > avg_loss_disc_real_2:[92m 0.2502061399546539 [0m(-0.03353996060111286)
     | > avg_loss_disc_real_3:[92m 0.22973055866631595 [0m(-0.06253082427111537)
     | > avg_loss_disc_real_4:[92m 0.22518757310780613 [0m(-0.09019287282770327)
     | > avg_loss_disc_real_5:[92m 0.2331109189174392 [0m(-0.06683182391253387)
     | > avg_loss_0:[92m 2.685571752895008 [0m(-0.04567341371016109)
     | > avg_loss_gen:[92m 1.9123474890535528 [0m(-0.373297357559204)
     | > avg_loss_kl:[92m 1.7579329924149947 [0m(-0.060524836453524467)
     | > avg_loss_feat:[92m 4.33151340267875 [0m(-0.5189123175360919)
     | > avg_loss_mel:[91m 

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.008904695510864261 [0m(-0.00011528188532048712)
     | > avg_loss_disc:[92m 2.655747732249173 [0m(-0.029824020645834892)
     | > avg_loss_disc_real_0:[92m 0.11497072591023012 [0m(-0.06246232187206098)
     | > avg_loss_disc_real_1:[91m 0.29756539463996906 [0m(+0.05284772393378365)
     | > avg_loss_disc_real_2:[91m 0.28724732724103075 [0m(+0.03704118728637684)
     | > avg_loss_disc_real_3:[91m 0.2346158214590766 [0m(+0.004885262792760664)
     | > avg_loss_disc_real_4:[91m 0.24247555868192153 [0m(+0.0172879855741154)
     | > avg_loss_disc_real_5:[92m 0.2139318500052799 [0m(-0.019179068912159314)
     | > avg_loss_0:[92m 2.655747732249173 [0m(-0.029824020645834892)
     | > avg_loss_gen:[91m 1.997154617309571 [0m(+0.0848071282560181)
     | > avg_loss_kl:[91m 1.8182215332984923 [0m(+0.060288540883497665)
     | > avg_loss_feat:[91m 4.522523496367715 [0m(+0.1910100936889645)
     | > avg_loss_mel:

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.009152882749384098 [0m(+0.00024818723851983623)
     | > avg_loss_disc:[92m 2.554439969496293 [0m(-0.10130776275288023)
     | > avg_loss_disc_real_0:[91m 0.13072747716849498 [0m(+0.015756751258264853)
     | > avg_loss_disc_real_1:[92m 0.22366285053166476 [0m(-0.0739025441083043)
     | > avg_loss_disc_real_2:[92m 0.25848272727294397 [0m(-0.02876459996808678)
     | > avg_loss_disc_real_3:[92m 0.23065950667316262 [0m(-0.003956314785913989)
     | > avg_loss_disc_real_4:[91m 0.2770507858558136 [0m(+0.03457522717389208)
     | > avg_loss_disc_real_5:[91m 0.2414113399657336 [0m(+0.027479489960453712)
     | > avg_loss_0:[92m 2.554439969496293 [0m(-0.10130776275288023)
     | > avg_loss_gen:[91m 2.13484359221025 [0m(+0.13768897490067933)
     | > avg_loss_kl:[91m 1.839314650405537 [0m(+0.021093117107044757)
     | > avg_loss_feat:[91m 4.856792389262806 [0m(+0.3342688928950919)
     | > avg_loss_mel:[9

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.009078784422440962 [0m(-7.409832694313563e-05)
     | > avg_loss_disc:[91m 2.6593308058652 [0m(+0.10489083636890717)
     | > avg_loss_disc_real_0:[91m 0.1970122777602889 [0m(+0.06628480059179392)
     | > avg_loss_disc_real_1:[91m 0.27639342424544416 [0m(+0.0527305737137794)
     | > avg_loss_disc_real_2:[91m 0.26927172216502104 [0m(+0.010788994892077075)
     | > avg_loss_disc_real_3:[91m 0.2553422804583203 [0m(+0.02468277378515768)
     | > avg_loss_disc_real_4:[92m 0.25493231361562574 [0m(-0.02211847224018787)
     | > avg_loss_disc_real_5:[91m 0.2529537733305583 [0m(+0.011542433364824689)
     | > avg_loss_0:[91m 2.6593308058652 [0m(+0.10489083636890717)
     | > avg_loss_gen:[91m 2.161815135045483 [0m(+0.026971542835232842)
     | > avg_loss_kl:[91m 1.8596554084257646 [0m(+0.0203407580202275)
     | > avg_loss_feat:[92m 4.526265018636529 [0m(-0.33052737062627724)
     | > avg_loss_mel:[91m 19

: 

In [1]:
from huggingface_hub import HfApi, upload_folder

api_token = "your_token"
repo_id = "sifat1221/vits_bn_tts_checkpoint_25000"
local_dir = '/teamspace/studios/this_studio/model/male_vits_23_dec_2024-December-23-2024_09+14AM-0000000'

api = HfApi(token=api_token)
api.create_repo(repo_id=repo_id, repo_type="model", private=False, exist_ok=True)

upload_folder(
    folder_path=local_dir,
    path_in_repo="",
    repo_id=repo_id,
    token=api_token,
    repo_type="model",
)
print(f"Files from '{local_dir}' uploaded to the repository '{repo_id}' successfully.")

best_model_19443.pth:   0%|          | 0.00/998M [00:00<?, ?B/s]

checkpoint_25000.pth:   0%|          | 0.00/998M [00:00<?, ?B/s]

checkpoint_20000.pth:   0%|          | 0.00/998M [00:00<?, ?B/s]

Upload 13 LFS files:   0%|          | 0/13 [00:00<?, ?it/s]

best_model_19443.pth:   0%|          | 0.00/998M [00:00<?, ?B/s]

events.out.tfevents.1734945294.ip-10-192-12-121.2189.1:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

events.out.tfevents.1734975209.ip-10-192-12-225.1939.0:   0%|          | 0.00/31.7M [00:00<?, ?B/s]

events.out.tfevents.1734981356.ip-10-192-12-225.92014.0:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

events.out.tfevents.1734981663.ip-10-192-12-225.92014.1:   0%|          | 0.00/9.24k [00:00<?, ?B/s]

events.out.tfevents.1734982061.ip-10-192-12-170.2377.0:   0%|          | 0.00/20.7M [00:00<?, ?B/s]

events.out.tfevents.1734986943.ip-10-192-12-170.29923.0:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

events.out.tfevents.1735012004.ip-10-192-12-114.1960.0:   0%|          | 0.00/14.7M [00:00<?, ?B/s]

events.out.tfevents.1735020584.ip-10-192-12-126.2401.0:   0%|          | 0.00/9.24k [00:00<?, ?B/s]

events.out.tfevents.1735020769.ip-10-192-12-115.2921.0:   0%|          | 0.00/25.8M [00:00<?, ?B/s]

Files from '/teamspace/studios/this_studio/model/male_vits_23_dec_2024-December-23-2024_09+14AM-0000000' uploaded to the repository 'sifat1221/vits_bn_tts_checkpoint_25000' successfully.
