In [None]:
%%capture
!pip install TTS
!sudo apt-get install espeak-ng -y
!pip install gdown
!sudo apt-get install unzip
!gdown "https://drive.google.com/uc?id=your_id"
!unzip /teamspace/studios/this_studio/data.zip -d /teamspace/studios/this_studio/data

In [None]:
from huggingface_hub import snapshot_download

# Hub repository to fetch, and the local run folder it is downloaded into.
# The same folder is used later as the path training continues from.
dataset_repo = "sifat1221/vits_bn_tts_checkpoint_125000"
local_data_dir = "/teamspace/studios/this_studio/model/male_vits_23_dec_2024-December-23-2024_09+14AM-0000000"

# The value returned by snapshot_download is kept in file_path.
file_path = snapshot_download(
    local_dir=local_data_dir,
    repo_id=dataset_repo,
)

In [None]:
import os

# The `wav` folder inside the unzipped dataset.
folder_path = "/teamspace/studios/this_studio/data/data/wav"

# Sanity check: count regular files only (sub-directories are ignored).
file_count = sum(
    1
    for entry_name in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry_name))
)
print(f"Number of files in '{folder_path}': {file_count}")

In [2]:
import numpy as np
import pandas as pd
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.utils.audio import AudioProcessor
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.models.vits import Vits, VitsAudioConfig, CharactersConfig

In [None]:
# Choose dataset/model locations for whichever hosted platform runs the notebook.
# Every branch defines the same two names:
#   data_root  - folder holding metadata.csv and the `wav` directory read later
#   model_root - folder later used as the training output path
if "COLAB_GPU" in os.environ:
    # Colab: data and models live on the mounted Google Drive.
    from google.colab import drive
    print("Running in Colab")
    drive.mount('/content/drive')
    data_root = '/content/drive/MyDrive/IIT/4-2/ML/ML-Project_Team_minions/data'
    model_root = '/content/drive/MyDrive/IIT/4-2/ML/ML-Project_Team_minions/Models'

elif os.path.exists("/kaggle"):
    print("Running in Kaggle")
    data_root = '/kaggle/input/bangla-audio/data'
    model_root = '/kaggle/working/'

else:
    # Fallback when neither marker is present: assume the Lightning studio layout.
    print("Running in Lightning")
    data_root = '/teamspace/studios/this_studio/data/data'
    model_root = '/teamspace/studios/this_studio/model'


In [4]:
# Run switches.
#   male       - True: use the metadata.csv directly under data_root;
#                False: use the files under data_root/female/mono.
#   pretrained - True: continue from the checkpoint folder below;
#                False: pretrained_path stays empty.
male = True
pretrained = True

pretrained_path = (
    '/teamspace/studios/this_studio/model/male_vits_23_dec_2024-December-23-2024_09+14AM-0000000'
    if pretrained
    else ""
)

# root_path is the dataset folder, meta_file the transcript list inside it.
if male:
    root_path = f'{data_root}'
    meta_file = f'{data_root}/metadata.csv'
else:
    root_path = f'{data_root}/female/mono'
    meta_file = f'{data_root}/female/mono/metadata_female.txt'

In [5]:
def formatter(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
    """Parse a pipe-separated metadata file into a list of sample dicts.

    A usable line has at least three ``|``-separated columns. Column 0 is the
    clip id, resolved to ``<root_path>/wav/<id>.wav``; column 2 is the
    transcript. Column 1 is not read.

    Blank lines and lines lacking an id or a transcript are skipped and counted;
    one summary line is printed if any were skipped. Nothing is printed per row.

    Args:
        root_path: Dataset folder that contains the ``wav`` sub-directory.
        meta_file: Path of the metadata file; opened as UTF-8 text.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        A list of dicts with the keys ``text``, ``audio_file``,
        ``speaker_name`` (always ``"ljspeech"``) and ``root_path``.
    """
    speaker_name = "ljspeech"
    items = []
    skipped = 0
    with open(meta_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            # Strip the line terminator first so it cannot end up inside the
            # transcript (or inside the file name on short lines).
            cols = line.rstrip("\r\n").split("|")
            if len(cols) < 3 or not cols[0].strip() or not cols[2].strip():
                # Without both an id and a transcript there is no usable sample.
                skipped += 1
                continue
            items.append(
                {
                    "text": cols[2],
                    "audio_file": os.path.join(root_path, "wav", cols[0] + ".wav"),
                    "speaker_name": speaker_name,
                    "root_path": root_path,
                }
            )
    if skipped:
        print(f"formatter: skipped {skipped} line(s) without an id or transcript in {meta_file}")
    return items

In [None]:
# Dataset description: the metadata file plus the dataset folder.
# os.path.join(root_path, "") yields root_path with a trailing separator.
dataset_config = BaseDatasetConfig(
    path=os.path.join(root_path, ""),
    meta_file_train=meta_file,
)

# The custom `formatter` defined earlier parses the metadata; an evaluation
# split is requested with eval_split_size=0.2.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_size=0.2,
    formatter=formatter,
)

# Sizes of the two splits: train first, then eval.
print(f"{len(train_samples)} {len(eval_samples)}")

In [None]:
# output_path is handed to both VitsConfig and Trainer; the phoneme cache sits
# in a sub-folder of it.
output_path = model_root
os.makedirs(output_path, exist_ok=True)
phoneme_cache_path = os.path.join(output_path, "phoneme_cache")

# Audio settings used by the model configuration below.
audio_config = VitsAudioConfig(
    sample_rate=22050,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
)

config = VitsConfig(
    # --- run identity, data and output ---
    run_name="male_vits_23_dec_2024",
    output_path=output_path,
    datasets=[dataset_config],
    audio=audio_config,
    # --- schedule, batching and hardware ---
    epochs=1000,
    batch_size=48,
    eval_batch_size=32,
    batch_group_size=0,
    num_loader_workers=8,
    num_eval_loader_workers=8,
    use_language_weighted_sampler=True,
    mixed_precision=False,
    cudnn_benchmark=True,
    # --- checkpointing, logging and evaluation ---
    save_step=5000,
    print_step=500,
    print_eval=False,
    run_eval=True,
    test_delay_epochs=-1,
    # --- text front-end ---
    # Other cleaner names the author noted: multilingual_cleaners, collapse_whitespace.
    text_cleaner='phoneme_cleaners',
    use_phonemes=True,
    phonemizer="espeak",
    phoneme_language="bn",
    phoneme_cache_path=phoneme_cache_path,
    compute_input_seq_cache=True,
    add_blank=True,
    # --- Bangla sentences supplied as test_sentences ---
    test_sentences=[
        'আমার   সোনার বাংলা, আমি তোমায় ভালোবাসি।',
        'চিরদিন   তোমার আকাশ, তোমার বাতাস, আমার প্রাণে বাজায় বাঁশি',
        'ও মা,   ফাগুনে তোর আমের বনে ঘ্রাণে পাগল করে,মরি হায়, হায় রে।',
    ],
)

# init_from_config returns the tokenizer together with a config object, which
# replaces `config` from here on.
ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)
model = Vits(config, ap, tokenizer, speaker_manager=None)

# continue_path is "" unless `pretrained` was switched on in the settings cell.
trainer = Trainer(
    TrainerArgs(continue_path=pretrained_path),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

In [None]:
%%time
# Start training with the Trainer assembled in the previous cell. The %%time
# cell magic (which must remain the first line) reports how long the cell took.
trainer.fit()

In [None]:
import os

from huggingface_hub import HfApi, upload_folder

# Never paste a real token into the notebook: it would be saved with the file.
# The token is read from the HF_TOKEN environment variable instead. When the
# variable is unset api_token is None and huggingface_hub falls back to its own
# credential lookup (e.g. a token stored by `huggingface-cli login`).
api_token = os.environ.get("HF_TOKEN")
repo_id = "sifat1221/vits_bn_tts_checkpoint_140000"
# Local run folder whose contents are pushed to the Hub repository.
local_dir = '/teamspace/studios/this_studio/model/male_vits_23_dec_2024-December-23-2024_09+14AM-0000000'

api = HfApi(token=api_token)
# exist_ok=True lets the cell be re-run after the repository has been created.
api.create_repo(repo_id=repo_id, repo_type="model", private=False, exist_ok=True)

# path_in_repo="" places the folder contents at the root of the repository.
upload_folder(
    folder_path=local_dir,
    path_in_repo="",
    repo_id=repo_id,
    token=api_token,
    repo_type="model",
)
print(f"Files from '{local_dir}' uploaded to the repository '{repo_id}' successfully.")