In [1]:
# ===== VITS TRAINING NOTEBOOK: IMPORTS AND PROJECT PATHS =====

import json
import logging
import os
import sys
import time
from pathlib import Path

import torch

# Root folder of the project; the dataset and the training output live below it.
# Set the NEPALI_TTS_BASE environment variable to run on another machine. The
# fallback is the original location, so existing setups behave exactly as before.
BASE = os.environ.get("NEPALI_TTS_BASE") or r"C:\Users\ReticleX\Pictures\nepali_tts"

# Directory handed to the trainer as its output path.
OUTPUT = os.path.join(BASE, "vits_output_v2")

print("=" * 70)
print("VITS TTS Dependencies installed for TRAINING - COMPLETE WORKING VERSION")
print("=" * 70)

VITS TTS Dependencies installed for TRAINING - COMPLETE WORKING VERSION


In [9]:
import torchaudio

# Diagnostic cell: report the installed torchaudio build and whether the
# module still exposes a `set_audio_backend` attribute.
_has_backend_switch = hasattr(torchaudio, "set_audio_backend")
print("torchaudio version:", torchaudio.__version__)
print("has set_audio_backend:", _has_backend_switch)

torchaudio version: 2.9.1+cpu
has set_audio_backend: False


In [2]:
# ===== STEP 2: IMPORTS =====
# Import everything the notebook needs from Coqui TTS and its trainer package.
print("\n📦 Importing modules...")
try:
    from TTS.config.shared_configs import BaseDatasetConfig
    from TTS.tts.configs.vits_config import VitsConfig
    from TTS.tts.models.vits import Vits
    from TTS.tts.utils.text.tokenizer import TTSTokenizer
    from TTS.tts.utils.text.characters import Graphemes
    from TTS.utils.audio import AudioProcessor
    from TTS.tts.datasets import load_tts_samples
    from trainer import Trainer, TrainerArgs
    print("✅ All imports successful")
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("\n💡 Install TTS: pip install TTS")
    # Re-raise instead of sys.exit(1): the cell still stops, but the full
    # traceback (which module failed, and why) stays visible. Run as a plain
    # script, an uncaught ImportError still terminates with a non-zero status.
    raise


üì¶ Importing modules...


  from pkg_resources import resource_filename


‚úÖ All imports successful


In [3]:
# ===== STEP 3: CREATE NEPALI CHARACTER SET =====
# Builds the grapheme inventory (Devanagari + Latin + punctuation), wraps it in
# a Graphemes object and creates the character-level tokenizer used by the model.
#
# The Devanagari literals below are real Unicode code points. They had been
# stored as mis-decoded byte sequences (each "character" was a three-character
# string), which can never match Nepali text.
print("\n📝 Creating Nepali character set...")

nepali_vocab = []

# Independent vowels
vowels = ['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ']
nepali_vocab.extend(vowels)

# Consonants
consonants = [
    'क', 'ख', 'ग', 'घ', 'ङ',
    'च', 'छ', 'ज', 'झ', 'ञ',
    'ट', 'ठ', 'ड', 'ढ', 'ण',
    'त', 'थ', 'द', 'ध', 'न',
    'प', 'फ', 'ब', 'भ', 'म',
    'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह'
]
nepali_vocab.extend(consonants)

# Dependent vowel signs (matras); the last entry is the virama (halant)
vowel_signs = ['ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']
nepali_vocab.extend(vowel_signs)

# Diacritics: anusvara, visarga, chandrabindu
diacritics = ['ं', 'ः', 'ँ']
nepali_vocab.extend(diacritics)

# Nepali digits
digits = ['०', '१', '२', '३', '४', '५', '६', '७', '८', '९']
nepali_vocab.extend(digits)

# Latin alphabet and numbers
latin = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
nepali_vocab.extend(latin)

# Common punctuation (ASCII punctuation, space and the danda)
common_punct = list(" !\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~।")
nepali_vocab.extend(common_punct)

# Remove duplicates and sort
nepali_vocab = sorted(set(nepali_vocab))

# Every entry must be exactly one code point; a longer entry means the source
# text was mis-encoded again and the tokenizer would silently miss characters.
_multi_char_entries = [ch for ch in nepali_vocab if len(ch) != 1]
assert not _multi_char_entries, f"Vocabulary entries are not single characters: {_multi_char_entries}"

print(f"✅ Character set ready ({len(nepali_vocab)} characters)")

# Create Graphemes object
# NOTE(review): '_', '~', '^' and '#' are part of nepali_vocab AND are used here
# as pad/eos/bos/blank, and every punctuation mark passed below is also in
# nepali_vocab. Confirm how Graphemes resolves these overlaps before relying on
# literal '#', '_', '~' or '^' appearing in transcripts.
chars_obj = Graphemes(
    characters=nepali_vocab,
    punctuations="।!?,.:; -\"",
    pad="_",
    eos="~",
    bos="^",
    blank="#",
)

# Create tokenizer (graphemes only, with blank tokens interspersed)
tokenizer = TTSTokenizer(
    use_phonemes=False,
    characters=chars_obj,
    add_blank=True,
)

print(f"✅ Tokenizer ready (vocab: {len(tokenizer.characters.characters)})")

# Smoke-test the tokenizer on a short Nepali word
test_text = "नमस्ते"
test_ids = tokenizer.text_to_ids(test_text)
test_decoded = tokenizer.ids_to_text(test_ids)
print(f"   Test: '{test_text}' → {len(test_ids)} tokens → '{test_decoded}'")


üìù Creating Nepali character set...
‚úÖ Character set ready (164 characters)
‚úÖ Tokenizer ready (vocab: 164)
   Test: '‡§®‡§Æ‡§∏‡•ç‡§§‡•á' ‚Üí 13 tokens ‚Üí '#‡§®#‡§Æ#‡§∏#‡•ç#‡§§#‡•á#'


In [4]:
# ===== STEP 4: DATASET CONFIGURATION =====
print("\nüìä Setting up dataset...")

# Folders of the two LJSpeech-style splits below the project root.
_train_split_dir = os.path.join(BASE, "dataset", "ljspeech_train")
_val_split_dir = os.path.join(BASE, "dataset", "ljspeech_val")

# Describes the dataset for the loader: both metadata files are given as full
# paths, while `path` points at the training split only.
# NOTE(review): the validation metadata lives in a different folder than `path`;
# confirm that the formatter resolves validation audio from the right directory.
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    path=_train_split_dir,
    meta_file_train=os.path.join(_train_split_dir, "metadata.csv"),
    meta_file_val=os.path.join(_val_split_dir, "metadata.csv"),
    language="ne",
)


üìä Setting up dataset...


In [5]:
# ===== STEP 5: LOAD DATASET SAMPLES =====
# Reads the metadata described by dataset_config into train/eval sample lists.
print("\n📂 Loading dataset samples...")

try:
    # NOTE(review): dataset_config also sets meta_file_val; confirm whether
    # eval_split_size / eval_split_max_size still have any effect in that case.
    train_samples, eval_samples = load_tts_samples(
        [dataset_config],
        eval_split=True,
        eval_split_max_size=256,
        eval_split_size=0.15,
    )

    print("✅ Data loaded:")
    print(f"   Training samples: {len(train_samples)}")
    print(f"   Validation samples: {len(eval_samples)}")

    if len(train_samples) == 0:
        raise ValueError("No training samples found!")

except Exception as e:
    print(f"❌ Error loading data: {e}")
    print("\n💡 Check:")
    print(f"   1. File exists: {dataset_config.meta_file_train}")
    print(f"   2. Audio files exist in: {dataset_config.path}")
    # LJSpeech metadata has three pipe-separated columns, not two.
    print("   3. Format: id|transcription|normalized_transcription (LJSpeech format)")
    # Re-raise rather than sys.exit(1) so the traceback of the real failure
    # (missing file, malformed row, ...) is shown below the hints.
    raise


üìÇ Loading dataset samples...
 | > Found 6082 files in C:\Users\ReticleX\Pictures\nepali_tts\dataset\ljspeech_train
‚úÖ Data loaded:
   Training samples: 6082
   Validation samples: 1065


In [6]:
# ===== STEP 6: AUDIO CONFIG, MODEL CONFIG AND MODEL =====
# The audio settings are passed as a BaseAudioConfig object (not a plain dict)
# so the same object can drive both the AudioProcessor and the model config.

from TTS.config.shared_configs import BaseAudioConfig
import logging
import sys

print("\n🎵 Creating audio config (FIXED)...")

audio_config = BaseAudioConfig(
    sample_rate=22050,
    hop_length=256,
    win_length=1024,
    fft_size=1024,
    num_mels=80,
    mel_fmin=0.0,
    mel_fmax=8000.0,
)

# Create audio processor from config
ap = AudioProcessor.init_from_config(audio_config)
print(f"✅ Audio processor: {ap.sample_rate} Hz")

# Model/training configuration used by the model and the trainer created below
config = VitsConfig()
config.audio = audio_config
config.output_path = OUTPUT
config.run_name = "nepali_vits"

# Data loading and schedule
config.datasets = [dataset_config]
config.batch_size = 4
config.eval_batch_size = 2
config.num_loader_workers = 0
config.num_eval_loader_workers = 0
config.epochs = 100

# Text handling: raw graphemes; the tokenizer object is passed to the model directly
config.text_cleaner = "basic_cleaners"
config.use_phonemes = False
config.add_blank = True
# NOTE(review): with characters left as None the saved config does not describe
# the vocabulary; confirm a checkpoint can be loaded for inference without it.
config.characters = None
config.num_chars = len(tokenizer.characters.characters)

# NOTE(review): VITS trains a generator and a discriminator; confirm that `lr`
# is the learning-rate field the model actually reads.
config.lr = 2e-4

# Logging / checkpointing cadence
config.print_step = 25
config.save_step = 1000
config.save_n_checkpoints = 5
config.run_eval = True

# Sentences synthesised during evaluation. These are real Devanagari strings;
# they had been stored as mis-decoded bytes that the tokenizer cannot map.
config.test_sentences = ["नमस्ते", "धन्यवाद"]

print("✅ Config created with proper audio config")

# Create model
model = Vits(
    config=config,
    ap=ap,
    tokenizer=tokenizer,
    speaker_manager=None,
)

if torch.cuda.is_available():
    model.cuda()

print(f"✅ Model ready ({sum(p.numel() for p in model.parameters()):,} params)")

# Logging setup: route the "trainer" and "TTS" loggers to the console only.
# Must run before the trainer is created.

import logging
import sys

print("\n🔧 Configuring logging to disable file output...")

# Single console handler shared by both loggers
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(message)s')
console_handler.setFormatter(formatter)

# Apply to relevant loggers
for logger_name in ["trainer", "TTS"]:
    logger = logging.getLogger(logger_name)
    # Detach AND close whatever was attached before. Overwriting
    # `logger.handlers` would drop FileHandlers without closing them, leaving
    # their log files open (and locked on Windows) when this cell is re-run.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    logger.addHandler(console_handler)
    # Do not also forward records to the root logger (avoids duplicate lines)
    logger.propagate = False

print("✅ Logging configured for console output only")

# Modules needed by the clean-up steps that follow in this cell
import gc
import os
import psutil

# Trainer arguments: start from the defaults, then turn accelerate off and
# clear the dashboard logger setting.
trainer_args = TrainerArgs()
for _arg_name, _arg_value in (("use_accelerate", False), ("dashboard_logger", None)):
    setattr(trainer_args, _arg_name, _arg_value)

# Run a garbage-collection pass before the trainer is created; the intent is to
# let unreferenced file objects be finalised.
gc.collect()

def close_log_files(output_path):
    """Close this process's trainer log files located under ``output_path``.

    Walks every logger registered with the ``logging`` module and, for each
    ``logging.FileHandler`` whose file name contains ``trainer_`` and whose
    file lives inside ``output_path``, detaches the handler and closes it.

    Closing goes through the handler rather than ``os.close()`` on raw file
    descriptors: closing a descriptor behind a live file object leaves that
    object broken, and the descriptor number can later be reused by an
    unrelated file.

    Args:
        output_path: Directory whose trainer log files should be released.

    Returns:
        list[str]: Paths of the log files whose handlers were closed
        (empty when nothing matched).
    """
    # Normalise so the comparison tolerates mixed separators / letter case on Windows
    root_prefix = os.path.join(os.path.normcase(os.path.abspath(output_path)), "")
    closed_paths = []
    try:
        loggers = [logging.getLogger()]
        loggers.extend(
            candidate
            for candidate in list(logging.Logger.manager.loggerDict.values())
            if isinstance(candidate, logging.Logger)  # skip PlaceHolder entries
        )
        for current_logger in loggers:
            for handler in list(current_logger.handlers):
                if not isinstance(handler, logging.FileHandler):
                    continue
                log_path = os.path.normcase(handler.baseFilename)
                if 'trainer_' not in os.path.basename(log_path):
                    continue
                if not log_path.startswith(root_prefix):
                    continue
                current_logger.removeHandler(handler)
                handler.close()
                # A handler shared by several loggers is reported only once
                if handler.baseFilename not in closed_paths:
                    closed_paths.append(handler.baseFilename)
                    print(f"✅ Closed file handle: {handler.baseFilename}")
    except Exception as e:
        # Best effort: report the problem instead of silently ignoring it
        print(f"⚠️ Could not close handles: {e}")
    return closed_paths

# Set to True only when the whole output directory (including checkpoints of
# earlier runs) should be wiped before training. The run shown in this notebook
# wrote into its own timestamped sub-folder of OUTPUT, so wiping is not required
# for a fresh start and is therefore off by default.
CLEAN_OUTPUT_DIR = False

if os.path.exists(OUTPUT):
    # Release log files of earlier runs that are still held open
    close_log_files(OUTPUT)

    if CLEAN_OUTPUT_DIR:
        import shutil
        try:
            # No ignore_errors=True here: a failed delete must be reported
            # instead of being followed by a misleading success message.
            shutil.rmtree(OUTPUT)
            print(f"✅ Cleaned output directory: {OUTPUT}")
        except OSError as e:
            print(f"⚠️ Could not clean directory: {e}")

os.makedirs(OUTPUT, exist_ok=True)

# Now create the trainer
trainer = Trainer(
    trainer_args,
    config,
    OUTPUT,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)



üéµ Creating audio config (FIXED)...
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
‚úÖ Audio processor: 22050 Hz
‚úÖ Config created with proper audio config
‚úÖ Model ready (83,068,204 params)

üîß Configuring logging to disable file output...
‚úÖ Logging configured for console output only
 > Train

In [7]:
# ========== CELL 7: VITS Configuration (FIXED) ==========
# Builds a fresh VitsConfig and binds it to `config`.
print("\n⚙️ VITS configuration...")

# Step 1: Create base config
config = VitsConfig(
    output_path=OUTPUT,
    run_name="nepali_vits",
)

# Step 2: Set attributes
config.datasets = [dataset_config]
config.audio = audio_config

# Data loading and schedule
config.batch_size = 4
config.eval_batch_size = 2
config.num_loader_workers = 0
config.num_eval_loader_workers = 0
config.epochs = 100

# Text handling: raw graphemes, vocabulary supplied by the tokenizer object
config.text_cleaner = "basic_cleaners"
config.use_phonemes = False
config.add_blank = True
config.characters = None
config.num_chars = len(tokenizer.characters.characters)

# Optimisation
config.optimizer = "AdamW"
config.optimizer_params = {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01}
config.lr = 2e-4
config.lr_scheduler = "ExponentialLR"
config.lr_scheduler_params = {"gamma": 0.999875}

# Logging / checkpointing cadence
config.print_step = 50
config.plot_step = 0
config.dashboard_logger = None
config.save_step = 1000
config.save_n_checkpoints = 5
config.run_eval = True

# Sentences synthesised during evaluation. These are real Devanagari strings;
# they had been stored as mis-decoded bytes that the tokenizer cannot map.
config.test_sentences = ["नमस्ते", "धन्यवाद"]

print("✅ Config created")

# `config` now names a brand-new object. A trainer (and model) built earlier in
# the session were handed the previous config object, so nothing set in this
# cell reaches them until they are created again.
if "trainer" in globals():
    print("⚠️ A trainer already exists and was built with an earlier config; "
          "re-create the model and trainer for this configuration to take effect.")




‚öôÔ∏è VITS configuration...
‚úÖ Config created


In [8]:
# ===== STEP 8: RUN TRAINING =====
# Starts training and reports the most common failure modes with a hint.
import traceback

try:
    # Start training
    trainer.fit()

    print("\n🎉 TRAINING COMPLETED SUCCESSFULLY!")
    print(f"📁 Checkpoints saved to: {OUTPUT}")

except KeyboardInterrupt:
    print("\n⚠️ Training interrupted by user")
    try:
        # Save through whichever checkpoint API is available
        if hasattr(trainer, "save_checkpoint"):
            trainer.save_checkpoint()
        elif hasattr(trainer, "model") and hasattr(trainer.model, "save_checkpoint"):
            trainer.model.save_checkpoint(OUTPUT)
        print("✅ Checkpoint saved")
    except Exception as e:
        print(f"⚠️ Could not save checkpoint: {e}")

except ImportError as ie:
    if "torchcodec" in str(ie):
        print("\n❌ Missing dependency: torchcodec")
        print("💡 Fix: pip install torchcodec or use librosa as backend")
    else:
        print(f"\n❌ Import error: {ie}")
        traceback.print_exc()

except PermissionError as pe:
    print(f"\n❌ Permission error: {pe}")
    print("💡 Close programs using the output folder, or change output dir")
    # If this error was raised while another exception was being handled (for
    # example during clean-up after a failure), that first exception is the
    # real problem. Python keeps it on __cause__/__context__; surface it
    # instead of showing only the permission message.
    underlying = pe.__cause__ or pe.__context__
    if underlying is not None:
        print(f"\n❌ Underlying error: {type(underlying).__name__}: {underlying}")
    # Prints the full chain, including the exception that came first
    traceback.print_exc()

except Exception as e:
    print(f"\n❌ Training error: {e}")
    traceback.print_exc()



[4m[1m > EPOCH: 0/100[0m
 --> C:\Users\ReticleX\Pictures\nepali_tts\vits_output_v2\nepali_vits-December-17-2025_10+42AM-0000000


> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: False
| > Number of instances : 6082
 | > Preprocessing samples
 | > Max text length: 106
 | > Min text length: 5
 | > Avg text length: 19.53666557053601
 | 
 | > Max audio length: 154372.0
 | > Min audio length: 33097.0
 | > Avg audio length: 96163.91548832621
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.

[1m > TRAINING (2025-12-17 10:42:24) [0m

‚ùå Permission error: [WinError 32] The process cannot access the file because it is being used by another process: 'C:/Users/ReticleX/Pictures/nepali_tts/vits_output_v2/nepali_vits-December-17-2025_10+42AM-0000000\\trainer_0_log.txt'
üí° Close programs using the output folder, or change output dir
