In [1]:
import os
import pandas as pd
import shutil
import json
from tqdm import tqdm

# ==== CONFIG ====
# Project root; every other location below is derived from it.
BASE_FOLDER = r"C:\Users\kriti\OneDrive\Pictures\nepali_tts"

# Inputs: preprocessed audio plus the per-split manifests stored under it.
PREPROCESSED_FOLDER = os.path.join(BASE_FOLDER, "preprocessed_audio")
AUDIO_DIR = os.path.join(PREPROCESSED_FOLDER, "wavs")
SPLITS_DIR = os.path.join(PREPROCESSED_FOLDER, "metadata", "splits")

# Output: full path of the folder that receives the LJSpeech-style dataset.
OUTPUT_BASE_DIR = os.path.join(BASE_FOLDER, "dataset")
# =================

def create_ljspeech_split(split_name, manifest_path, audio_dir, output_base_dir):
    """Build one LJSpeech-style split directory from a CSV manifest.

    For every manifest row, ``<audio_dir>/<utt_id>.wav`` is copied into
    ``<output_base_dir>/ljspeech_<split_name>/wavs`` and a pipe-delimited
    ``metadata.csv`` (``utt_id|text|text``, no header, no file extension)
    is written next to that ``wavs`` folder. Progress and a summary are
    printed to stdout.

    Args:
        split_name: Label of the split; used in the output folder name and
            in the printed messages.
        manifest_path: CSV file read with ``pandas.read_csv``. The columns
            ``utt_id``, ``text`` and ``duration`` are accessed.
        audio_dir: Folder holding the source ``<utt_id>.wav`` files.
        output_base_dir: Folder under which the split folder is created.

    Returns:
        A dict with the keys ``samples``, ``copied_files``, ``skipped_files``,
        ``duration_hours``, ``duration_minutes`` and ``avg_duration_seconds``,
        or ``None`` when ``manifest_path`` does not exist.
    """
    max_missing_shown = 5  # only the first few missing files are listed

    print(f"\n{'='*60}")
    print(f"Processing {split_name.upper()} split")
    print(f"{'='*60}")

    # Paths
    output_dir = os.path.join(output_base_dir, f"ljspeech_{split_name}")
    wavs_dir = os.path.join(output_dir, "wavs")

    # Load manifest
    print(f"📖 Loading: {manifest_path}")
    if not os.path.exists(manifest_path):
        print("❌ Manifest not found!")
        return None

    df = pd.read_csv(manifest_path)
    total_entries = len(df)
    print(f"✅ Loaded {total_entries} entries")

    # Create directories
    os.makedirs(wavs_dir, exist_ok=True)

    metadata_lines = []
    copied_files = 0
    missing_files = 0   # source wav absent
    failed_copies = 0   # source wav present but the row could not be processed
    total_duration = 0

    print("📋 Processing audio files...")

    for _, row in tqdm(df.iterrows(), total=total_entries, desc=f"{split_name}"):
        utt_id = row['utt_id']
        text = row['text']
        duration = row['duration']

        # Source path (from preprocessed wavs) and its destination
        src_wav = os.path.join(audio_dir, f"{utt_id}.wav")
        dst_wav = os.path.join(wavs_dir, f"{utt_id}.wav")

        if not os.path.exists(src_wav):
            if missing_files < max_missing_shown:
                print(f"\n⚠️ File not found: {src_wav}")
            missing_files += 1
            continue

        try:
            shutil.copy2(src_wav, dst_wav)
            copied_files += 1
            total_duration += duration

            # LJSpeech format: filename|text|text_normalized
            # Note: No file extension in filename column
            # NOTE(review): a '|' or line break inside `text` would corrupt
            # this row; the text is written as-is, without sanitizing.
            metadata_lines.append(f"{utt_id}|{text}|{text}")
        except Exception as e:
            # Best effort: one bad row must not abort the whole split.
            print(f"\n❌ Error copying {utt_id}: {e}")
            failed_copies += 1

    # Report only the missing files that were not already listed above.
    if missing_files > max_missing_shown:
        print(f"\n⚠️ ... and {missing_files - max_missing_shown} more missing files")

    skipped_files = missing_files + failed_copies

    # Save metadata
    metadata_path = os.path.join(output_dir, "metadata.csv")
    print(f"\n💾 Saving metadata to: {metadata_path}")
    with open(metadata_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(metadata_lines))

    # Calculate statistics
    split_stats = {
        "samples": total_entries,
        "copied_files": copied_files,
        "skipped_files": skipped_files,
        "duration_hours": total_duration / 3600,
        "duration_minutes": total_duration / 60,
        "avg_duration_seconds": total_duration / copied_files if copied_files > 0 else 0
    }

    # An empty manifest would otherwise divide by zero here.
    success_rate = copied_files / total_entries * 100 if total_entries else 0.0

    # Print summary
    print(f"\n📊 {split_name.upper()} SUMMARY:")
    print(f"   • Entries in manifest: {total_entries}")
    print(f"   • Files copied: {copied_files}")
    print(f"   • Files skipped: {skipped_files}")
    print(f"   • Success rate: {success_rate:.1f}%")
    print(f"   • Total duration: {total_duration/60:.2f} minutes")
    print(f"   • Average duration: {split_stats['avg_duration_seconds']:.2f} seconds")

    return split_stats

def _write_dataset_readme(readme_path, splits, total_stats):
    """Write the plain-text README describing layout, format and totals.

    Only the splits that have an entry in ``total_stats['splits']`` are listed.
    """
    processed = [name for name in splits if name in total_stats['splits']]
    rule = "-" * 20 + "\n"

    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write("TTS READY DATASET - LJSPEECH FORMAT\n")
        f.write("=" * 60 + "\n\n")

        f.write("DESCRIPTION:\n")
        f.write(rule)
        f.write("This dataset is formatted in LJSpeech style for TTS training.\n\n")

        f.write("STRUCTURE:\n")
        f.write(rule)
        f.write("dataset/\n")
        f.write("├── dataset_statistics.json\n")
        f.write("├── README.txt\n")
        for name in processed:
            copied = total_stats['splits'][name]['copied_files']
            f.write(f"├── ljspeech_{name}/\n")
            f.write(f"│   ├── wavs/           # {copied} files\n")
            f.write("│   └── metadata.csv\n")

        f.write("\nMETADATA FORMAT:\n")
        f.write(rule)
        f.write("filename|text|text_normalized\n\n")

        f.write("STATISTICS:\n")
        f.write(rule)
        f.write(f"Total samples: {total_stats['total_samples']}\n")
        f.write(f"Total duration: {total_stats['total_duration_hours']:.2f} hours\n\n")

        for name in processed:
            stats = total_stats['splits'][name]
            f.write(f"{name.upper()}:\n")
            f.write(f"  Samples: {stats['copied_files']}\n")
            f.write(f"  Duration: {stats['duration_hours']:.2f} hours\n\n")


def main():
    """Convert the train/val/test splits to LJSpeech format and summarize them.

    Verifies that ``SPLITS_DIR`` and ``AUDIO_DIR`` are directories, runs
    ``create_ljspeech_split`` for each ``<split>_metadata.csv`` found in
    ``SPLITS_DIR``, then writes ``dataset_statistics.json`` and ``README.txt``
    into ``OUTPUT_BASE_DIR``. Returns early (after printing the reason) when a
    required directory is missing or when no split could be processed.
    """
    print("🎯 CREATING LJSPEECH FORMAT DATASET")
    print("=" * 60)

    # Verify directories exist (isdir: a plain file here would break listdir)
    print("\n🔍 Checking directories...")
    if not os.path.isdir(SPLITS_DIR):
        print(f"❌ Splits directory not found: {SPLITS_DIR}")
        print("Please run the data splitting script first!")
        return

    if not os.path.isdir(AUDIO_DIR):
        print(f"❌ Audio directory not found: {AUDIO_DIR}")
        print("Please run the preprocessing script first!")
        return

    audio_files = [f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')]
    print(f"✅ Found {len(audio_files)} audio files in: {AUDIO_DIR}")

    # Create output directory
    os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)
    print(f"✅ Output directory: {OUTPUT_BASE_DIR}")

    # Statistics accumulated over all processed splits
    total_stats = {
        "total_samples": 0,
        "total_duration_hours": 0,
        "splits": {}
    }

    # Process each split
    splits = ["train", "val", "test"]

    for split_name in splits:
        manifest_path = os.path.join(SPLITS_DIR, f"{split_name}_metadata.csv")

        if not os.path.exists(manifest_path):
            print(f"\n⚠️ Skipping {split_name}: manifest not found")
            continue

        split_stats = create_ljspeech_split(
            split_name,
            manifest_path,
            AUDIO_DIR,
            OUTPUT_BASE_DIR
        )

        if split_stats is not None:
            total_stats["splits"][split_name] = split_stats
            total_stats["total_samples"] += split_stats["copied_files"]
            total_stats["total_duration_hours"] += split_stats["duration_hours"]

    # Without any processed split there is no dataset to describe; do not
    # write summary files or claim the dataset is ready.
    if not total_stats["splits"]:
        print("\n❌ No split manifests could be processed; dataset was not created.")
        return

    # Save statistics
    print(f"\n{'='*60}")
    print("💾 SAVING DATASET STATISTICS")
    print(f"{'='*60}")

    stats_path = os.path.join(OUTPUT_BASE_DIR, "dataset_statistics.json")
    with open(stats_path, 'w', encoding='utf-8') as f:
        json.dump(total_stats, f, indent=2, ensure_ascii=False)
    print(f"✅ Statistics saved: {stats_path}")

    # Create README
    readme_path = os.path.join(OUTPUT_BASE_DIR, "README.txt")
    _write_dataset_readme(readme_path, splits, total_stats)
    print(f"✅ README created: {readme_path}")

    # Final summary
    print(f"\n{'='*60}")
    print("🎉 LJSPEECH DATASET CREATION COMPLETE!")
    print(f"{'='*60}")
    print("\n📊 SUMMARY:")
    print(f"   • Total samples: {total_stats['total_samples']}")
    print(f"   • Total duration: {total_stats['total_duration_hours']:.2f} hours")

    for split_name in splits:
        if split_name in total_stats['splits']:
            stats = total_stats['splits'][split_name]
            print(f"   • {split_name}: {stats['copied_files']} files, {stats['duration_hours']:.2f} hours")

    print(f"\n📁 Dataset location: {os.path.abspath(OUTPUT_BASE_DIR)}")
    print("\n✅ Ready for TTS training!")

# Run the conversion only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

🎯 CREATING LJSPEECH FORMAT DATASET

🔍 Checking directories...
✅ Found 8376 audio files in: C:\Users\kriti\OneDrive\Pictures\nepali_tts\preprocessed_audio\wavs
✅ Output directory: C:\Users\kriti\OneDrive\Pictures\nepali_tts\dataset

Processing TRAIN split
📖 Loading: C:\Users\kriti\OneDrive\Pictures\nepali_tts\preprocessed_audio\metadata\splits\train_metadata.csv
✅ Loaded 6082 entries
📋 Processing audio files...


train: 100%|██████████| 6082/6082 [01:26<00:00, 70.33it/s]



💾 Saving metadata to: C:\Users\kriti\OneDrive\Pictures\nepali_tts\dataset\ljspeech_train\metadata.csv

📊 TRAIN SUMMARY:
   • Entries in manifest: 6082
   • Files copied: 6082
   • Files skipped: 0
   • Success rate: 100.0%
   • Total duration: 441.98 minutes
   • Average duration: 4.36 seconds

Processing VAL split
📖 Loading: C:\Users\kriti\OneDrive\Pictures\nepali_tts\preprocessed_audio\metadata\splits\val_metadata.csv
✅ Loaded 1065 entries
📋 Processing audio files...


val: 100%|██████████| 1065/1065 [00:14<00:00, 72.73it/s]



💾 Saving metadata to: C:\Users\kriti\OneDrive\Pictures\nepali_tts\dataset\ljspeech_val\metadata.csv

📊 VAL SUMMARY:
   • Entries in manifest: 1065
   • Files copied: 1065
   • Files skipped: 0
   • Success rate: 100.0%
   • Total duration: 64.00 minutes
   • Average duration: 3.61 seconds

Processing TEST split
📖 Loading: C:\Users\kriti\OneDrive\Pictures\nepali_tts\preprocessed_audio\metadata\splits\test_metadata.csv
✅ Loaded 1229 entries
📋 Processing audio files...


test: 100%|██████████| 1229/1229 [00:19<00:00, 62.58it/s]


💾 Saving metadata to: C:\Users\kriti\OneDrive\Pictures\nepali_tts\dataset\ljspeech_test\metadata.csv

📊 TEST SUMMARY:
   • Entries in manifest: 1229
   • Files copied: 1229
   • Files skipped: 0
   • Success rate: 100.0%
   • Total duration: 79.81 minutes
   • Average duration: 3.90 seconds

💾 SAVING DATASET STATISTICS
✅ Statistics saved: C:\Users\kriti\OneDrive\Pictures\nepali_tts\dataset\dataset_statistics.json
✅ README created: C:\Users\kriti\OneDrive\Pictures\nepali_tts\dataset\README.txt

🎉 LJSPEECH DATASET CREATION COMPLETE!

📊 SUMMARY:
   • Total samples: 8376
   • Total duration: 9.76 hours
   • train: 6082 files, 7.37 hours
   • val: 1065 files, 1.07 hours
   • test: 1229 files, 1.33 hours

📁 Dataset location: C:\Users\kriti\OneDrive\Pictures\nepali_tts\dataset

✅ Ready for TTS training!



