In [None]:
import pandas as pd
import os
import json

def _build_records(df, clips_dir):
    """Convert the rows of *df* into ``audio_path`` / ``transcript`` dicts."""
    # Iterate the two columns directly: indexing with df.iloc[i][...] builds
    # a whole row Series per access and is very slow on large tables.
    return [
        {"audio_path": os.path.join(clips_dir, path), "transcript": sentence}
        for path, sentence in zip(df["path"].tolist(), df["sentence"].tolist())
    ]


def _write_json(file_path, records):
    """Write *records* to *file_path* as indented UTF-8 JSON."""
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)


def create_validated_json_datasets(folder_path=".", *, train_fraction=0.9,
                                   output_dir="."):
    """Split ``validated.tsv`` into train/validation JSON manifests.

    Reads ``<folder_path>/validated.tsv`` (tab-separated, must contain the
    ``path`` and ``sentence`` columns), takes the first ``train_fraction`` of
    the rows (in file order, no shuffling) as the training set and the rest
    as the validation set, and writes them to ``train.json`` and ``val.json``
    in ``output_dir``. Each record has the form
    ``{"audio_path": <folder_path>/clips/<path>, "transcript": <sentence>}``.

    Args:
        folder_path: Directory that contains ``validated.tsv``; audio paths
            are built relative to its ``clips`` subdirectory.
        train_fraction: Share of rows that goes to the training set. The
            default of 0.9 leaves 10% for validation.
        output_dir: Directory that receives ``train.json`` and ``val.json``
            (created if missing). Defaults to the current working directory.

    Returns:
        A ``(train_data, val_data)`` tuple of record lists on success, or
        ``None`` when ``validated.tsv`` is missing or processing fails (a
        message is printed in both failure cases).

    Raises:
        ValueError: If ``train_fraction`` is outside the ``[0, 1]`` range.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    validated_path = os.path.join(folder_path, "validated.tsv")

    if not os.path.exists(validated_path):
        print(f"Файл validated.tsv не найден в папке '{folder_path}'")
        return None

    # This function is a best-effort, notebook-level entry point: any failure
    # is reported to the user and signalled with None instead of propagating.
    try:
        df = pd.read_csv(validated_path, sep='\t')

        # Split the data: the leading rows train, the tail validates.
        train_size = int(len(df) * train_fraction)
        clips_dir = os.path.join(folder_path, 'clips')

        train_data = _build_records(df.iloc[:train_size], clips_dir)
        val_data = _build_records(df.iloc[train_size:], clips_dir)

        os.makedirs(output_dir, exist_ok=True)
        _write_json(os.path.join(output_dir, 'train.json'), train_data)
        _write_json(os.path.join(output_dir, 'val.json'), val_data)

        print(f"Записей в тренировочных данных: {len(train_data):,}")
        print(f"Записей в валидационных данных: {len(val_data):,}")

        # Show up to two sample records from each split as a sanity check.
        print("\nПримеры из train.json:")
        for record in train_data[:2]:
            print(f"   {json.dumps(record, ensure_ascii=False)}")

        print("\nПримеры из val.json:")
        for record in val_data[:2]:
            print(f"   {json.dumps(record, ensure_ascii=False)}")

        return train_data, val_data

    except Exception as e:
        print(f" Ошибка при создании датасетов: {e}")
        return None

# Build train.json / val.json from the dataset stored in the "hsb" folder.
create_validated_json_datasets(
    folder_path="hsb",
)