In [None]:
import json
from io import BytesIO
from typing import Any, Dict, List, Tuple, Union

import pandas as pd
from datasets import load_dataset
from PIL import Image
from sklearn.model_selection import train_test_split


# Helpers

def _to_pil_image(image_data: Any) -> Union[Image.Image, None]:
    """
    Coerce a raw image payload into a ``PIL.Image.Image``.

    Accepted payloads are an existing PIL image (returned as-is), raw
    ``bytes``, or a dict whose ``"bytes"`` entry holds raw ``bytes``.
    Anything else, including ``None``, yields ``None``. A failure while
    opening the payload is printed and also yields ``None``.
    """
    if image_data is None:
        return None

    try:
        if isinstance(image_data, Image.Image):
            return image_data

        # Locate the encoded payload, whether it was passed bare or wrapped
        # in a dict under the "bytes" key.
        encoded = None
        if isinstance(image_data, bytes):
            encoded = image_data
        elif isinstance(image_data, dict) and "bytes" in image_data:
            candidate = image_data["bytes"]
            if isinstance(candidate, bytes):
                encoded = candidate

        if encoded is not None:
            return Image.open(BytesIO(encoded))
    except Exception as e:
        print(f"Не удалось открыть изображение: {e}")

    return None
    
def _split_dataset(entries: List[Dict[str, Any]],
                   split_ratio: Tuple[float, float, float]) -> Dict[str, List[Dict[str, Any]]]:
    train_ratio, val_ratio, test_ratio = split_ratio
    train_val_ratio = train_ratio + val_ratio
    
    # Чтобы избежать ошибки при пустом списке
    if not entries:
        return {"train": [], "val": [], "test": []}

    train_val, test = train_test_split(entries, test_size=test_ratio, random_state=42)
    
    # Проверка, что train_val не пуст перед вторым разделением
    if not train_val:
        return {"train": [], "val": [], "test": test}
        
    train, val = train_test_split(train_val, test_size=val_ratio / train_val_ratio, random_state=42)

    return {"train": train, "val": val, "test": test}
    
# Loaders

def load_docmatix_limited(
    max_final_samples: int = 15000,
    max_source_documents: int = 20,
    split_ratio: Tuple[float, float, float] = (0.7, 0.15, 0.15),
    cache_dir: Union[str, None] = None
) -> Dict[str, List[Dict[str, Any]]]:
    """
    Stream a limited subset of Docmatix, keeping ONLY single-image documents.

    Args:
        max_final_samples: Upper bound on the number of question/answer records.
        max_source_documents: Upper bound on the number of accepted documents.
        split_ratio: ``(train, val, test)`` fractions passed to ``_split_dataset``.
        cache_dir: Optional cache directory forwarded to ``load_dataset``.

    Returns:
        ``{"train": [...], "val": [...], "test": [...]}`` where every record is
          {
            "image": PIL.Image.Image,  # a single image, not a list
            "question": str,
            "answer": str,
            "source": "Docmatix"
          }

    NOTE(review): every question/answer pair of a document becomes its own
    record and records are split individually, so the same page image can end
    up in more than one split.
    """
    load_kwargs = {"cache_dir": cache_dir} if cache_dir else {}

    ds = load_dataset(
        "HuggingFaceM4/Docmatix",
        "images",
        split="train",
        streaming=True,
        **load_kwargs
    )

    entries = []
    doc_count = 0  # accepted documents (exactly one readable image)

    for item in ds:
        if doc_count >= max_source_documents or len(entries) >= max_final_samples:
            break

        # "or []" also covers fields that are present but set to None.
        raw_images = item.get("images") or []
        texts = item.get("texts") or []

        if len(raw_images) != 1 or not texts:
            continue

        image = _to_pil_image(raw_images[0])
        if image is None:
            continue

        doc_count += 1

        for qa in texts:
            if len(entries) >= max_final_samples:
                break

            # A None value must not abort the whole load: this loop has no
            # per-record try/except, so treat it like an empty string.
            question = (qa.get("user") or "").strip()
            answer = (qa.get("assistant") or "").strip()
            if not question or not answer:
                continue

            entries.append({
                "image": image,
                "question": question,
                "answer": answer,
                "source": "Docmatix",
            })

    print(f" Загружено {len(entries)} примеров из {doc_count} документов Docmatix с одной картинкой.")
    return _split_dataset(entries, split_ratio)

def load_ruclevr(split_ratio: Tuple[float, float, float] = (0.7, 0.15, 0.15),
                 cache_dir: Union[str, None] = None) -> Dict[str, List[Dict[str, Any]]]:
    """
    Load every split of ``MERA-evaluation/ruCLEVR`` and re-split it.

    Each source item is expected to carry an ``inputs`` mapping (or its JSON
    string) with ``question`` and ``image``, plus an ``outputs`` value that is
    stored as the answer text. Items without a usable image are skipped; an
    item that raises while being parsed is reported and skipped.

    Args:
        split_ratio: ``(train, val, test)`` fractions passed to ``_split_dataset``.
        cache_dir: Optional cache directory forwarded to ``load_dataset``.

    Returns:
        ``{"train": [...], "val": [...], "test": [...]}`` of records with the
        keys ``image``, ``question``, ``answer`` and ``source``.
    """
    load_kwargs = {"cache_dir": cache_dir} if cache_dir else {}
    ds_dict = load_dataset("MERA-evaluation/ruCLEVR", **load_kwargs)
    datasets_list = ds_dict.values() if isinstance(ds_dict, dict) else [ds_dict]

    entries = []
    for ds in datasets_list:
        for item in ds:
            try:
                inputs = item.get("inputs")
                outputs = item.get("outputs", "")

                if isinstance(inputs, str):
                    inputs = json.loads(inputs)
                if not isinstance(inputs, dict):
                    continue

                question = inputs.get("question", "")

                # Hand the raw field to the shared helper: it accepts a
                # {"bytes": ...} dict, raw bytes, or an already decoded PIL
                # image, so none of these shapes is silently dropped.
                image = _to_pil_image(inputs.get("image"))
                if image is None:
                    continue

                entries.append({
                    "image": image,
                    "question": question,
                    "answer": str(outputs),
                    "source": "ruCLEVR",
                })
            except Exception as e:
                print(f"Ошибка при разборе ruCLEVR item: {e}")

    return _split_dataset(entries, split_ratio)

def load_ruvqa(split_ratio: Tuple[float, float, float] = (0.7, 0.15, 0.15),
               cache_dir: Union[str, None] = None) -> Dict[str, List[Dict[str, Any]]]:
    """
    Load every split of ``MERA-evaluation/ruVQA`` and re-split it.

    Each source item is expected to carry an ``inputs`` mapping (or its JSON
    string) with ``question`` and ``image``, plus an ``outputs`` value that is
    stored as the answer text. Items without a usable image are skipped; an
    item that raises while being parsed is reported and skipped.

    Args:
        split_ratio: ``(train, val, test)`` fractions passed to ``_split_dataset``.
        cache_dir: Optional cache directory forwarded to ``load_dataset``.

    Returns:
        ``{"train": [...], "val": [...], "test": [...]}`` of records with the
        keys ``image``, ``question``, ``answer`` and ``source``.
    """
    load_kwargs = {"cache_dir": cache_dir} if cache_dir else {}
    ds_dict = load_dataset("MERA-evaluation/ruVQA", **load_kwargs)
    datasets_list = ds_dict.values() if isinstance(ds_dict, dict) else [ds_dict]

    entries = []
    for ds in datasets_list:
        for item in ds:
            try:
                inputs = item.get("inputs")
                outputs = item.get("outputs", "")

                if isinstance(inputs, str):
                    inputs = json.loads(inputs)
                if not isinstance(inputs, dict):
                    continue

                question = inputs.get("question", "")

                # Hand the raw field to the shared helper: it accepts a
                # {"bytes": ...} dict, raw bytes, or an already decoded PIL
                # image, so none of these shapes is silently dropped.
                image = _to_pil_image(inputs.get("image"))
                if image is None:
                    continue

                entries.append({
                    "image": image,
                    "question": question,
                    "answer": str(outputs),
                    "source": "ruVQA",
                })
            except Exception as e:
                print(f"Ошибка при разборе ruVQA item: {e}")

    return _split_dataset(entries, split_ratio)


def load_mmbench_ru(
    split_ratio: Tuple[float, float, float] = (0.7, 0.15, 0.15),
    cache_dir: Union[str, None] = None
) -> Dict[str, List[Dict[str, Any]]]:
    """
    Load every split of ``deepvk/MMBench-ru`` as free-text QA records.

    The question text is built from ``question``, the optional ``hint`` and the
    four options ``A``..``D`` joined with `` ; ``. The answer is the text of
    the option named by the ``answer`` letter. Options that are absent, None,
    NaN or the literal string "nan" are shown as "—" rather than as
    "None"/"nan". Items with an unknown answer letter, a missing correct
    option or no usable image are skipped; an item that raises while being
    parsed is reported and skipped.

    Args:
        split_ratio: ``(train, val, test)`` fractions passed to ``_split_dataset``.
        cache_dir: Optional cache directory forwarded to ``load_dataset``.

    Returns:
        ``{"train": [...], "val": [...], "test": [...]}`` of records with the
        keys ``image``, ``question``, ``answer`` and ``source``.
    """
    load_kwargs = {"cache_dir": cache_dir} if cache_dir else {}
    ds_dict = load_dataset("deepvk/MMBench-ru", **load_kwargs)
    datasets_list = ds_dict.values() if isinstance(ds_dict, dict) else [ds_dict]

    letters = ["A", "B", "C", "D"]

    entries = []
    for ds in datasets_list:
        for item in ds:
            try:
                question = item.get("question", "").strip()
                hint = item.get("hint", None)
                # Check "missing" first so NA-like values never reach bool().
                if not _mmbench_is_missing(hint) and hint:
                    question = f"{question} {hint}".strip()

                correct_letter = item.get("answer", None)
                if correct_letter not in letters:
                    continue

                options = []
                for letter in letters:
                    option = item.get(letter)
                    # Same placeholder for an absent key and for a None/NaN cell.
                    options.append("—" if _mmbench_is_missing(option) else str(option).strip())
                options_text = " ; ".join(options)
                question = f"{question}\nВарианты ответа: {options_text}"

                correct_text = item.get(correct_letter, None)
                if _mmbench_is_missing(correct_text):
                    continue

                image = _to_pil_image(item.get("image"))
                if image is None:
                    continue

                entries.append({
                    "image": image,
                    "question": question,
                    "answer": str(correct_text).strip(),
                    "source": "MMBench-ru",
                })

            except Exception as e:
                print(f"Ошибка при разборе MMBench-ru item: {e}")

    return _split_dataset(entries, split_ratio)


def _mmbench_is_missing(value: Any) -> bool:
    """Return True for None, NaN-like scalars and the literal string "nan"."""
    if value is None:
        return True
    if isinstance(value, str):
        return value.strip().lower() == "nan"
    return bool(pd.isna(value))


def load_mws_vision_bench(split_ratio: Tuple[float, float, float] = (0.7, 0.15, 0.15),
                          cache_dir: Union[str, None] = None) -> Dict[str, List[Dict[str, Any]]]:
    """
    Load every split of ``MTSAIR/MWS-Vision-Bench`` and re-split it.

    Items without answers, without any non-empty answer text or without a
    usable image are skipped; an item that raises while being parsed is
    reported and skipped.

    The ``answers`` field is rendered as plain text. Previously ``str(answers)``
    stored the Python repr of the list (e.g. ``"['42']"``) as the answer.

    Args:
        split_ratio: ``(train, val, test)`` fractions passed to ``_split_dataset``.
        cache_dir: Optional cache directory forwarded to ``load_dataset``.

    Returns:
        ``{"train": [...], "val": [...], "test": [...]}`` of records with the
        keys ``image``, ``question``, ``answer`` and ``source``.
    """
    load_kwargs = {"cache_dir": cache_dir} if cache_dir else {}
    ds_dict = load_dataset("MTSAIR/MWS-Vision-Bench", **load_kwargs)
    datasets_list = ds_dict.values() if isinstance(ds_dict, dict) else [ds_dict]

    entries = []
    for ds in datasets_list:
        for item in ds:
            try:
                answers = item.get("answers", [])
                if not answers:
                    continue

                answer = _mws_answer_text(answers)
                if not answer:
                    continue

                image = _to_pil_image(item.get("image", None))
                if image is None:
                    continue

                entries.append({
                    "image": image,
                    "question": item.get("question", "").strip(),
                    "answer": answer,
                    "source": "MWS-Vision-Bench",
                })
            except Exception as e:
                print(f"Ошибка при разборе MWS-Vision-Bench item: {e}")

    return _split_dataset(entries, split_ratio)


def _mws_answer_text(answers: Any) -> str:
    """
    Render an ``answers`` value as plain text.

    A string is used as-is (stripped). A list/tuple contributes its non-empty
    items: one item becomes the answer itself, several are joined with `` ; ``.
    NOTE(review): joining several reference answers is a formatting choice —
    confirm it is the desired target text.
    """
    if isinstance(answers, str):
        return answers.strip()
    if isinstance(answers, (list, tuple)):
        parts = [str(part).strip() for part in answers if part is not None]
        return " ; ".join(part for part in parts if part)
    return str(answers).strip()

In [None]:
from sklearn.model_selection import train_test_split

# Load every source dataset with the same 70/15/15 split and print the size of
# each resulting split. The variables defined here are used by later cells.
print("Загрузка ruCLEVR...")
ruclevr_data = load_ruclevr(split_ratio=(0.7, 0.15, 0.15))
print(f"ruCLEVR: train={len(ruclevr_data['train'])}, val={len(ruclevr_data['val'])}, test={len(ruclevr_data['test'])}")

print("Загрузка ruVQA...")
ruvqa_data = load_ruvqa(split_ratio=(0.7, 0.15, 0.15))
print(f"ruVQA: train={len(ruvqa_data['train'])}, val={len(ruvqa_data['val'])}, test={len(ruvqa_data['test'])}")

print("Загрузка MMBench-ru...")
mmbench_data = load_mmbench_ru(split_ratio=(0.7, 0.15, 0.15))
print(f"MMBench-ru: train={len(mmbench_data['train'])}, val={len(mmbench_data['val'])}, test={len(mmbench_data['test'])}")

print("Загрузка MWS-Vision-Bench...")
mws_data = load_mws_vision_bench(split_ratio=(0.7, 0.15, 0.15))
print(f"MWS-Vision-Bench: train={len(mws_data['train'])}, val={len(mws_data['val'])}, test={len(mws_data['test'])}")

# The document cap is raised from its default of 20 to 20000; the sample cap
# (max_final_samples) is left at its default of 15000.
print("Загрузка Docmatix...")
docmatrix_dataset = load_docmatix_limited(max_source_documents=20000, split_ratio=(0.7, 0.15, 0.15))
print(f"Docmatix: train={len(docmatrix_dataset['train'])}, val={len(docmatrix_dataset['val'])}, test={len(docmatrix_dataset['test'])}")

In [None]:
# Evaluate the third Docmatix training record (index 2) for inspection.
docmatrix_dataset['train'][2]

In [None]:
# Evaluate the second MWS-Vision-Bench training record (index 1) for inspection.
mws_data['train'][1]

In [None]:
# Evaluate the first ruCLEVR training record for inspection.
ruclevr_data['train'][0]


In [None]:
# Evaluate the first ruVQA training record for inspection.
ruvqa_data['train'][0]


In [None]:
# Evaluate the second MMBench-ru training record (index 1) for inspection.
mmbench_data['train'][1]

In [None]:
import json
import base64
from pathlib import Path
from PIL import Image
from io import BytesIO
from tqdm import tqdm

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from typing import Dict, List, Any
from datasets import Features, Image as HFImage, Value, Sequence

def serialize_image_for_hf(image: Image.Image) -> Dict[str, bytes]:
    """
    Encode ONE ``PIL.Image.Image`` as PNG for a ``datasets.Image()`` column.

    Args:
        image: The image to encode.

    Returns:
        ``{"bytes": <PNG data>}``; ``{"bytes": b""}`` when ``image`` is not a
        PIL image or cannot be encoded (the error is printed).

    PNG cannot store every PIL mode (CMYK, for example), so a failed save is
    retried once after converting the image to RGB instead of giving up.
    """
    if not isinstance(image, Image.Image):
        return {"bytes": b""}

    try:
        buffered = BytesIO()
        image.save(buffered, format="PNG")
        return {"bytes": buffered.getvalue()}
    except Exception as first_error:
        try:
            # Fresh buffer: never reuse one a failed save may have touched.
            buffered = BytesIO()
            image.convert("RGB").save(buffered, format="PNG")
            return {"bytes": buffered.getvalue()}
        except Exception:
            # Report the original failure; the retry only confirms it.
            print(f"Не удалось сериализовать изображение: {first_error}")
            return {"bytes": b""}


def save_to_partitioned_parquet_with_hf_metadata(
    datasets: Dict[str, Dict[str, List[Dict]]],
    output_dir: str = "combined_data_parquet_simple",
    chunk_size: int = 100
) -> None:
    """
    Write every split of every dataset as Parquet under ``output_dir/<split>/``.

    Records are written in chunks of ``chunk_size`` rows, partitioned by the
    ``source`` column, using the Arrow schema derived from the ``Features``
    declared below (image, question, answer, source).

    Args:
        datasets: ``{dataset_name: {split_name: [record, ...]}}`` where a record
            has the keys ``image``, ``question``, ``answer`` and ``source``.
        output_dir: Root directory of the output; created if missing.
        chunk_size: Number of records converted and written per call.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.

    Records whose image cannot be serialized are skipped and counted, because
    a row with empty image bytes cannot be decoded back into an image.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size должен быть положительным, получено: {chunk_size}")

    root_path = Path(output_dir)
    root_path.mkdir(parents=True, exist_ok=True)
    print(f" Данные будут сохранены в: {root_path.absolute()}")

    features = Features({
        'image': HFImage(decode=True),
        'question': Value('string'),
        'answer': Value('string'),
        'source': Value('string')
    })

    for ds_name, dataset_splits in tqdm(datasets.items(), desc="Датасеты", unit="датасет"):
        print(f"\n Обработка датасета: {ds_name}")

        for split_name, records in tqdm(dataset_splits.items(), desc=f"{ds_name} сплиты", unit="сплит", leave=False):
            if not records:
                print(f"  - Пропуск пустого сплита: {split_name}")
                continue

            print(f"  - Подготовка сплита '{split_name}' ({len(records)} записей)")

            split_output_path = root_path / split_name
            skipped = 0

            # Process in chunks so only chunk_size encoded images are held at once.
            for chunk_start in range(0, len(records), chunk_size):
                # Slicing already clamps at the end of the list.
                chunk_records = records[chunk_start:chunk_start + chunk_size]

                prepared_data = []
                for item in chunk_records:
                    image_payload = serialize_image_for_hf(item.get("image"))
                    if not image_payload.get("bytes"):
                        skipped += 1
                        continue
                    prepared_data.append({
                        "image": image_payload,
                        # "or" also covers fields that are present but None.
                        "question": (item.get("question") or "").strip(),
                        "answer": (item.get("answer") or "").strip(),
                        "source": item.get("source") or ds_name,
                    })

                if not prepared_data:
                    continue

                df = pd.DataFrame(prepared_data)
                arrow_table = pa.Table.from_pandas(df, schema=features.arrow_schema, preserve_index=False)

                # Write the chunk into the partitioned dataset for this split.
                pq.write_to_dataset(
                    arrow_table,
                    root_path=split_output_path,
                    partition_cols=['source'],
                    schema=features.arrow_schema,
                    existing_data_behavior='overwrite_or_ignore'
                )

            if skipped:
                print(f"  - Пропущено записей без корректного изображения: {skipped}")
            print(f"  Сплит '{split_name}' из '{ds_name}' успешно сохранен.")

    print(f"\n Все датасеты успешно сохранены!")

In [None]:
# Collect the datasets in a dict (key: dataset name, value: its splits).
# The key is only a fallback label: the save function uses each record's own
# "source" value when it has one.

all_datasets = {
    "ruCLEVR": ruclevr_data,
    "ruVQA": ruvqa_data, 
    "MMBench_ru": mmbench_data,
    "MWS_Vision_Bench": mws_data,
    "Docmatix": docmatrix_dataset
}


# Write all splits as partitioned Parquet under ./combined_data.
save_to_partitioned_parquet_with_hf_metadata(all_datasets, output_dir="combined_data")

In [None]:
import os
import json
from pathlib import Path

def setup_kaggle_auth():
    """
    Write ``~/.kaggle/kaggle.json`` from the available Kaggle credentials.

    Credentials are read from Kaggle notebook secrets when ``kaggle_secrets``
    can be imported, otherwise from the ``KAGGLE_USERNAME`` and ``KAGGLE_KEY``
    environment variables.

    Returns:
        The Kaggle username.

    Raises:
        ValueError: If the username or the key is missing.

    The file holds an API key, so it is created and kept with owner-only
    permissions (0o600) rather than being opened up to every user.
    """
    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        username = user_secrets.get_secret("KAGGLE_USERNAME")
        key = user_secrets.get_secret("KAGGLE_KEY")
    except ImportError:
        print("Используем переменные окружения для Kaggle")
        username = os.environ.get('KAGGLE_USERNAME')
        key = os.environ.get('KAGGLE_KEY')

    if not (username and key):
        raise ValueError("Не найдены учетные данные Kaggle")

    # Create the Kaggle config directory and file.
    kaggle_dir = Path.home() / '.kaggle'
    kaggle_dir.mkdir(parents=True, exist_ok=True)

    kaggle_json = kaggle_dir / 'kaggle.json'
    # Create the file with restrictive permissions from the start so the key
    # is never readable by others, even briefly.
    fd = os.open(kaggle_json, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as config_file:
        json.dump({'username': username, 'key': key}, config_file)
    # The mode passed to os.open is ignored for a file that already existed.
    kaggle_json.chmod(0o600)

    print("Аутентификация Kaggle настроена. Файл kaggle.json успешно создан и настроен.")
    return username

def create_dataset_metadata(username: str,
                            data_dir: Path = Path("/kaggle/working/combined_data")) -> Path:
    """
    Write ``dataset-metadata.json`` into the dataset directory.

    Args:
        username: Kaggle username; used as the owner part of the dataset id.
        data_dir: Directory that holds the files to upload. Defaults to the
            Kaggle working directory used by this notebook.

    Returns:
        The dataset directory as a ``Path``.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    data_dir = Path(data_dir)
    if not data_dir.is_dir():
        # Fail with a clear message instead of the bare error from write_text.
        raise FileNotFoundError(f"Каталог с данными не найден: {data_dir}")

    metadata = {
        "title": "DocVQA-ru-eng-v1",
        "id": f"{username}/docvqa-ru-eng-v1",
        "licenses": [{"name": "MIT"}]
    }

    metadata_path = data_dir / "dataset-metadata.json"
    # ensure_ascii=False may emit non-ASCII text, so pin the file encoding.
    metadata_path.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8")

    print("Файл метаданных создан/проверен.")
    return data_dir

def upload_to_kaggle(data_dir: Path):
    """
    Create a public Kaggle dataset from ``data_dir`` with the ``kaggle`` CLI.

    Runs ``kaggle datasets create -p <data_dir> --dir-mode zip --public`` and
    prints whether the command succeeded. Nothing is returned or raised when
    the upload fails.

    The command is passed as an argument list without a shell, so quotes,
    spaces or other shell metacharacters in the path are taken literally.
    """
    import subprocess  # local import: not among this file's top-level imports

    print("Загрузка датасета на Kaggle...")

    command = [
        "kaggle", "datasets", "create",
        "-p", str(data_dir),
        "--dir-mode", "zip",
        "--public",
    ]
    try:
        exit_code = subprocess.run(command, check=False).returncode
    except OSError as e:
        # Without a shell, a missing executable raises instead of returning a
        # non-zero status; report it and take the failure branch as before.
        print(f"Не удалось запустить kaggle CLI: {e}")
        exit_code = 1

    if exit_code == 0:
        print("Датасет успешно загружен!")
    else:
        print("Ошибка при загрузке датасета")

def main():
    """
    Run the upload: set up credentials, write the metadata file, upload.

    Prints the expected dataset URL afterwards, whether or not the upload
    command reported success.
    """
    print("Начало загрузки датасета на Kaggle...")

    kaggle_user = setup_kaggle_auth()
    upload_to_kaggle(create_dataset_metadata(kaggle_user))

    print(f"\nСсылка на датасет: https://www.kaggle.com/datasets/{kaggle_user}/docvqa-ru-eng-v1")
    print("Обработка может занять несколько минут.")

if __name__ == "__main__":
    main()