# Demo de inferencia multimodal

Este cuaderno muestra cómo preparar las entradas multimodales y ejecutar el `MultimodalPredictor` con el modelo de demostración incluido en el esqueleto.

In [None]:
from pathlib import Path
import json
import numpy as np
from PIL import Image

# Root folder and sample identifier shared by every cell of this demo.
ROOT = Path('demo_assets')
NAME = 'sample_0001'

# Sub-directories that make up the expected on-disk hierarchy:
# one folder for images, one per material map, and one for metadata.
_SUBDIRS = (
    'data/images',
    'data/maps/normal',
    'data/maps/roughness',
    'data/maps/specular',
    'data/maps/emissive',
    'data/maps/metalness',
    'data/meta',
)
for sub in _SUBDIRS:
    target_dir = ROOT / sub
    # exist_ok=True keeps the cell re-runnable without errors.
    target_dir.mkdir(parents=True, exist_ok=True)

# Fixed seed so the synthetic sample is reproducible between runs.
rng = np.random.default_rng(42)
size = (480, 640)  # (H, W)

def save_rgb(path, array):
    """Write *array* to *path* as an image after casting it to uint8.

    Args:
        path: Destination file; the image is written with ``Image.save``.
        array: Array-like object exposing ``astype``; it is cast to
            ``np.uint8`` before being handed to ``Image.fromarray``.
    """
    pixels = array.astype(np.uint8)
    image = Image.fromarray(pixels)
    image.save(path)

# Synthetic 8-bit noise for every modality. `high` is exclusive in
# Generator.integers, so 256 (not 255) is needed to cover the full 0..255 range.
rgb = rng.integers(0, 256, size + (3,), dtype=np.uint8)
normal = rng.integers(0, 256, size + (3,), dtype=np.uint8)
rough = rng.integers(0, 256, size, dtype=np.uint8)
spec = rng.integers(0, 256, size, dtype=np.uint8)
emiss = rng.integers(0, 256, size, dtype=np.uint8)
metal = rng.integers(0, 256, size, dtype=np.uint8)

# Three-channel arrays go through the helper; the file-name suffix
# (_n, _r, _s, _e, _m) identifies each map.
save_rgb(ROOT / 'data/images' / f'{NAME}.png', rgb)
save_rgb(ROOT / 'data/maps/normal' / f'{NAME}_n.png', normal)
# Single-channel maps are written directly from the 2-D uint8 arrays.
Image.fromarray(rough).save(ROOT / 'data/maps/roughness' / f'{NAME}_r.png')
Image.fromarray(spec).save(ROOT / 'data/maps/specular' / f'{NAME}_s.png')
Image.fromarray(emiss).save(ROOT / 'data/maps/emissive' / f'{NAME}_e.png')
Image.fromarray(metal).save(ROOT / 'data/maps/metalness' / f'{NAME}_m.png')

# Per-sample metadata stored next to the images as JSON.
# NOTE(review): only two dominant colors are listed here, while the model cell
# sizes the context vector for 5 colors (3 stats + 15) -- confirm that the
# predictor pads or truncates the color list.
meta = {
    'luminance_lab': 62.0,
    'saturation': 0.6,
    'contrast': 35.0,
    'dominant_colors': [
        [220, 180, 40],
        [15, 100, 200],
    ],
}

with open(ROOT / 'data/meta' / f'{NAME}.json', 'w', encoding='utf-8') as fh:
    json.dump(meta, fh, indent=2, ensure_ascii=False)

# Last expression of the cell: shown as the notebook cell output.
ROOT, NAME

In [None]:
import torch
from src.model.model import MultimodalYoloStub
from src.infer import MultimodalPredictor

# Configuration consistent with the synthetic data generated above
config = dict(
    imgsz=512,
    use_metalness=True,
    use_coordconv=True,
    mask_threshold=0.5,
    score_threshold=0.1,
)

# Input channel count derived from the configuration:
# RGB (3) + normals (3) + three scalar maps (1 each), plus optional extras.
base_channels = 3 + 3 + 1 + 1 + 1
base_channels += 1 if config['use_metalness'] else 0  # optional metalness map
base_channels += 2 if config['use_coordconv'] else 0  # two extra CoordConv channels

num_classes = 4
context_dim = 18  # 3 stats + 5 colors (RGB) => 3 + 15

# Note: the `base_channels` keyword below is a model hyper-parameter and is
# unrelated to the input channel count computed above under the same name.
model = MultimodalYoloStub(
    in_channels=base_channels,
    num_classes=num_classes,
    dim_context=context_dim,
    base_channels=16,
    neck_channels=32,
)

predictor = MultimodalPredictor(model, cfg=config)

# Last expression of the cell: shown as the notebook cell output.
predictor

In [None]:
# Run the predictor on the synthetic sample and summarize the result.
prediction = predictor.predict(str(ROOT), NAME)
print('Clases detectadas:', prediction['classes'])
# Scores are rounded to three decimals for display only.
rounded_scores = [round(score, 3) for score in prediction['scores']]
print('Scores:', rounded_scores)
mask_count = len(prediction['masks'])
print('Número de máscaras:', mask_count)