In [None]:
import torch
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch
from point_e.models.fusion import TextImageFusionModule
from point_e.models.multimodal import MultimodalPointDiffusionTransformer
from point_e.models.configs import MODEL_CONFIGS
from point_e.models.download import load_checkpoint

from point_e.diffusion.sampler import PointCloudSampler
from point_e.diffusion.configs import DIFFUSION_CONFIGS
from point_e.models.configs import MODEL_CONFIGS, model_from_config
from point_e.models.download import load_checkpoint
from point_e.util.plotting import plot_point_cloud

from torchvision import transforms

# CLIP preprocessing transform
def get_clip_transform(image_size=224):
    """Build the torchvision preprocessing pipeline used for CLIP image inputs.

    Args:
        image_size: Target size passed to both ``Resize`` and ``CenterCrop``.
            The default of 224 reproduces the previously hard-coded value.

    Returns:
        A ``transforms.Compose`` that applies, in order: bicubic resize,
        center crop, conversion to a tensor, and per-channel normalization.
    """
    # Per-channel (R, G, B) normalization statistics for CLIP preprocessing.
    channel_mean = (0.48145466, 0.4578275, 0.40821073)
    channel_std = (0.26862954, 0.26130258, 0.27577711)

    return transforms.Compose([
        transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])

def main(
    image_path="example_image.jpg",
    text_prompt="a red motorcycle with chrome details",
    checkpoint_path="multimodal_point_e_final.pt",
    output_prefix="multimodal_output",
):
    """Generate a point cloud from an image plus a text prompt and save it.

    Builds the multimodal base model and the stock upsampler, loads their
    weights, samples a point cloud conditioned on the image and the prompt,
    then writes ``<output_prefix>.png`` (rendered plot) and
    ``<output_prefix>.npz`` (point cloud). Every default reproduces the value
    that used to be hard-coded, so ``main()`` behaves as before.

    Args:
        image_path: Path of the conditioning image; it is converted to RGB.
        text_prompt: Text prompt passed to the model alongside the image.
        checkpoint_path: Path of the state dict loaded into the base model.
        output_prefix: Path prefix (without extension) for the two outputs.

    Raises:
        RuntimeError: If the sampler finishes without yielding any sample.
    """
    # Imported locally because the import block of this cell only brings in
    # DIFFUSION_CONFIGS; without this line the calls below raise NameError
    # when the notebook is run top to bottom on a fresh kernel.
    from point_e.diffusion.configs import diffusion_from_config

    # Setup device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Create the base multimodal model
    model_config = MODEL_CONFIGS['base40M'].copy()
    base_model = MultimodalPointDiffusionTransformer(
        device=device,
        **model_config
    )
    base_diffusion = diffusion_from_config(DIFFUSION_CONFIGS['base40M'])

    # Create the upsampler model (original Point-E)
    upsampler_model = model_from_config(MODEL_CONFIGS['upsample'], device)
    upsampler_diffusion = diffusion_from_config(DIFFUSION_CONFIGS['upsample'])

    # Load checkpoints.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    base_model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    upsampler_model.load_state_dict(load_checkpoint('upsample', device))

    # Inference only: switch both models to eval mode
    base_model.eval()
    upsampler_model.eval()

    # Two-stage sampler: 1024 base points, upsampled to 4096 in total
    sampler = PointCloudSampler(
        device=device,
        models=[base_model, upsampler_model],
        diffusions=[base_diffusion, upsampler_diffusion],
        num_points=[1024, 4096 - 1024],
        aux_channels=['R', 'G', 'B'],
        guidance_scale=[3.0, 3.0],
    )

    # Load and preprocess the image; the context manager closes the file
    # handle once the pixel data has been converted.
    transform = get_clip_transform()
    with Image.open(image_path) as raw_image:
        image = transform(raw_image.convert('RGB')).unsqueeze(0).to(device)

    # Generate point cloud, keeping only the final progressive sample.
    # NOTE(review): the same model_kwargs (images and texts) are handed to the
    # sampler for both stages -- confirm the upsampler accepts `texts`.
    print("Generating point cloud...")
    samples = None
    with torch.no_grad():
        for x in tqdm(sampler.sample_batch_progressive(
            batch_size=1,
            model_kwargs=dict(images=image, texts=[text_prompt])
        )):
            samples = x

    # Fail with a clear message instead of passing None to the sampler.
    if samples is None:
        raise RuntimeError("Sampler did not yield any samples")

    # Convert to point cloud
    pc = sampler.output_to_point_clouds(samples)[0]

    # Visualize
    plot_path = f"{output_prefix}.png"
    fig = plot_point_cloud(pc, grid_size=1)
    fig.savefig(plot_path)

    # Save point cloud
    cloud_path = f"{output_prefix}.npz"
    pc.save(cloud_path)

    print(f"Generated point cloud saved to {cloud_path}")
    print(f"Visualization saved to {plot_path}")


# Entry point: run the end-to-end generation demo when executed directly.
if __name__ == "__main__":
    main()

In [None]:


# Smoke test for the fusion module
def test_fusion_module():
    """Feed random text/image features through ``TextImageFusionModule``.

    Prints the input and output shapes; nothing is returned or asserted.
    """
    print("Testing TextImageFusionModule...")
    fusion_layer = TextImageFusionModule(clip_dim=768, fusion_dim=512)

    # Random stand-ins for one text embedding and a sequence of image tokens
    # per sample (196 tokens = 14x14 grid).
    n_samples = 2
    text_features = torch.randn(n_samples, 768)
    image_features = torch.randn(n_samples, 196, 768)

    # Single forward pass
    fused = fusion_layer(text_features, image_features)

    shape_report = (
        f"Input shapes: text_emb {text_features.shape}, img_tokens {image_features.shape}"
    )
    print(shape_report)
    print(f"Output shape: {fused.shape}")
    print("Fusion module test passed!")
    
# Smoke test for the multimodal model
def test_multimodal_model():
    """Instantiate ``MultimodalPointDiffusionTransformer`` on CPU and run one forward pass.

    Prints the input and output shapes on success. Any exception raised while
    building or calling the model is caught, reported together with its
    traceback, and not re-raised, so the surrounding cell keeps running.
    """
    print("Testing MultimodalPointDiffusionTransformer...")
    device = torch.device('cpu')  # CPU keeps debugging simple

    # Build the model
    model_config = MODEL_CONFIGS['base40M'].copy()
    try:
        model = MultimodalPointDiffusionTransformer(
            device=device,
            dtype=torch.float32,
            **model_config
        )
        print("Model initialization successful!")

        # Random inputs for the forward pass
        batch_size = 2
        x = torch.randn(batch_size, 6, 1024)  # [B, C, n_ctx]
        t = torch.randint(0, 1000, (batch_size,))

        # Random image batch and matching text prompts
        dummy_img = torch.randn(batch_size, 3, 224, 224)
        dummy_texts = ["a red chair", "a blue table"]

        # NOTE(review): `dummy_img` already carries a batch dimension, so this
        # passes a list of `batch_size` batched tensors. Other cells pass a
        # single batched tensor -- confirm which form the model expects.
        print("Running forward pass...")
        # Forward-only check: skip building the autograd graph.
        with torch.no_grad():
            out = model(x, t, images=[dummy_img]*batch_size, texts=dummy_texts)

        print(f"Input shapes: x {x.shape}, t {t.shape}")
        print(f"Output shape: {out.shape}")
        print("Multimodal model test passed!")

    except Exception as e:
        print(f"Error in model test: {e}")
        import traceback
        traceback.print_exc()

# Run both smoke tests (fusion module first, then the full model) when executed directly.
if __name__ == "__main__":
    test_fusion_module()
    test_multimodal_model()

In [None]:
import torch
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from point_e.diffusion.configs import DIFFUSION_CONFIGS, diffusion_from_config

def test_training_loop():
    """Smoke-test a single optimization step of the multimodal model on CPU.

    Freezes every parameter except those whose name contains
    ``"fusion_module"`` or ``"clip_embed"``, computes the diffusion training
    loss on a random batch, and performs one AdamW update. Errors raised
    during the training step are caught and printed with a traceback.

    Raises:
        ValueError: If no parameter name matches the trainable filter, so
            there would be nothing for the optimizer to update.
    """
    print("Testing training loop...")
    device = torch.device('cpu')  # Use CPU for debugging

    # Local imports keep this cell runnable without the earlier cells.
    from point_e.models.multimodal import MultimodalPointDiffusionTransformer
    from point_e.models.configs import MODEL_CONFIGS

    model_config = MODEL_CONFIGS['base40M'].copy()
    model = MultimodalPointDiffusionTransformer(
        device=device,
        dtype=torch.float32,
        **model_config
    )
    diffusion = diffusion_from_config(DIFFUSION_CONFIGS['base40M'])

    # Freeze everything except the fusion module / CLIP embedding parameters,
    # collecting the trainable ones in a single pass.
    trainable_params = []
    for name, param in model.named_parameters():
        is_trainable = "fusion_module" in name or "clip_embed" in name
        param.requires_grad = is_trainable
        if is_trainable:
            trainable_params.append(param)

    # Fail early with a specific message; AdamW would otherwise reject the
    # empty list with a generic error.
    if not trainable_params:
        raise ValueError(
            "No trainable parameters found: no parameter name contains "
            "'fusion_module' or 'clip_embed'"
        )

    # Setup optimizer (only for trainable parameters)
    optimizer = optim.AdamW(trainable_params, lr=1e-4)

    # Create dummy batch on the same device as the model and timesteps
    batch_size = 2
    images = torch.randn(batch_size, 3, 224, 224, device=device)
    texts = ["a red chair", "a blue table"]
    point_clouds = torch.randn(batch_size, 6, 1024, device=device)  # [B, C, n_ctx]

    # Training step
    model.train()
    t = torch.randint(0, diffusion.num_timesteps, (batch_size,), device=device)
    model_kwargs = {"images": images, "texts": texts}

    try:
        # Compute training loss
        losses = diffusion.training_losses(
            model=model,
            x_start=point_clouds,
            t=t,
            model_kwargs=model_kwargs
        )
        loss = losses["loss"].mean()

        # Backprop and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Training loss: {loss.item()}")
        print("Training loop test passed!")

    except Exception as e:
        print(f"Error in training loop test: {e}")
        import traceback
        traceback.print_exc()

# Run the single-step training smoke test when executed directly.
if __name__ == "__main__":
    test_training_loop()

In [None]:
import torch
from tqdm import tqdm

def test_generation():
    """Smoke-test the two-stage sampling pipeline on CPU with random inputs.

    Builds the multimodal base model and the stock upsampler without loading
    any checkpoint, then runs the progressive sampler on a random image and
    a fixed prompt. Errors raised during sampling are caught and printed
    with a traceback; nothing is returned.
    """
    print("Testing point cloud generation...")
    device = torch.device('cpu')  # Use CPU for debugging

    # Local imports keep this cell runnable on its own. diffusion_from_config
    # is included so the cell no longer relies on an earlier cell's import.
    from point_e.diffusion.sampler import PointCloudSampler
    from point_e.diffusion.configs import DIFFUSION_CONFIGS, diffusion_from_config
    from point_e.models.configs import MODEL_CONFIGS, model_from_config
    from point_e.models.multimodal import MultimodalPointDiffusionTransformer

    # Create the base multimodal model
    model_config = MODEL_CONFIGS['base40M'].copy()
    base_model = MultimodalPointDiffusionTransformer(
        device=device,
        **model_config
    )
    base_diffusion = diffusion_from_config(DIFFUSION_CONFIGS['base40M'])

    # Create the upsampler model (original Point-E)
    upsampler_model = model_from_config(MODEL_CONFIGS['upsample'], device)
    upsampler_diffusion = diffusion_from_config(DIFFUSION_CONFIGS['upsample'])

    # Set models to eval mode
    base_model.eval()
    upsampler_model.eval()

    # Create sampler
    sampler = PointCloudSampler(
        device=device,
        models=[base_model, upsampler_model],
        diffusions=[base_diffusion, upsampler_diffusion],
        num_points=[1024, 4096 - 1024],
        aux_channels=['R', 'G', 'B'],
        guidance_scale=[3.0, 3.0],
    )

    # Create dummy inputs
    image = torch.randn(1, 3, 224, 224, device=device)
    text_prompt = "a red chair"

    try:
        # Generate point cloud (with limited steps for testing)
        print("Generating point cloud...")
        with torch.no_grad():
            # Use a very small number of steps for testing.
            # NOTE(review): this overwrites an attribute after construction;
            # confirm the sampler's default sampling path actually reads
            # num_timesteps for its step count -- the sampler's own step-count
            # argument may be the supported way to shorten a run.
            base_diffusion.num_timesteps = 10
            upsampler_diffusion.num_timesteps = 10

            samples = None
            for x in sampler.sample_batch_progressive(
                batch_size=1,
                model_kwargs=dict(images=[image], texts=[text_prompt])
            ):
                samples = x

        # Do not report success if the sampler never produced anything.
        if samples is None:
            raise RuntimeError("Sampler did not yield any samples")

        print("Generation test passed!")

    except Exception as e:
        print(f"Error in generation test: {e}")
        import traceback
        traceback.print_exc()

# Run the generation smoke test when executed directly.
if __name__ == "__main__":
    test_generation()