PHASE 4

In [None]:
#pip install onnx
!pip install onnxruntime-gpu


Collecting onnxruntime-gpu
  Downloading onnxruntime_gpu-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.2 kB)
Collecting coloredlogs (from onnxruntime-gpu)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime-gpu)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime_gpu-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (300.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.5/300.5 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hIn

In [None]:
"""
PHASE 4: MODEL OPTIMIZATION & DEPLOYMENT
Advanced techniques for production deployment

Features:
- Model Quantization (INT8)
- ONNX Export
- TorchScript Compilation
- Model Pruning
- Batch Processing Optimization
- Deployment-ready exports
"""

import copy
import io
import json
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import onnx
import onnxruntime as ort
import torch
import torch.nn as nn
import torch.quantization
import torchvision

# Banner: an 80-column rule above and below the phase title.
print(
    "=" * 80,
    "🚀 PHASE 4: MODEL OPTIMIZATION & DEPLOYMENT",
    "=" * 80,
    sep="\n",
)

# ============================================================================
# PART 1: MODEL QUANTIZATION (INT8)
# ============================================================================

class ModelQuantizer:
    """
    Quantize a model to INT8 for CPU inference.

    Quantization runs on a CPU copy of the wrapped model, so the model handed
    to the constructor keeps its device, mode and attributes.

    Args:
        model: The ``nn.Module`` to quantize.
        device: Stored on the instance; none of the methods below read it.
    """
    def __init__(self, model, device='cuda'):
        self.model = model
        self.device = device

    def quantize_dynamic(self):
        """
        Dynamic quantization (simplest, no calibration needed)
        Good for: CPU deployment

        Returns:
            A dynamically quantized CPU copy of ``self.model``.
        """
        print("\n🔧 Applying Dynamic Quantization...")

        # nn.Module.cpu() moves a module in place; copying first keeps the
        # caller's (possibly CUDA-resident) model usable by later GPU steps.
        cpu_model = copy.deepcopy(self.model).cpu()

        # NOTE(review): dynamic quantization is documented for Linear and
        # recurrent layers; the Conv2d entry is presumably ignored — confirm.
        quantized_model = torch.quantization.quantize_dynamic(
            cpu_model,
            {nn.Linear, nn.Conv2d},
            dtype=torch.qint8,
            inplace=True,  # cpu_model is already a private copy
        )

        print("✓ Dynamic quantization applied!")
        return quantized_model

    def prepare_for_quantization_aware_training(self):
        """
        Prepare a copy of the model for Quantization-Aware Training (QAT).

        Returns:
            A CPU copy of ``self.model`` in train mode, prepared with the
            default 'fbgemm' QAT qconfig. Train it, then pass it to
            ``convert_quantized_model``.
        """
        print("\n🔧 Preparing for Quantization-Aware Training...")

        # Work on a copy so the qconfig attribute, train mode and CPU move do
        # not leak onto the caller's model.
        model = copy.deepcopy(self.model).cpu()
        model.train()

        # Specify quantization config
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')

        # Prepare model (in place: `model` is already a private copy)
        model_prepared = torch.quantization.prepare_qat(model, inplace=True)

        print("✓ Model prepared for QAT!")
        print("   Train for a few epochs, then call convert()")
        return model_prepared

    def convert_quantized_model(self, prepared_model):
        """Convert a QAT-prepared model to its quantized version.

        Switches ``prepared_model`` to eval mode, then returns the result of
        ``torch.quantization.convert``.
        """
        prepared_model.eval()
        quantized_model = torch.quantization.convert(prepared_model)
        print("✓ Model converted to INT8!")
        return quantized_model

    @staticmethod
    def _time_forward(model, dummy_input, num_iterations, warmup=10):
        """Return per-call wall-clock seconds for ``model([dummy_input])``."""
        timings = []
        with torch.no_grad():
            for _ in range(warmup):
                model([dummy_input])

            for _ in range(num_iterations):
                start = time.perf_counter()
                model([dummy_input])
                timings.append(time.perf_counter() - start)
        return timings

    @staticmethod
    def _serialized_size_mb(model):
        """Size in MB of the model's serialized ``state_dict``.

        Summing ``parameters()`` misses any state that is not registered as a
        parameter (buffers, packed weights), so the state dict is measured
        instead.
        """
        buffer = io.BytesIO()
        torch.save(model.state_dict(), buffer)
        return buffer.getbuffer().nbytes / 1e6

    def benchmark_quantization(self, original_model, quantized_model,
                               num_iterations=100,
                               input_shape=(1, 3, 640, 640)):
        """
        Compare original vs quantized model latency and size on the CPU.

        Args:
            original_model: Reference model; benchmarked on the CPU and moved
                back to its previous device afterwards. It is left in eval mode.
            quantized_model: Quantized model; switched to eval mode.
            num_iterations: Timed forward passes per model (>= 1).
            input_shape: Shape of the random input tensor.

        Returns:
            dict with ``original_time_ms``, ``quantized_time_ms``, ``speedup``,
            ``original_size_mb``, ``quantized_size_mb`` and
            ``size_reduction_percent``.

        Raises:
            ValueError: If ``num_iterations`` is smaller than 1.
        """
        if num_iterations < 1:
            raise ValueError("num_iterations must be at least 1")

        print("\n📊 Benchmarking Quantization...")

        # Create dummy input
        # NOTE(review): both models are called with a one-element list holding
        # this tensor; confirm that matches the model's expected input format.
        dummy_input = torch.rand(*input_shape)

        # Benchmark original model on the CPU, then put it back where it was.
        first_param = next(original_model.parameters(), None)
        previous_device = first_param.device if first_param is not None else None
        original_model.eval()
        original_model.cpu()
        try:
            original_times = self._time_forward(
                original_model, dummy_input, num_iterations
            )
            original_size = self._serialized_size_mb(original_model)
        finally:
            if previous_device is not None:
                original_model.to(previous_device)

        # Benchmark quantized model
        quantized_model.eval()
        quantized_times = self._time_forward(
            quantized_model, dummy_input, num_iterations
        )
        quantized_size = self._serialized_size_mb(quantized_model)

        # Calculate statistics
        original_avg = np.mean(original_times) * 1000  # ms
        quantized_avg = np.mean(quantized_times) * 1000  # ms
        speedup = original_avg / quantized_avg if quantized_avg > 0 else float('inf')
        size_reduction = (1 - quantized_size / original_size) * 100

        print(f"\n📊 Quantization Results:")
        print(f"   Original Model:")
        print(f"      Time: {original_avg:.2f}ms")
        print(f"      Size: {original_size:.2f}MB")
        print(f"   Quantized Model:")
        print(f"      Time: {quantized_avg:.2f}ms")
        print(f"      Size: {quantized_size:.2f}MB")
        print(f"   Improvements:")
        print(f"      🚀 Speedup: {speedup:.2f}x")
        print(f"      💾 Size Reduction: {size_reduction:.1f}%")

        return {
            'original_time_ms': original_avg,
            'quantized_time_ms': quantized_avg,
            'speedup': speedup,
            'original_size_mb': original_size,
            'quantized_size_mb': quantized_size,
            'size_reduction_percent': size_reduction
        }

# ============================================================================
# PART 2: ONNX EXPORT
# ============================================================================

class ONNXExporter:
    """
    Export a PyTorch model to ONNX and benchmark it with ONNX Runtime.

    Args:
        model: The ``nn.Module`` to export.
        device: Stored on the instance; export itself always runs on the CPU.
    """
    def __init__(self, model, device='cuda'):
        self.model = model
        self.device = device

    def export_to_onnx(self, output_path="outputs/models/model.onnx",
                       input_shape=(1, 3, 640, 640), opset_version=11):
        """
        Export model to ONNX format

        The model is switched to eval mode and exported from the CPU; it is
        moved back to its previous device afterwards so later GPU steps that
        share the same model object keep working.

        Args:
            output_path: Where to save ONNX model
            input_shape: Input tensor shape
            opset_version: ONNX opset version

        Returns:
            ``output_path``, after the written file passed ``onnx.checker``.
        """
        print("\n📦 Exporting to ONNX...")

        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        self.model.eval()

        # nn.Module.cpu() moves the module in place, so remember where it was.
        first_param = next(self.model.parameters(), None)
        previous_device = first_param.device if first_param is not None else None
        self.model.cpu()

        # Create dummy input
        dummy_input = torch.randn(*input_shape)

        # Export
        try:
            torch.onnx.export(
                self.model,
                (dummy_input,),
                output_path,
                export_params=True,
                opset_version=opset_version,
                do_constant_folding=True,
                input_names=['input'],
                output_names=['output'],
                dynamic_axes={
                    'input': {0: 'batch_size'},
                    'output': {0: 'batch_size'}
                }
            )
        finally:
            if previous_device is not None:
                self.model.to(previous_device)

        # Verify ONNX model
        onnx_model = onnx.load(output_path)
        onnx.checker.check_model(onnx_model)

        file_size = Path(output_path).stat().st_size / 1e6

        print(f"✓ ONNX export successful!")
        print(f"   File: {output_path}")
        print(f"   Size: {file_size:.2f}MB")
        print(f"   Opset: {opset_version}")

        return output_path

    def benchmark_onnx(self, onnx_path, num_iterations=100,
                       input_shape=(1, 3, 640, 640), providers=None):
        """
        Benchmark ONNX Runtime inference

        Args:
            onnx_path: Path of the exported ONNX file.
            num_iterations: Number of timed runs (>= 1).
            input_shape: Shape of the random float32 input fed to the session.
            providers: Execution providers to request. Defaults to CUDA then
                CPU, limited to those this onnxruntime build reports as
                available.

        Returns:
            dict with ``avg_time_ms``, ``std_time_ms`` and ``throughput_fps``.

        Raises:
            ValueError: If ``num_iterations`` is smaller than 1.
        """
        if num_iterations < 1:
            raise ValueError("num_iterations must be at least 1")

        print(f"\n📊 Benchmarking ONNX Runtime...")

        if providers is None:
            # Only request providers that this build actually ships.
            available = ort.get_available_providers()
            providers = [
                name
                for name in ('CUDAExecutionProvider', 'CPUExecutionProvider')
                if name in available
            ]

        # Create ONNX Runtime session
        session = ort.InferenceSession(onnx_path, providers=providers)

        # Get input name
        input_name = session.get_inputs()[0].name

        # Create dummy input
        dummy_input = np.random.randn(*input_shape).astype(np.float32)
        feed = {input_name: dummy_input}

        # Warmup
        for _ in range(10):
            session.run(None, feed)

        # Benchmark
        times = []
        for _ in range(num_iterations):
            start = time.perf_counter()
            session.run(None, feed)
            times.append(time.perf_counter() - start)

        avg_time = np.mean(times) * 1000  # ms
        std_time = np.std(times) * 1000
        throughput = 1000.0 / avg_time  # FPS

        print(f"✓ ONNX Runtime Benchmark:")
        print(f"   Average Time: {avg_time:.2f}ms (±{std_time:.2f}ms)")
        print(f"   Throughput: {throughput:.2f} FPS")

        return {
            'avg_time_ms': avg_time,
            'std_time_ms': std_time,
            'throughput_fps': throughput
        }

# ============================================================================
# PART 3: TORCHSCRIPT COMPILATION
# ============================================================================

class TorchScriptCompiler:
    """
    Compile a model to TorchScript and benchmark the compiled module.

    Args:
        model: The ``nn.Module`` to compile.
        device: Device used for the dummy inputs (and, when tracing, for the
            model itself).
    """
    def __init__(self, model, device='cuda'):
        self.model = model
        self.device = device

    def _uses_cuda(self):
        """True when ``self.device`` names a CUDA device."""
        return torch.device(self.device).type == 'cuda'

    def compile_trace(self, input_shape=(1, 3, 640, 640)):
        """
        Compile using tracing (records operations)
        Best for: Models without control flow

        The model is switched to eval mode and moved to ``self.device`` so it
        lives on the same device as the dummy input it is traced with.

        Returns:
            The traced module after ``torch.jit.optimize_for_inference``.
        """
        print("\n🔧 Compiling with TorchScript (Trace)...")

        self.model.eval()
        # Another step may have moved the shared model (e.g. to the CPU);
        # tracing with inputs on a different device raises a type mismatch.
        self.model.to(self.device)
        dummy_input = [torch.randn(*input_shape, device=self.device)]

        # Trace the model
        with torch.no_grad():
            traced_model = torch.jit.trace(self.model, dummy_input)

        # Optimize
        traced_model = torch.jit.optimize_for_inference(traced_model)

        print("✓ TorchScript tracing complete!")
        return traced_model

    def compile_script(self):
        """
        Compile using scripting (analyzes Python code)
        Best for: Models with control flow (if/for statements)

        Returns:
            The result of ``torch.jit.script`` on the eval-mode model.
        """
        print("\n🔧 Compiling with TorchScript (Script)...")

        self.model.eval()
        scripted_model = torch.jit.script(self.model)

        print("✓ TorchScript scripting complete!")
        return scripted_model

    def save_torchscript(self, compiled_model, output_path="outputs/models/model_traced.pt"):
        """Save a TorchScript module, creating the parent directory if needed.

        Returns:
            ``output_path``.
        """
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        compiled_model.save(output_path)
        file_size = Path(output_path).stat().st_size / 1e6

        print(f"✓ TorchScript model saved!")
        print(f"   File: {output_path}")
        print(f"   Size: {file_size:.2f}MB")

        return output_path

    def _time_forward(self, model, inputs, num_iterations, warmup=10):
        """Return per-call wall-clock seconds for ``model(inputs)``.

        CUDA kernels run asynchronously, so the device is synchronized around
        each timed call; on non-CUDA devices no synchronization is needed.
        """
        use_cuda = self._uses_cuda()
        device = torch.device(self.device)
        timings = []
        with torch.no_grad():
            for _ in range(warmup):
                model(inputs)

            for _ in range(num_iterations):
                if use_cuda:
                    torch.cuda.synchronize(device)
                start = time.perf_counter()
                model(inputs)
                if use_cuda:
                    torch.cuda.synchronize(device)
                timings.append(time.perf_counter() - start)
        return timings

    def benchmark_torchscript(self, original_model, compiled_model,
                             num_iterations=100,
                             input_shape=(1, 3, 640, 640)):
        """Compare original vs TorchScript model latency.

        Args:
            original_model: Eager model; switched to eval mode.
            compiled_model: TorchScript module to compare against.
            num_iterations: Timed forward passes per model (>= 1).
            input_shape: Shape of the random input tensor.

        Returns:
            dict with ``original_time_ms``, ``compiled_time_ms`` and ``speedup``.

        Raises:
            ValueError: If ``num_iterations`` is smaller than 1.
        """
        if num_iterations < 1:
            raise ValueError("num_iterations must be at least 1")

        print("\n📊 Benchmarking TorchScript...")

        # NOTE(review): both models are called with a one-element list holding
        # this tensor; confirm that matches how the compiled module was built.
        dummy_input = [torch.randn(*input_shape, device=self.device)]

        # Benchmark original
        original_model.eval()
        original_times = self._time_forward(original_model, dummy_input, num_iterations)

        # Benchmark compiled
        compiled_times = self._time_forward(compiled_model, dummy_input, num_iterations)

        original_avg = np.mean(original_times) * 1000
        compiled_avg = np.mean(compiled_times) * 1000
        speedup = original_avg / compiled_avg if compiled_avg > 0 else float('inf')

        print(f"\n📊 TorchScript Results:")
        print(f"   Original: {original_avg:.2f}ms")
        print(f"   TorchScript: {compiled_avg:.2f}ms")
        print(f"   🚀 Speedup: {speedup:.2f}x")

        return {
            'original_time_ms': original_avg,
            'compiled_time_ms': compiled_avg,
            'speedup': speedup
        }

# ============================================================================
# PART 4: BATCH PROCESSING OPTIMIZER
# ============================================================================

class BatchProcessingOptimizer:
    """
    Optimize batch processing for maximum throughput

    The constructor moves the model to ``device`` and switches it to eval mode.
    """
    def __init__(self, model, device='cuda'):
        self.model = model.to(device)
        self.model.eval()
        self.device = device

    def find_optimal_batch_size(self, input_shape=(3, 640, 640),
                                max_batch_size=32, num_iterations=50):
        """
        Find optimal batch size for maximum throughput

        Batch sizes 1, 2, 4, ... up to ``max_batch_size`` are benchmarked with
        random inputs; the search stops at the first out-of-memory error.

        Args:
            input_shape: Shape of one input tensor.
            max_batch_size: Largest batch size to try.
            num_iterations: Timed forward passes per batch size (>= 1).

        Returns:
            ``(results, optimal_batch)`` where ``results`` maps batch size to
            ``avg_time`` (s), ``throughput`` (img/s), ``latency_per_image`` (s)
            and ``memory_gb`` (peak CUDA allocation; 0.0 on non-CUDA devices).

        Raises:
            ValueError: If ``num_iterations`` is smaller than 1, or if no batch
                size could be benchmarked.
        """
        if num_iterations < 1:
            raise ValueError("num_iterations must be at least 1")

        print("\n🔍 Finding Optimal Batch Size...")

        device = torch.device(self.device)
        use_cuda = device.type == 'cuda'

        # Powers of two up to the requested maximum: 1, 2, 4, ...
        candidates = []
        candidate = 1
        while candidate <= max_batch_size:
            candidates.append(candidate)
            candidate *= 2

        results = {}

        for batch_size in candidates:
            try:
                if use_cuda:
                    # Reset before measuring so the peak reflects this batch
                    # size only, not earlier work on the device.
                    torch.cuda.reset_peak_memory_stats(device)

                dummy_input = [torch.randn(*input_shape, device=device)
                               for _ in range(batch_size)]

                times = []
                with torch.no_grad():
                    # Warmup
                    for _ in range(5):
                        self.model(dummy_input)

                    # Benchmark
                    for _ in range(num_iterations):
                        if use_cuda:
                            torch.cuda.synchronize(device)
                        start = time.perf_counter()
                        self.model(dummy_input)
                        if use_cuda:
                            torch.cuda.synchronize(device)
                        times.append(time.perf_counter() - start)

                avg_time = np.mean(times)
                throughput = batch_size / avg_time
                latency = avg_time / batch_size
                memory = (torch.cuda.max_memory_allocated(device) / 1e9
                          if use_cuda else 0.0)

                results[batch_size] = {
                    'avg_time': avg_time,
                    'throughput': throughput,
                    'latency_per_image': latency,
                    'memory_gb': memory
                }

                print(f"   Batch {batch_size}: {throughput:.1f} img/s, "
                      f"{latency*1000:.1f}ms/img, {memory:.2f}GB")

            except RuntimeError as e:
                if "out of memory" in str(e):
                    print(f"   Batch {batch_size}: Out of memory!")
                    if use_cuda:
                        # Hand the failed allocation's blocks back to the driver.
                        torch.cuda.empty_cache()
                    break
                raise

        if not results:
            raise ValueError(
                "No batch size could be benchmarked "
                f"(max_batch_size={max_batch_size})"
            )

        # Find optimal
        optimal_batch = max(results, key=lambda k: results[k]['throughput'])

        print(f"\n✓ Optimal Batch Size: {optimal_batch}")
        print(f"   Throughput: {results[optimal_batch]['throughput']:.1f} img/s")

        return results, optimal_batch

    def process_batch_efficiently(self, images, batch_size=8):
        """
        Process list of images in optimized batches

        Args:
            images: Sequence of inputs; slices of it are passed to the model.
            batch_size: Number of images per forward pass (>= 1).

        Returns:
            Flat list with the model's per-batch results concatenated in order.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        results = []

        with torch.no_grad():
            for start in range(0, len(images), batch_size):
                results.extend(self.model(images[start:start + batch_size]))

        return results

# ============================================================================
# PART 5: MODEL PRUNING
# ============================================================================

class ModelPruner:
    """
    Prune a model by zeroing its lowest-magnitude weights.

    The constructor moves the model to ``device``.
    """
    def __init__(self, model, device='cuda'):
        self.model = model.to(device)
        self.device = device

    def prune_model(self, pruning_amount=0.3):
        """
        Apply L1 unstructured pruning to every Conv2d and Linear layer.

        Pruned weights are set to zero and the pruning is made permanent with
        ``prune.remove``; the weight tensors keep their shape, so the
        parameter count itself does not change.

        Args:
            pruning_amount: Fraction of each layer's weights to prune
                (0.3 = 30%), between 0 and 1.

        Returns:
            ``self.model``, pruned in place.

        Raises:
            ValueError: If ``pruning_amount`` is outside [0, 1].
        """
        if not 0.0 <= pruning_amount <= 1.0:
            raise ValueError("pruning_amount must be between 0 and 1")

        print(f"\n✂️ Pruning model ({pruning_amount*100:.0f}% of weights)...")

        import torch.nn.utils.prune as prune

        # Count original parameters
        original_params = sum(p.numel() for p in self.model.parameters())

        # Prune all Conv2d and Linear layers
        pruned_params = 0
        for _, module in self.model.named_modules():
            if isinstance(module, (nn.Conv2d, nn.Linear)):
                prune.l1_unstructured(module, name='weight', amount=pruning_amount)
                # The mask is dropped by prune.remove, so count its zero
                # entries (the weights pruned by this call) first.
                pruned_params += int((module.weight_mask == 0).sum().item())
                prune.remove(module, 'weight')

        reduction = (pruned_params / original_params) * 100 if original_params else 0.0

        print(f"✓ Pruning complete!")
        print(f"   Original parameters: {original_params:,}")
        print(f"   Pruned parameters: {pruned_params:,}")
        print(f"   Reduction: {reduction:.1f}%")

        return self.model

# ============================================================================
# PART 6: DEPLOYMENT PACKAGE CREATOR
# ============================================================================

class DeploymentPackager:
    """
    Create a deployment package: weights, ONNX, TorchScript, README, manifest.

    Args:
        model: The ``nn.Module`` to package.
        device: Device passed on to the ONNX and TorchScript helpers.
    """
    def __init__(self, model, device='cuda'):
        self.model = model
        self.device = device

    def create_deployment_package(self, output_dir="outputs/deployment"):
        """
        Create complete deployment package with all formats

        The PyTorch checkpoint is always written. ONNX export and TorchScript
        tracing are best-effort: a failure is printed and that format is left
        out of the manifest and of the README.

        Args:
            output_dir: Directory to write into (created if missing).

        Returns:
            dict with ``created_at`` and ``formats`` (format name -> file path).
        """
        print("\n📦 Creating Deployment Package...")

        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        package_info = {
            'created_at': datetime.now().isoformat(),
            'formats': {}
        }

        # 1. Save PyTorch model
        print("\n1️⃣ Saving PyTorch model...")
        pytorch_path = output_path / "model.pth"
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'model_architecture': str(type(self.model).__name__)
        }, pytorch_path)
        package_info['formats']['pytorch'] = str(pytorch_path)
        print(f"   ✓ Saved: {pytorch_path}")

        # 2. Export to ONNX
        print("\n2️⃣ Exporting to ONNX...")
        try:
            exporter = ONNXExporter(self.model, self.device)
            onnx_path = exporter.export_to_onnx(
                output_path=str(output_path / "model.onnx")
            )
            package_info['formats']['onnx'] = onnx_path
        except Exception as e:
            print(f"   ⚠️ ONNX export failed: {e}")

        # 3. Compile to TorchScript
        print("\n3️⃣ Compiling to TorchScript...")
        try:
            # The ONNX step may have moved the shared model to the CPU; put it
            # back on the target device so tracing sees matching devices.
            self.model.to(self.device)
            compiler = TorchScriptCompiler(self.model, self.device)
            traced_model = compiler.compile_trace()
            torchscript_path = compiler.save_torchscript(
                traced_model,
                output_path=str(output_path / "model_traced.pt")
            )
            package_info['formats']['torchscript'] = torchscript_path
        except Exception as e:
            print(f"   ⚠️ TorchScript compilation failed: {e}")

        # 4. Create README
        print("\n4️⃣ Creating documentation...")
        readme_content = self._generate_readme(package_info)
        readme_path = output_path / "README.md"
        with open(readme_path, 'w', encoding='utf-8') as f:
            f.write(readme_content)
        print(f"   ✓ Saved: {readme_path}")

        # 5. Save package info
        info_path = output_path / "package_info.json"
        with open(info_path, 'w', encoding='utf-8') as f:
            json.dump(package_info, f, indent=4)
        print(f"   ✓ Saved: {info_path}")

        print(f"\n✅ Deployment package created: {output_path}")

        return package_info

    def _generate_readme(self, package_info):
        """Generate README for deployment package.

        Only the formats listed in ``package_info['formats']`` are documented,
        so the README never points at a file whose export failed. The Quick
        Start section loads the TorchScript file and is therefore included
        only when that format is present.
        """
        formats = package_info.get('formats', {})

        header = f"""# Object Detection Model - Deployment Package

## Created: {package_info['created_at']}

## Available Formats

"""

        format_sections = {
            'pytorch': """### PyTorch (.pth)
- File: `model.pth`
- Usage:
```python
import torch
checkpoint = torch.load('model.pth')
model.load_state_dict(checkpoint['model_state_dict'])
```

""",
            'onnx': """### ONNX (.onnx)
- File: `model.onnx`
- Usage:
```python
import onnxruntime as ort
session = ort.InferenceSession('model.onnx')
output = session.run(None, {'input': input_data})
```

""",
            'torchscript': """### TorchScript (.pt)
- File: `model_traced.pt`
- Usage:
```python
import torch
model = torch.jit.load('model_traced.pt')
output = model(input_tensor)
```

""",
        }

        specifications = """## Model Specifications

- **Input**: RGB images, 640x640 pixels
- **Output**: Bounding boxes, labels, confidence scores
- **Classes**: 90 COCO object categories

"""

        quick_start = """## Quick Start

```python
# Load model
import torch
model = torch.jit.load('model_traced.pt')
model.eval()

# Run inference
with torch.no_grad():
    predictions = model([image_tensor])
```

"""

        footer = """## Performance Notes

- Optimized for GPU inference
- Supports batch processing
- Use batch size 4-8 for best throughput

## Requirements

- PyTorch >= 1.13
- torchvision >= 0.14
- CUDA >= 11.0 (for GPU)

"""

        parts = [header]
        parts.extend(
            section for key, section in format_sections.items() if key in formats
        )
        parts.append(specifications)
        if 'torchscript' in formats:
            parts.append(quick_start)
        parts.append(footer)
        return "".join(parts)

# ============================================================================
# PART 7: COMPLETE PHASE 4 DEMO
# ============================================================================

def run_phase4_complete(model, device='cuda'):
    """
    Run complete Phase 4 optimization and deployment

    Steps: batch-size search, TorchScript tracing + benchmark, ONNX export +
    benchmark, deployment package, JSON report. TorchScript and ONNX failures
    are recorded under an ``error`` key instead of aborting the run.

    Args:
        model: The ``nn.Module`` to optimize and package.
        device: Device passed to every helper.

    Returns:
        dict with ``timestamp``, ``optimizations`` and ``deployment_package``.
    """
    print("\n" + "="*80)
    print("🚀 PHASE 4: COMPLETE OPTIMIZATION & DEPLOYMENT")
    print("="*80)

    results = {
        'timestamp': datetime.now().isoformat(),
        'optimizations': {}
    }

    # 1. Batch Size Optimization
    print("\n" + "="*80)
    print("STEP 1: Batch Size Optimization")
    print("="*80)
    optimizer = BatchProcessingOptimizer(model, device)
    batch_results, optimal_batch = optimizer.find_optimal_batch_size()
    results['optimizations']['batch_optimization'] = {
        'results': batch_results,
        'optimal_batch_size': optimal_batch
    }

    # 2. TorchScript Compilation
    print("\n" + "="*80)
    print("STEP 2: TorchScript Compilation")
    print("="*80)
    compiler = TorchScriptCompiler(model, device)
    try:
        traced_model = compiler.compile_trace()
        torchscript_results = compiler.benchmark_torchscript(
            model, traced_model, num_iterations=50
        )
        results['optimizations']['torchscript'] = torchscript_results
    except Exception as e:
        print(f"⚠️ TorchScript compilation failed: {e}")
        results['optimizations']['torchscript'] = {'error': str(e)}

    # 3. ONNX Export
    print("\n" + "="*80)
    print("STEP 3: ONNX Export")
    print("="*80)
    exporter = ONNXExporter(model, device)
    onnx_path = None
    try:
        onnx_path = exporter.export_to_onnx()
    except Exception as e:
        print(f"⚠️ ONNX export failed: {e}")
        results['optimizations']['onnx'] = {'error': str(e)}

    # Benchmark separately so a runtime failure is not reported as a failed
    # export when the .onnx file was in fact written.
    if onnx_path is not None:
        try:
            if torch.cuda.is_available():
                # NOTE(review): releases PyTorch's cached CUDA blocks before
                # ONNX Runtime allocates on the same GPU; presumably this
                # avoids the ORT allocation failure seen earlier — confirm.
                torch.cuda.empty_cache()
            onnx_results = exporter.benchmark_onnx(onnx_path, num_iterations=50)
            results['optimizations']['onnx'] = onnx_results
        except Exception as e:
            print(f"⚠️ ONNX benchmark failed: {e}")
            results['optimizations']['onnx'] = {
                'onnx_path': onnx_path,
                'error': str(e)
            }

    # 4. Create Deployment Package
    print("\n" + "="*80)
    print("STEP 4: Creating Deployment Package")
    print("="*80)
    packager = DeploymentPackager(model, device)
    package_info = packager.create_deployment_package()
    results['deployment_package'] = package_info

    # 5. Save comprehensive report
    print("\n" + "="*80)
    print("STEP 5: Saving Performance Report")
    print("="*80)
    report_path = "outputs/metrics/phase4_optimization_report.json"
    # Nothing else in this function creates outputs/metrics.
    Path(report_path).parent.mkdir(parents=True, exist_ok=True)
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4, default=str)
    print(f"✓ Report saved: {report_path}")

    # Final Summary
    print("\n" + "="*80)
    print("✅ PHASE 4 COMPLETE!")
    print("="*80)

    print("\n📊 Optimization Summary:")
    print(f"   Optimal Batch Size: {optimal_batch}")
    torchscript_summary = results['optimizations'].get('torchscript', {})
    if 'speedup' in torchscript_summary:
        print(f"   TorchScript Speedup: {torchscript_summary['speedup']:.2f}x")

    # List only the formats that were actually written.
    display_names = {'pytorch': 'PyTorch', 'onnx': 'ONNX', 'torchscript': 'TorchScript'}
    written_formats = [
        display_names.get(key, key) for key in package_info.get('formats', {})
    ]

    print("\n📁 Deployment Package:")
    print("   Location: outputs/deployment/")
    print(f"   Formats: {', '.join(written_formats)}")
    print("   Documentation: README.md included")

    print("\n💡 Next Steps:")
    print("   1. Test deployment package on target hardware")
    print("   2. Integrate into production application")
    print("   3. Set up monitoring and logging")
    print("   4. Deploy to cloud/edge devices")

    return results

# ============================================================================
# QUICK START
# ============================================================================

# Usage hint shown once the Phase 4 definitions have been loaded.
print(
    "\n✓ Phase 4 loaded!",
    "\nTo run complete optimization:",
    ">>> results = run_phase4_complete(model)",
    "\nFor individual optimizations:",
    ">>> optimizer = BatchProcessingOptimizer(model)",
    ">>> compiler = TorchScriptCompiler(model)",
    ">>> exporter = ONNXExporter(model)",
    sep="\n",
)

🚀 PHASE 4: MODEL OPTIMIZATION & DEPLOYMENT

✓ Phase 4 loaded!

To run complete optimization:
>>> results = run_phase4_complete(model)

For individual optimizations:
>>> optimizer = BatchProcessingOptimizer(model)
>>> compiler = TorchScriptCompiler(model)
>>> exporter = ONNXExporter(model)


In [None]:
# NOTE(review): `model` is not defined in any visible cell of this notebook; it
# must already exist in the kernel (presumably from an earlier phase) — confirm
# it is created before this cell so Restart & Run All works.
results = run_phase4_complete(model)



🚀 PHASE 4: COMPLETE OPTIMIZATION & DEPLOYMENT

STEP 1: Batch Size Optimization

🔍 Finding Optimal Batch Size...
   Batch 1: 11.5 img/s, 86.7ms/img, 0.79GB
   Batch 2: 10.8 img/s, 92.5ms/img, 0.72GB
   Batch 4: 11.5 img/s, 87.3ms/img, 1.25GB
   Batch 8: 11.9 img/s, 83.9ms/img, 5.27GB
   Batch 16: 13.3 img/s, 75.4ms/img, 7.41GB
   Batch 32: 13.0 img/s, 76.9ms/img, 9.14GB

✓ Optimal Batch Size: 16
   Throughput: 13.3 img/s

STEP 2: TorchScript Compilation

🔧 Compiling with TorchScript (Trace)...


  * torch.tensor(scale_factors[i], dtype=torch.float32)
  boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
  boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
  assert condition, message
  torch.tensor(s, dtype=torch.float32, device=boxes.device)
  / torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)


⚠️ TorchScript compilation failed: Only tensors, lists, tuples of tensors, or dictionary of tensors can be output from traced functions

STEP 3: ONNX Export

📦 Exporting to ONNX...


  torch.onnx.export(


✓ ONNX export successful!
   File: outputs/models/model.onnx
   Size: 167.49MB
   Opset: 11

📊 Benchmarking ONNX Runtime...
⚠️ ONNX export failed: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running ScatterElements node. Name:'/roi_heads/box_roi_pool/ScatterElements' Status Message: /onnxruntime_src/onnxruntime/core/framework/bfc_arena.cc:359 void* onnxruntime::BFCArena::AllocateRawInternal(size_t, bool, onnxruntime::Stream*) Failed to allocate memory for requested buffer of size 50176000


STEP 4: Creating Deployment Package

📦 Creating Deployment Package...

1️⃣ Saving PyTorch model...
   ✓ Saved: outputs/deployment/model.pth

2️⃣ Exporting to ONNX...

📦 Exporting to ONNX...
✓ ONNX export successful!
   File: outputs/deployment/model.onnx
   Size: 167.49MB
   Opset: 11

3️⃣ Compiling to TorchScript...

🔧 Compiling with TorchScript (Trace)...
   ⚠️ TorchScript compilation failed: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTenso

In [None]:
# Standalone use: the constructor moves `model` to the default 'cuda' device and
# switches it to eval mode. `model` must already exist in the kernel.
optimizer = BatchProcessingOptimizer(model)

In [None]:
# Standalone use: the constructor only stores `model` and the default 'cuda'
# device. `model` must already exist in the kernel.
compiler = TorchScriptCompiler(model)

In [None]:
# Standalone use: the constructor only stores `model` and the default 'cuda'
# device. `model` must already exist in the kernel.
exporter = ONNXExporter(model)