In [None]:
!pip install mlflow thop torchvision
# Core imports and setup
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import transforms, datasets
from torchvision.models import MobileNet_V2_Weights
from torch.utils.data import DataLoader, Subset

import mlflow
import mlflow.pytorch
from mlflow.models import infer_signature

import numpy as np
import psutil
import platform
import subprocess
from datetime import datetime
from thop import profile

import logging
import sys
import matplotlib.pyplot as plt

print(f"PyTorch version: {torch.__version__}")
print(f"Torchvision version: {torchvision.__version__}")
print(f"MLflow version: {mlflow.__version__}")

In [28]:
# Setup logging
format_str = '%(asctime)s - %(levelname)s - %(filename)s - PID:%(process)d - TID:%(thread)d - %(message)s'
logger = logging.getLogger(__name__ + str(time.time()))
logger.setLevel(logging.DEBUG)
logger.propagate = False
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter(format_str))
logger.addHandler(handler)

logger.debug("Logger initialization completed")

2025-07-17 01:10:59,900 - DEBUG - 1078719332.py - PID:21996 - TID:8462606080 - Logger initialization completed


In [44]:
class ComprehensiveTrainingMonitor:
    """Advanced training monitor with complete metrics tracking for MLflow"""

    def __init__(self, model, optimizer, criterion, device, model_name, dataset_name,
                 batch_size=32, epochs=10, input_size=(3, 32, 32),
                 mlflow_uri="https://mlflow-server-631028107267.us-central1.run.app",
                 use_mlflow=True, **kwargs):
        # Core components
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device
        self.model_name = model_name
        self.dataset_name = dataset_name

        # Training parameters
        self.batch_size = batch_size
        self.epochs = epochs
        self.input_size = input_size
        self.mlflow_uri = mlflow_uri
        self.use_mlflow = use_mlflow

        # Auto-extracted parameters
        self.lr = optimizer.param_groups[0]['lr']
        self.optimizer_name = optimizer.__class__.__name__
        self.loss_name = criterion.__class__.__name__
        self.weight_decay = optimizer.param_groups[0].get('weight_decay', 0)
        self.momentum = optimizer.param_groups[0].get('momentum', 0)

        # Training state
        self.start_time = time.time()
        self.epoch_times = []
        self.best_metric = -1
        self.prev_loss = None
        self.batch_times = []
        self.run_started = False  # Track run state

        # Optional parameters
        self.train_size = kwargs.get('train_size', 0)
        self.val_size = kwargs.get('val_size', 0)
        self.num_workers = kwargs.get('num_workers', 0)
        self.use_pretrained = kwargs.get('use_pretrained', False)
        self.random_seed = kwargs.get('random_seed', 42)

        self.run_id = None  # Mlflow current run id

    def setup_mlflow(self):
        """Setup MLflow with comprehensive parameter logging"""
        if not self.use_mlflow or self.run_started:
            return

        try:
            # Ensure any existing run is ended
            if mlflow.active_run():
                mlflow.end_run()

            # Configure MLflow client
            mlflow.set_tracking_uri(self.mlflow_uri)
            mlflow.set_experiment(f"{self.model_name}-{self.dataset_name}")

            # Start single run
            run_name = datetime.now().strftime("%Y%m%d_%H%M%S")
            mlflow.start_run(run_name=run_name)
            self.run_id = run_name
            self.run_started = True

            # Log comprehensive parameters
            all_params = {
                **self._get_core_params(),
                **self._get_model_params(),
                **self._get_system_params(),
                **self._get_environment_params(),
                **self._get_data_params(),
                **self._get_training_params(),
            }

            mlflow.log_params(all_params)
            logger.info(f"MLflow tracking initialized: {self.mlflow_uri}")

        except Exception as e:
            logger.error(f"MLflow setup failed: {e}")
            self.use_mlflow = False

    def _get_core_params(self):
        """Core training parameters"""
        return {
            "model_name": self.model_name,
            "dataset_name": self.dataset_name,
            "batch_size": self.batch_size,
            "learning_rate": self.lr,
            "epochs": self.epochs,
            "optimizer": self.optimizer_name,
            "loss_function": self.loss_name,
            "device": str(self.device),
            "random_seed": self.random_seed,
            "weight_decay": self.weight_decay,
            "momentum": self.momentum,
        }

    def _get_model_params(self):
        """Model architecture parameters"""
        total_params = sum(p.numel() for p in self.model.parameters())
        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)

        # Model size calculation
        param_size = sum(p.numel() * p.element_size() for p in self.model.parameters())
        buffer_size = sum(b.numel() * b.element_size() for b in self.model.buffers())
        model_size_mb = (param_size + buffer_size) / (1024 ** 2)

        # FLOPs calculation
        flops_m = 0
        try:
            sample_input = torch.randn(1, *self.input_size)
            flops, _ = profile(self.model, inputs=(sample_input,), verbose=False)
            flops_m = round(flops / 1e6, 2)
        except Exception as e:
            logger.warning(f"FLOPs calculation failed: {e}")

        return {
            "total_parameters": total_params,
            "trainable_parameters": trainable_params,
            "parameters_millions": round(total_params / 1e6, 2),
            "model_size_mb": round(model_size_mb, 2),
            "flops_millions": flops_m,
            "input_size": str(self.input_size),
            "use_pretrained": self.use_pretrained,
            "model_architecture": self.model.__class__.__name__,
        }

    def _get_system_params(self):
        """System and hardware parameters"""
        gpu_info = {}
        if torch.cuda.is_available():
            props = torch.cuda.get_device_properties(0)
            gpu_info = {
                "gpu_name": torch.cuda.get_device_name(0),
                "gpu_memory_gb": round(props.total_memory / (1024**3), 2),
                "cuda_version": torch.version.cuda,
                "num_gpus": torch.cuda.device_count(),
            }
        elif torch.backends.mps.is_available():
            gpu_info = {
                "gpu_name": "Apple Silicon MPS",
                "device_type": "mps"
            }

        return {
            "cpu_count": psutil.cpu_count(),
            "memory_total_gb": round(psutil.virtual_memory().total / (1024**3), 2),
            "platform": platform.platform(),
            "python_version": platform.python_version(),
            "pytorch_version": torch.__version__,
            **gpu_info
        }

    def _get_environment_params(self):
        """Environment and reproducibility parameters"""
        git_info = {}
        try:
            commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
            branch = subprocess.check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).decode().strip()
            git_info = {
                "git_commit": commit[:8],
                "git_branch": branch,
            }
        except:
            git_info = {"git_commit": "unknown", "git_branch": "unknown"}

        return git_info

    def _get_data_params(self):
        """Data pipeline parameters"""
        return {
            "train_size": self.train_size,
            "val_size": self.val_size,
            "total_samples": self.train_size + self.val_size,
            "num_workers": self.num_workers,
        }

    def _get_training_params(self):
        """Advanced training configuration"""
        return {
            "mlflow_uri": self.mlflow_uri,
            "experiment_name": f"{self.model_name}-{self.dataset_name}",
        }

    def log_epoch_metrics(self, epoch, epoch_loss, epoch_acc, batch_count=None):
        """Comprehensive epoch metrics logging"""
        if not self.use_mlflow or not self.run_started:
            return {}

        try:
            current_time = time.time()

            # Timing metrics
            epoch_time = current_time - (self.start_time if epoch == 0 else self.start_time + sum(self.epoch_times))
            self.epoch_times.append(epoch_time)

            # Core metrics
            metrics = {
                "epoch": epoch,
                "train_loss": epoch_loss,
                "train_accuracy": epoch_acc,
                "learning_rate": self.optimizer.param_groups[0]["lr"],
                "epoch_time_seconds": epoch_time,
                "total_time_seconds": current_time - self.start_time,
                "avg_epoch_time": sum(self.epoch_times) / len(self.epoch_times),
            }

            # Training dynamics
            if self.prev_loss is not None:
                loss_improvement = self.prev_loss - epoch_loss
                metrics.update({
                    "loss_improvement": loss_improvement,
                    "loss_improvement_percent": (loss_improvement / self.prev_loss) * 100 if self.prev_loss != 0 else 0,
                })
            self.prev_loss = epoch_loss

            # Performance metrics
            if batch_count and epoch_time > 0:
                samples_per_sec = (self.batch_size * batch_count) / epoch_time
                metrics.update({
                    "batches_per_second": batch_count / epoch_time,
                    "samples_per_second": samples_per_sec,
                })

            # GPU/MPS metrics
            if torch.cuda.is_available():
                gpu_metrics = self._get_gpu_metrics()
                metrics.update(gpu_metrics)

            # System metrics
            system_metrics = self._get_system_metrics()
            metrics.update(system_metrics)

            mlflow.log_metrics(metrics, step=epoch)
            return metrics

        except Exception as e:
            logger.warning(f"Failed to log epoch metrics: {e}")
            return {}

    def _get_gpu_metrics(self):
        """GPU metrics for CUDA"""
        if not torch.cuda.is_available():
            return {}

        allocated = torch.cuda.memory_allocated()
        reserved = torch.cuda.memory_reserved()
        total = torch.cuda.get_device_properties(0).total_memory

        return {
            "gpu_memory_allocated_mb": allocated / (1024**2),
            "gpu_memory_reserved_mb": reserved / (1024**2),
            "gpu_memory_allocated_percent": (allocated / total) * 100,
            "gpu_memory_reserved_percent": (reserved / total) * 100,
        }

    def _get_system_metrics(self):
        """Real-time system resource utilization"""
        cpu_percent = psutil.cpu_percent(interval=None)
        memory = psutil.virtual_memory()

        return {
            "cpu_percent": cpu_percent,
            "memory_used_percent": memory.percent,
            "memory_available_gb": memory.available / (1024**3),
        }

    def should_log_model(self, current_metric, metric_name="accuracy"):
        """Enhanced model checkpointing with improvement tracking"""
        if current_metric > self.best_metric:
            improvement = current_metric - self.best_metric
            self.best_metric = current_metric

            if self.use_mlflow and self.run_started:
                try:
                    mlflow.log_metrics({
                        f"best_{metric_name}": current_metric,
                        f"{metric_name}_improvement": improvement,
                        "checkpoint_epoch": len(self.epoch_times),
                    }, step=len(self.epoch_times))
                except Exception as e:
                    logger.warning(f"Failed to log best model metrics: {e}")

            return True
        return False

    def log_model_artifact(self, signature=None):
        """Log model with comprehensive metadata (metrics only, no artifacts)"""
        if not self.use_mlflow or not self.run_started:
            return

        try:
            # Log model metadata as parameters instead of artifacts to avoid GCS issues
            model_metadata = {
                "best_model_architecture": self.model.__class__.__name__,
                "best_model_accuracy": self.best_metric,
                "best_model_training_time_hours": (time.time() - self.start_time) / 3600,
                "best_model_epochs_trained": len(self.epoch_times),
                "best_model_parameters_count": sum(p.numel() for p in self.model.parameters()),
            }

            mlflow.log_params(model_metadata)
            logger.info(f"Model metadata logged for accuracy: {self.best_metric:.4f}")

        except Exception as e:
            logger.warning(f"Failed to log model metadata: {e}")

    def end_run(self, status="FINISHED"):
        """Clean up and end MLflow run"""
        if self.use_mlflow and self.run_started:
            try:
                total_time = time.time() - self.start_time
                summary = {
                    "final_total_training_time_minutes": round(total_time / 60, 2),
                    "final_best_accuracy": self.best_metric,
                    "final_epochs_completed": len(self.epoch_times),
                }
                mlflow.log_params(summary)
                mlflow.end_run(status=status)
                self.run_started = False
                return summary
            except Exception as e:
                logger.warning(f"Failed to end MLflow run properly: {e}")
                return {}
        return {}

In [155]:
import requests
import rerun as rr


class RerunLogger:
    def __init__(self, run_id: str, model_name:str, backend_url: str = "http://localhost:8000"):
        self.run_id = run_id
        self.model_name = model_name
        self.backend_url = backend_url
        self.gcs_bucket = "neuralripper-mlflow-artifacts"
        self.viewer_url = None
        self.internal_url = None
        self.initialized = False
    
    def get_viewer_urls(self):
        """Get viewer URLs from backend"""
        try:
            res = requests.get(f"{self.backend_url}/rerun/{self.run_id}/live")
            if res.status_code == 200:
                data = res.json()
                self.viewer_url = data["viewer_url"]
                self.internal_url = data["internal_url"]
                return True
            return False
        except Exception as e:
            print(f"Failed to get viewer URLs: {e}")
            return False
    
    def initialize(self):
        """Init rerun connection"""
        if self.get_viewer_urls():
            logger.debug(f"internal_url: {self.internal_url}")
            logger.debug(f"viewer_url: {self.viewer_url}")

            # Extract gRPC port from internal url, use for uploading recordings
            grpc_port = self.internal_url.split(':')[-1]

            # Init recording
            rr.init(f"{self.model_name}_{self.run_id}", spawn=False)

            # Connect to backend gRPC server(send the recordings data to backend so it can serve in viewer url)
            rr.connect_grpc(f"127.0.0.1:{grpc_port}")
            # rr.connect_grpc(grpc_port)
            # rr.connect_grpc()

            self.initialized = True
            print(f"Rerun live viewer: {self.viewer_url}")
            print(f"Connected to gRPC: 127.0.0.1:{grpc_port}")
            return True
        return False
    
    def log_training_step(self, epoch: int, step: int, loss: float, accuracy: float, 
                          lr: float, gpu_memory: dict = None, system_metrics: dict = None):
        """Log training metrics to rerun"""
        if not self.initialized:
            return
            
        rr.log("training/loss", rr.Scalars(loss))
        rr.log("training/accuracy", rr.Scalars(accuracy))
        rr.log("training/learning_rate", rr.Scalars(lr))
        rr.log("progress/epoch", rr.Scalars(epoch))
        rr.log("progress/step", rr.Scalars(step))
        
        if gpu_memory:
            rr.log("system/gpu_memory_mb", rr.Scalars(gpu_memory.get('gpu_memory_allocated_mb', 0)))
        if system_metrics:
            rr.log("system/cpu_percent", rr.Scalars(system_metrics.get('cpu_percent', 0)))
            rr.log("system/memory_percent", rr.Scalars(system_metrics.get('memory_used_percent', 0)))

    def save_recording(self):
        """Save recording directly to GCS"""
        if not self.initialized:
            return None
            
        try:
            gcs_path = f"gs://{self.gcs_bucket}/rerun_recordings/{self.model_name}_{self.run_id}.rrd"
            rr.save(gcs_path)
            print(f"Recording saved: {gcs_path}")
            return gcs_path
        except Exception as e:
            print(f"Failed to save recording: {e}")
            return None




In [156]:
class EnhancedMobileNetV2:
    """MobileNetV2 with comprehensive monitoring integration"""

    def __init__(self, num_epochs=10, batch_size=128, num_classes=100,
                 learning_rate=5e-3, use_mlflow=True):
        # Core parameters
        self._num_classes = num_classes
        self._use_mlflow = use_mlflow
        self._batch_size = batch_size
        self._num_epochs = num_epochs
        self._learning_rate = learning_rate

        # Create model components
        self._model = self._create_model()
        self._device = self._set_device()
        self._model.to(self._device)
        self._criterion = self._set_criterion()
        self._optimizer = self._set_optimizer()

        # Initialize comprehensive monitor
        self.monitor = ComprehensiveTrainingMonitor(
            model=self._model,
            optimizer=self._optimizer,
            criterion=self._criterion,
            device=self._device,
            model_name='MobileNetV2',
            dataset_name='CIFAR-100',
            batch_size=batch_size,
            epochs=num_epochs,
            input_size=(3, 32, 32),
            use_mlflow=use_mlflow,
            use_pretrained=True,
            train_size=50000,
            val_size=10000,
            num_workers=0,
        )

        logger.info(f"Model initialized on device: {self._device}")
        logger.info(f"Model parameters: {sum(p.numel() for p in self._model.parameters()):,}")

        self.rerun_logger = None    # rerun logger will be init when training starts

    def _create_model(self):
        """Create MobileNetV2 model with CIFAR-100 adaptation"""
        model = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT)
        # Adapt classifier for CIFAR-100 (100 classes)
        model.classifier[1] = nn.Linear(1280, self._num_classes)
        return model

    def _set_device(self):
        """Set appropriate device for training"""
        if torch.backends.mps.is_available():
            return torch.device("mps")
        elif torch.cuda.is_available():
            return torch.device("cuda")
        return torch.device("cpu")

    def _set_optimizer(self):
        """Configure SGD optimizer with momentum and weight decay"""
        return optim.SGD(self._model.parameters(),
                        lr=self._learning_rate,
                        momentum=0.9,
                        weight_decay=4e-5)

    def _set_criterion(self):
        """Set loss function for classification"""
        return nn.CrossEntropyLoss()

    def train_epoch(self, data_loader, epoch_idx):
        """Enhanced training epoch with comprehensive monitoring"""
        logger.info(f"Starting epoch {epoch_idx+1}, total batches: {len(data_loader)}")

        self._model.train()
        epoch_total_loss = 0.0
        running_loss = 0.0
        running_correct = 0
        running_total = 0
        batch_count = len(data_loader)

        for idx, (images, targets) in enumerate(data_loader):
            images = images.to(self._device)
            targets = targets.to(self._device)

            self._optimizer.zero_grad()
            logits = self._model(images)
            loss = self._criterion(logits, targets)
            loss.backward()
            self._optimizer.step()

            predictions = logits.argmax(dim=1)
            running_correct += (predictions == targets).sum().item()
            running_total += targets.size(0)
            running_loss += loss.item()
            epoch_total_loss += loss.item()

            # Progress logging every 10 batches
            if idx % 10 == 9:
                avg_loss = running_loss / 10
                curr_acc = running_correct / running_total
                logger.info(f"Epoch {epoch_idx + 1} | Batch {idx + 1} | "
                          f"Avg Loss: {avg_loss:.4f} | Accuracy: {curr_acc:.4f}")
                running_loss = 0.0

                # Rerun loggings
                if self.rerun_logger:
                    gpu_metrics = self.monitor._get_gpu_metrics()
                    sys_metrics = self.monitor._get_system_metrics()

                    self.rerun_logger.log_training_step(
                        epoch=epoch_idx,
                        step=(epoch_idx * len(data_loader)) + idx,
                        loss=avg_loss,
                        accuracy=curr_acc,
                        lr=self._optimizer.param_groups[0]['lr'],
                        gpu_memory=gpu_metrics,
                        system_metrics=sys_metrics
                    )


        epoch_loss = epoch_total_loss / batch_count
        epoch_acc = running_correct / running_total

        logger.info(f"Epoch {epoch_idx + 1} completed - Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}")
        return epoch_loss, epoch_acc, batch_count

    def train(self, train_loader):
        """Enhanced training with comprehensive monitoring"""
        try:
            logger.info(f"Starting training for {self._num_epochs} epochs on {self._device}")

            # Setup MLflow once at the beginning
            self.monitor.setup_mlflow()

            # Init RerunLogger
            if self.monitor.run_id:
                self.rerun_logger = RerunLogger(self.monitor.run_id, "MobileNetV2")
                self.rerun_logger.initialize()

            for epoch in range(self._num_epochs):
                epoch_loss, epoch_acc, batch_count = self.train_epoch(train_loader, epoch)

                # Log comprehensive epoch metrics
                metrics = self.monitor.log_epoch_metrics(epoch, epoch_loss, epoch_acc, batch_count)

                # Model checkpointing for best performance (metadata only)
                if self.monitor.should_log_model(epoch_acc):
                    self.monitor.log_model_artifact()
                    logger.info(f"New best model metadata saved with accuracy: {epoch_acc:.4f}")

                # Enhanced progress display
                print(f"Epoch {epoch+1}/{self._num_epochs}: "
                      f"Loss: {epoch_loss:.4f} | Acc: {epoch_acc:.4f} | "
                      f"Time: {metrics.get('epoch_time_seconds', 0):.1f}s | "
                      f"LR: {metrics.get('learning_rate', 0):.2e} | "
                      f"Memory: {metrics.get('memory_used_percent', 0):.1f}%")

        except Exception as e:
            logger.error(f"Training failed: {e}")
            self.monitor.end_run(status="FAILED")
            raise
        finally:
            # Save rerun recording after training
            if self.rerun_logger:
                self.rerun_logger.save_recording()

            summary = self.monitor.end_run()
            logger.info(f"Training completed. Summary: {summary}")
            print(f"\nTraining Summary:")
            print(f"Total time: {summary.get('final_total_training_time_minutes', 0):.1f} minutes")
            print(f"Best accuracy: {summary.get('final_best_accuracy', 0):.4f}")
            print(f"Epochs completed: {summary.get('final_epochs_completed', 0)}")

In [157]:
# Data transforms for CIFAR-100
train_tf = transforms.Compose([
    transforms.RandomCrop(32, padding=4),  # Data augmentation
    transforms.RandomHorizontalFlip(0.5),   # Random horizontal flip
    transforms.ToTensor(),                  # Convert to tensor
    transforms.Normalize(                   # Normalize with ImageNet stats
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

# Load CIFAR-100 dataset
train_ds = datasets.CIFAR100(
    root="./data",
    train=True,
    download=True,
    transform=train_tf
)

from torch.utils.data import Subset

# only use first 10000 images for local training
train_subset = Subset(train_ds, range(10000))

print(f"Training dataset size: {len(train_ds)}")
print(f"Number of classes: {len(train_ds.classes)}")
print(f"Class names (first 10): {train_ds.classes[:10]}")
print(f"Subset size: {len(train_subset)}")

Training dataset size: 50000
Number of classes: 100
Class names (first 10): ['apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle', 'bicycle', 'bottle']
Subset size: 10000


In [158]:
# Optional: Visualize a sample of the data
def visualize_samples(dataset, num_samples=16):
    """Visualize a sample of images from the dataset"""
    # Create subset for visualization
    sample_indices = np.random.choice(len(dataset), num_samples, replace=False)

    fig, axes = plt.subplots(4, 4, figsize=(12, 12))
    axes = axes.ravel()

    for i, idx in enumerate(sample_indices):
        img_tensor, class_idx = dataset[idx]

        # Denormalize image for display
        img = img_tensor.permute(1, 2, 0)
        img = img * torch.tensor([0.229, 0.224, 0.225]) + torch.tensor([0.485, 0.456, 0.406])
        img = torch.clamp(img, 0, 1)

        axes[i].imshow(img)
        axes[i].set_title(f"Class: {dataset.classes[class_idx]}")
        axes[i].axis('off')

    plt.tight_layout()
    plt.show()

# Uncomment to visualize samples
# visualize_samples(train_ds)

In [159]:
# Create data loader
train_loader = DataLoader(
    # train_subset,       # subset for local test, change to original ds for production
    train_ds,
    batch_size=128,
    shuffle=True,
    num_workers=0,      # Set to 0 for MPS compatibility
    pin_memory=False    # Disable for MPS
)

print(f"Number of batches per epoch: {len(train_loader)}")
print(f"Total samples per epoch: {len(train_loader) * 128}")

Number of batches per epoch: 391
Total samples per epoch: 50048


In [160]:
# Initialize and train the model
logger.info("Initializing Enhanced MobileNetV2 model")

model = EnhancedMobileNetV2(
    num_epochs=5,
    batch_size=128,
    num_classes=100,
    learning_rate=3e-3,
    use_mlflow=True
)

print("Model initialized successfully!")
print(f"Device: {model._device}")
print(f"Total parameters: {sum(p.numel() for p in model._model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model._model.parameters() if p.requires_grad):,}")

2025-07-18 01:12:25,852 - INFO - 2346846570.py - PID:21996 - TID:8462606080 - Initializing Enhanced MobileNetV2 model
2025-07-18 01:12:26,050 - INFO - 3724282253.py - PID:21996 - TID:8462606080 - Model initialized on device: mps
2025-07-18 01:12:26,051 - INFO - 3724282253.py - PID:21996 - TID:8462606080 - Model parameters: 2,351,972
Model initialized successfully!
Device: mps
Total parameters: 2,351,972
Trainable parameters: 2,351,972


In [161]:
# Start training with comprehensive monitoring
print("\n" + "="*50)
print(" STARTING ENHANCED MOBILENETV2 TRAINING")
print("="*50)
print(f"Dataset: CIFAR-100 (100 classes)")
print(f"Model: MobileNetV2 (pretrained)")
print(f"Epochs: {model._num_epochs}")
print(f"Batch size: {model._batch_size}")
print(f"Learning rate: {model._learning_rate}")
print(f"MLflow tracking: {'Enabled' if model._use_mlflow else 'Disabled'}")
print(f"MLflow URI: {model.monitor.mlflow_uri}")
print("="*50 + "\n")

# Start training
model.train(train_loader)


 STARTING ENHANCED MOBILENETV2 TRAINING
Dataset: CIFAR-100 (100 classes)
Model: MobileNetV2 (pretrained)
Epochs: 5
Batch size: 128
Learning rate: 0.003
MLflow tracking: Enabled
MLflow URI: https://mlflow-server-631028107267.us-central1.run.app

2025-07-18 01:12:26,114 - INFO - 3724282253.py - PID:21996 - TID:8462606080 - Starting training for 5 epochs on mps
2025-07-18 01:12:28,727 - INFO - 821454703.py - PID:21996 - TID:8462606080 - MLflow tracking initialized: https://mlflow-server-631028107267.us-central1.run.app
2025-07-18 01:12:28,746 - DEBUG - 638891431.py - PID:21996 - TID:8462606080 - internal_url: localhost:64237
2025-07-18 01:12:28,747 - DEBUG - 638891431.py - PID:21996 - TID:8462606080 - viewer_url: rerun://localhost:64237
2025-07-18 01:12:28,748 - ERROR - 3724282253.py - PID:21996 - TID:8462606080 - Training failed: invalid endpoint "127.0.0.1:64237": Unexpected URI:: 127.0.0.1:64237
🏃 View run 20250718_011226 at: https://mlflow-server-631028107267.us-central1.run.app/#/ex

RuntimeError: invalid endpoint "127.0.0.1:64237": Unexpected URI:: 127.0.0.1:64237

[90m[[0m2025-07-19T06:52:27Z [1m[31mERROR[0m re_grpc_client::message_proxy::write[90m][0m Write messages call failed: status: Unknown, message: "transport error", details: [], metadata: MetadataMap { headers: {} }
