In [1]:
!pip install mlflow
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights
from torch.utils.data import DataLoader
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import logging
import time
from datetime import datetime
import subprocess
import platform
import psutil
import mlflow
import mlflow.pytorch
from sklearn.metrics import classification_report, confusion_matrix

Collecting mlflow
  Downloading mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.3-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading databricks_sdk-0.57.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading opentele

In [2]:
# Configure logging
format = '%(asctime)s - %(levelname)s - %(filename)s - PID:%(process)d - TID:%(thread)d - %(message)s'
logger = logging.getLogger(__name__ + str(time.time()))
logger.setLevel(logging.INFO)
logger.propagate = False

# Add handler if none exists
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format))
    logger.addHandler(handler)

In [3]:
# Data transforms for CIFAR-100 - EfficientNet requires 224x224 images
train_tf = transforms.Compose([
    transforms.Resize(224),             # Resize to 224x224 for EfficientNet
    transforms.RandomHorizontalFlip(),  # Data augmentation
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),              # Convert to tensor
    transforms.Normalize(               # Normalize with ImageNet stats
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

# Load CIFAR-100 dataset
train_ds = datasets.CIFAR100(
    root="./data",
    train=True,
    download=True,
    transform=train_tf
)


from torch.utils.data import Subset
# only use first 10000 images to save time
train_ds_sub = Subset(train_ds, range(10000))



print(f"Training dataset size: {len(train_ds)}")
print(f"Number of classes: {len(train_ds.classes)}")
print(f"Class names (first 10): {train_ds.classes[:10]}")

100%|██████████| 169M/169M [00:58<00:00, 2.87MB/s]


Training dataset size: 50000
Number of classes: 100
Class names (first 10): ['apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle', 'bicycle', 'bottle']


In [4]:
# Optional: Visualize a sample of the data
def visualize_samples(dataset, num_samples=16):
    """Visualize a sample of images from the dataset"""
    # Create subset for visualization
    sample_indices = np.random.choice(len(dataset), num_samples, replace=False)

    fig, axes = plt.subplots(4, 4, figsize=(12, 12))
    axes = axes.ravel()

    for i, idx in enumerate(sample_indices):
        img_tensor, class_idx = dataset[idx]

        # Denormalize image for display
        img = img_tensor.permute(1, 2, 0)
        img = img * torch.tensor([0.229, 0.224, 0.225]) + torch.tensor([0.485, 0.456, 0.406])
        img = torch.clamp(img, 0, 1)

        axes[i].imshow(img)
        axes[i].set_title(f"Class: {dataset.classes[class_idx]}")
        axes[i].axis('off')

    plt.tight_layout()
    plt.show()

# Uncomment to visualize samples
# visualize_samples(train_ds)

In [5]:
# Create data loader
train_loader = DataLoader(
    train_ds_sub,       # use subset to save time
    batch_size=128,
    shuffle=True,
    num_workers=0,      # Set to 0 for MPS compatibility
    pin_memory=False    # Disable for MPS
)

print(f"Number of batches per epoch: {len(train_loader)}")
print(f"Total samples per epoch: {len(train_loader) * 128}")

Number of batches per epoch: 79
Total samples per epoch: 10112


In [6]:
class ComprehensiveTrainingMonitor:
    """Enhanced training monitor with MLflow integration and comprehensive metrics"""

    def __init__(self, model, optimizer, criterion, device, model_name='EfficientNet-B0',
                 dataset_name='CIFAR-100', batch_size=128, epochs=100,
                 input_size=(3, 224, 224), use_mlflow=True,
                 learning_rate=5e-3, use_pretrained=True, train_size=50000,
                 val_size=10000, num_workers=0):

        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device
        self.model_name = model_name
        self.dataset_name = dataset_name
        self.batch_size = batch_size
        self.epochs = epochs
        self.input_size = input_size
        self.use_mlflow = use_mlflow
        self.learning_rate = learning_rate
        self.use_pretrained = use_pretrained
        self.train_size = train_size
        self.val_size = val_size
        self.num_workers = num_workers

        # Tracking variables
        self.best_metric = float('-inf')
        self.epoch_times = []
        self.start_time = time.time()
        self.run_started = False

        # MLflow configuration
        self.mlflow_uri = "https://mlflow-server-631028107267.us-central1.run.app/"
        self.gcs_bucket = "gs://neuralripper-mlflow-artifacts"

        if self.use_mlflow:
            self._initialize_mlflow()

    def _initialize_mlflow(self):
        """Initialize MLflow with comprehensive experiment tracking"""
        try:
            mlflow.set_tracking_uri(self.mlflow_uri)
            mlflow.set_experiment(f"{self.model_name}-{self.dataset_name}")

            # Start run with timestamp
            run_name = f"{self.model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
            mlflow.start_run(run_name=run_name)
            self.run_started = True

            # Log all parameters
            params = {
                **self._get_model_params(),
                **self._get_system_params(),
                **self._get_environment_params(),
                **self._get_data_params(),
                **self._get_training_params(),
            }

            mlflow.log_params(params)
            logger.info(f"MLflow run started: {run_name}")

        except Exception as e:
            logger.warning(f"Failed to initialize MLflow: {e}")
            self.use_mlflow = False

    def _get_model_params(self):
        """Model architecture and hyperparameters"""
        total_params = sum(p.numel() for p in self.model.parameters())
        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)

        return {
            "model_name": self.model_name,
            "total_parameters": total_params,
            "trainable_parameters": trainable_params,
            "model_type": "classification",
            "pretrained": self.use_pretrained,
            "batch_size": self.batch_size,
            "epochs": self.epochs,
            "learning_rate": self.learning_rate,
            "optimizer": self.optimizer.__class__.__name__,
            "criterion": self.criterion.__class__.__name__,
            "input_channels": self.input_size[0],
            "input_height": self.input_size[1],
            "input_width": self.input_size[2],
        }

    def _get_system_params(self):
        """System hardware and software parameters"""
        gpu_info = {}
        if torch.cuda.is_available():
            props = torch.cuda.get_device_properties(0)
            gpu_info = {
                "gpu_name": torch.cuda.get_device_name(0),
                "gpu_memory_gb": round(props.total_memory / (1024**3), 2),
                "cuda_version": torch.version.cuda,
                "num_gpus": torch.cuda.device_count(),
            }
        elif torch.backends.mps.is_available():
            gpu_info = {
                "gpu_name": "Apple Silicon MPS",
                "device_type": "mps"
            }

        return {
            "cpu_count": psutil.cpu_count(),
            "memory_total_gb": round(psutil.virtual_memory().total / (1024**3), 2),
            "platform": platform.platform(),
            "python_version": platform.python_version(),
            "pytorch_version": torch.__version__,
            **gpu_info
        }

    def _get_environment_params(self):
        """Environment and reproducibility parameters"""
        git_info = {}
        try:
            commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
            branch = subprocess.check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).decode().strip()
            git_info = {
                "git_commit": commit[:8],
                "git_branch": branch,
            }
        except:
            git_info = {"git_commit": "unknown", "git_branch": "unknown"}

        return git_info

    def _get_data_params(self):
        """Data pipeline parameters"""
        return {
            "train_size": self.train_size,
            "val_size": self.val_size,
            "total_samples": self.train_size + self.val_size,
            "num_workers": self.num_workers,
        }

    def _get_training_params(self):
        """Advanced training configuration"""
        return {
            "mlflow_uri": self.mlflow_uri,
            "experiment_name": f"{self.model_name}-{self.dataset_name}",
        }

    def log_epoch_metrics(self, epoch, epoch_loss, epoch_acc, batch_count=None):
        """Comprehensive epoch metrics logging"""
        if not self.use_mlflow or not self.run_started:
            return

        try:
            epoch_time = self.epoch_times[-1] if self.epoch_times else 0

            # Core metrics
            metrics = {
                "train_loss": epoch_loss,
                "train_accuracy": epoch_acc,
                "epoch_time_seconds": epoch_time,
                "learning_rate": self.optimizer.param_groups[0]['lr'],
            }

            # Performance metrics
            if batch_count:
                metrics["batches_per_second"] = batch_count / epoch_time if epoch_time > 0 else 0
                metrics["samples_per_second"] = (batch_count * self.batch_size) / epoch_time if epoch_time > 0 else 0

            # Memory metrics
            if torch.cuda.is_available():
                metrics["gpu_memory_allocated_gb"] = torch.cuda.memory_allocated() / (1024**3)
                metrics["gpu_memory_reserved_gb"] = torch.cuda.memory_reserved() / (1024**3)

            # Running statistics
            total_time = sum(self.epoch_times)
            metrics["total_time_minutes"] = total_time / 60
            metrics["average_epoch_time"] = np.mean(self.epoch_times) if self.epoch_times else 0

            mlflow.log_metrics(metrics, step=epoch)

            # Update best model if improved
            if epoch_acc > self.best_metric:
                self.best_metric = epoch_acc
                self._log_model_checkpoint(epoch)

        except Exception as e:
            logger.warning(f"Failed to log metrics for epoch {epoch}: {e}")

    def _log_model_checkpoint(self, epoch):
        """Log model checkpoint and metadata"""
        try:
            # Log the model
            mlflow.pytorch.log_model(
                self.model,
                artifact_path="model",
                registered_model_name=f"{self.model_name}-{self.dataset_name}",
                pip_requirements=["torch", "torchvision", "pillow", "numpy"]
            )

            # Log additional metadata
            model_metadata = {
                "best_epoch": epoch,
                "best_accuracy": self.best_metric,
                "checkpoint_time": datetime.now().isoformat(),
            }

            mlflow.log_params(model_metadata)
            logger.info(f"Model metadata logged for accuracy: {self.best_metric:.4f}")

        except Exception as e:
            logger.warning(f"Failed to log model metadata: {e}")

    def end_run(self, status="FINISHED"):
        """Clean up and end MLflow run"""
        if self.use_mlflow and self.run_started:
            try:
                total_time = time.time() - self.start_time
                summary = {
                    "final_total_training_time_minutes": round(total_time / 60, 2),
                    "final_best_accuracy": self.best_metric,
                    "final_epochs_completed": len(self.epoch_times),
                }
                mlflow.log_params(summary)
                mlflow.end_run(status=status)
                self.run_started = False
                return summary
            except Exception as e:
                logger.warning(f"Failed to end MLflow run properly: {e}")
                return {}
        return {}

In [7]:
class EnhancedEfficientNetB0:
    """EfficientNet-B0 with comprehensive monitoring integration"""

    def __init__(self, num_epochs=100, batch_size=128, num_classes=100,
                 learning_rate=5e-3, use_mlflow=True):
        # Core parameters
        self._num_classes = num_classes
        self._use_mlflow = use_mlflow
        self._batch_size = batch_size
        self._num_epochs = num_epochs
        self._learning_rate = learning_rate

        # Create model components
        self._model = self._create_model()
        self._device = self._set_device()
        self._model.to(self._device)
        self._criterion = self._set_criterion()
        self._optimizer = self._set_optimizer()

        # Initialize comprehensive monitor
        self.monitor = ComprehensiveTrainingMonitor(
            model=self._model,
            optimizer=self._optimizer,
            criterion=self._criterion,
            device=self._device,
            model_name='EfficientNet-B0',
            dataset_name='CIFAR-100',
            batch_size=batch_size,
            epochs=num_epochs,
            input_size=(3, 224, 224),  # EfficientNet uses 224x224
            learning_rate=learning_rate,
            use_mlflow=use_mlflow,
            use_pretrained=True,
            train_size=50000,
            val_size=10000,
            num_workers=0,
        )

        logger.info(f"Model initialized on device: {self._device}")
        logger.info(f"Model parameters: {sum(p.numel() for p in self._model.parameters()):,}")

    def _create_model(self):
        """Create EfficientNet-B0 model with CIFAR-100 adaptation"""
        model = efficientnet_b0(weights=EfficientNet_B0_Weights.DEFAULT)
        # Adapt classifier for CIFAR-100 (100 classes)
        num_features = model.classifier[1].in_features
        model.classifier[1] = nn.Linear(num_features, self._num_classes)
        return model

    def _set_device(self):
        """Set appropriate device for training"""
        if torch.backends.mps.is_available():
            return torch.device("mps")
        elif torch.cuda.is_available():
            return torch.device("cuda")
        return torch.device("cpu")

    def _set_optimizer(self):
        """Configure SGD optimizer with momentum and weight decay"""
        return optim.SGD(self._model.parameters(),
                        lr=self._learning_rate,
                        momentum=0.9,
                        weight_decay=4e-5)

    def _set_criterion(self):
        """Set loss function for classification"""
        return nn.CrossEntropyLoss()

    def train_epoch(self, data_loader, epoch_idx):
        """Enhanced training epoch with comprehensive monitoring"""
        logger.info(f"Starting epoch {epoch_idx+1}, total batches: {len(data_loader)}")

        self._model.train()
        epoch_total_loss = 0.0
        running_loss = 0.0
        running_correct = 0
        running_total = 0
        batch_count = len(data_loader)

        for idx, (images, targets) in enumerate(data_loader):
            images = images.to(self._device)
            targets = targets.to(self._device)

            self._optimizer.zero_grad()
            logits = self._model(images)
            loss = self._criterion(logits, targets)
            loss.backward()
            self._optimizer.step()

            predictions = logits.argmax(dim=1)
            running_correct += (predictions == targets).sum().item()
            running_total += targets.size(0)
            running_loss += loss.item()
            epoch_total_loss += loss.item()

            # Log progress every 10 batches
            if idx % 10 == 9:
                avg_loss = running_loss / 10
                acc_sofar = running_correct / running_total
                logger.info(f"Epoch {epoch_idx+1} | Batch {idx+1}/{batch_count} | "
                          f"Loss {avg_loss:.4f} | Acc {acc_sofar:.4f}")
                running_loss = 0.0

        # Calculate epoch metrics
        epoch_loss = epoch_total_loss / batch_count
        epoch_acc = running_correct / running_total

        logger.info(f"Epoch {epoch_idx+1} completed | Loss: {epoch_loss:.4f} | Acc: {epoch_acc:.4f}")
        return epoch_loss, epoch_acc

    def train(self, data_loader):
        """Full training loop with comprehensive monitoring"""
        logger.info("Starting training process")

        try:
            for epoch in range(self._num_epochs):
                epoch_start = time.time()

                # Train for one epoch
                epoch_loss, epoch_acc = self.train_epoch(data_loader, epoch)

                # Track timing
                epoch_time = time.time() - epoch_start
                self.monitor.epoch_times.append(epoch_time)

                # Log metrics
                self.monitor.log_epoch_metrics(epoch, epoch_loss, epoch_acc, len(data_loader))

                # Progress report
                eta = np.mean(self.monitor.epoch_times) * (self._num_epochs - epoch - 1)
                logger.info(f"Epoch {epoch+1}/{self._num_epochs} | "
                          f"Time: {epoch_time:.1f}s | ETA: {eta/60:.1f}min")

        except KeyboardInterrupt:
            logger.info("Training interrupted by user")
            summary = self.monitor.end_run(status="KILLED")
            return summary
        except Exception as e:
            logger.error(f"Training failed with error: {e}")
            summary = self.monitor.end_run(status="FAILED")
            raise e

        # Training completed
        summary = self.monitor.end_run(status="FINISHED")
        logger.info(f"Training completed. Summary: {summary}")

        if summary:
            print(f"\nTraining Summary:")
            print(f"Total time: {summary.get('final_total_training_time_minutes', 0):.1f} minutes")
            print(f"Best accuracy: {summary.get('final_best_accuracy', 0):.4f}")
            print(f"Epochs completed: {summary.get('final_epochs_completed', 0)}")

        return summary

In [8]:
# Initialize and train the model
logger.info("Initializing Enhanced EfficientNet-B0 model")

model = EnhancedEfficientNetB0(
    num_epochs=20,
    batch_size=128,
    num_classes=100,
    learning_rate=5e-3,
    use_mlflow=True
)

print("Model initialized successfully!")
print(f"Device: {model._device}")
print(f"Total parameters: {sum(p.numel() for p in model._model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model._model.parameters() if p.requires_grad):,}")

2025-07-09 03:56:33,904 - INFO - ipython-input-8-4061902713.py - PID:566 - TID:135839759893120 - Initializing Enhanced EfficientNet-B0 model
Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 101MB/s]
2025-07-09 03:56:36,944 - INFO - ipython-input-6-3490507291.py - PID:566 - TID:135839759893120 - MLflow run started: EfficientNet-B0_20250709_035635
2025-07-09 03:56:36,945 - INFO - ipython-input-7-1728625066.py - PID:566 - TID:135839759893120 - Model initialized on device: cuda
2025-07-09 03:56:36,947 - INFO - ipython-input-7-1728625066.py - PID:566 - TID:135839759893120 - Model parameters: 4,135,648


Model initialized successfully!
Device: cuda
Total parameters: 4,135,648
Trainable parameters: 4,135,648


In [9]:
# Start training with comprehensive monitoring
print("\n" + "="*50)
print(" STARTING ENHANCED EFFICIENTNET-B0 TRAINING")
print("="*50)
print(f"Dataset: CIFAR-100 (100 classes)")
print(f"Model: EfficientNet-B0 (pretrained)")
print(f"Epochs: {model._num_epochs}")
print(f"Batch size: {model._batch_size}")
print(f"Learning rate: {model._learning_rate}")
print(f"MLflow tracking: {'Enabled' if model._use_mlflow else 'Disabled'}")
print(f"MLflow URI: {model.monitor.mlflow_uri}")
print("="*50 + "\n")

# Start training
model.train(train_loader)

2025-07-09 03:56:36,957 - INFO - ipython-input-7-1728625066.py - PID:566 - TID:135839759893120 - Starting training process
2025-07-09 03:56:36,958 - INFO - ipython-input-7-1728625066.py - PID:566 - TID:135839759893120 - Starting epoch 1, total batches: 79



 STARTING ENHANCED EFFICIENTNET-B0 TRAINING
Dataset: CIFAR-100 (100 classes)
Model: EfficientNet-B0 (pretrained)
Epochs: 20
Batch size: 128
Learning rate: 0.005
MLflow tracking: Enabled
MLflow URI: https://mlflow-server-631028107267.us-central1.run.app/



2025-07-09 03:56:48,652 - INFO - ipython-input-7-1728625066.py - PID:566 - TID:135839759893120 - Epoch 1 | Batch 10/79 | Loss 4.6090 | Acc 0.0086
2025-07-09 03:56:58,260 - INFO - ipython-input-7-1728625066.py - PID:566 - TID:135839759893120 - Epoch 1 | Batch 20/79 | Loss 4.5920 | Acc 0.0109
2025-07-09 03:57:07,987 - INFO - ipython-input-7-1728625066.py - PID:566 - TID:135839759893120 - Epoch 1 | Batch 30/79 | Loss 4.5478 | Acc 0.0151
2025-07-09 03:57:17,705 - INFO - ipython-input-7-1728625066.py - PID:566 - TID:135839759893120 - Epoch 1 | Batch 40/79 | Loss 4.4830 | Acc 0.0223
2025-07-09 03:57:27,222 - INFO - ipython-input-7-1728625066.py - PID:566 - TID:135839759893120 - Epoch 1 | Batch 50/79 | Loss 4.4225 | Acc 0.0334
2025-07-09 03:57:36,816 - INFO - ipython-input-7-1728625066.py - PID:566 - TID:135839759893120 - Epoch 1 | Batch 60/79 | Loss 4.3618 | Acc 0.0449
2025-07-09 03:57:46,371 - INFO - ipython-input-7-1728625066.py - PID:566 - TID:135839759893120 - Epoch 1 | Batch 70/79 | Los

🏃 View run EfficientNet-B0_20250709_035635 at: https://mlflow-server-631028107267.us-central1.run.app/#/experiments/3/runs/1297c31f34b14950bb20e260969318b5
🧪 View experiment at: https://mlflow-server-631028107267.us-central1.run.app/#/experiments/3


2025-07-09 04:24:32,878 - INFO - ipython-input-7-1728625066.py - PID:566 - TID:135839759893120 - Training completed. Summary: {'final_total_training_time_minutes': 27.94, 'final_best_accuracy': 0.9524, 'final_epochs_completed': 20}



Training Summary:
Total time: 27.9 minutes
Best accuracy: 0.9524
Epochs completed: 20


{'final_total_training_time_minutes': 27.94,
 'final_best_accuracy': 0.9524,
 'final_epochs_completed': 20}