In [None]:
import pandas as pd
# Imported under an alias on purpose: a later cell binds the bare name
# `DataLoader` to torch.utils.data.DataLoader, and train_evaluate() looks that
# global up at call time. Re-running this cell must not rebind it to the
# project's metadata loader.
from meta_project.data.data_loader import DataLoader as MetaDataLoader

data_loader = MetaDataLoader()

# Load the merged metadata table and preview its first rows.
df = data_loader.load_and_merge_data()
df.head()


[32m2025-04-06 11:38:34.072[0m | [1mINFO    [0m | [36mmeta_project.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\Michal\Desktop\MAML\metaLearningCUB-200-2011[0m


   id                                         image_name  image_id  class_id  \
0   1  001.Black_footed_Albatross/Black_Footed_Albatr...         1         1   
1   2  001.Black_footed_Albatross/Black_Footed_Albatr...         2         1   
2   3  001.Black_footed_Albatross/Black_Footed_Albatr...         3         1   
3   4  001.Black_footed_Albatross/Black_Footed_Albatr...         4         1   
4   5  001.Black_footed_Albatross/Black_Footed_Albatr...         5         1   

                   class_name  is_training_image  
0  001.Black_footed_Albatross                  0  
1  001.Black_footed_Albatross                  1  
2  001.Black_footed_Albatross                  0  
3  001.Black_footed_Albatross                  1  
4  001.Black_footed_Albatross                  1  


Unnamed: 0,id,image_name,image_id,class_id,class_name,is_training_image
0,1,001.Black_footed_Albatross/Black_Footed_Albatr...,1,1,001.Black_footed_Albatross,0
1,2,001.Black_footed_Albatross/Black_Footed_Albatr...,2,1,001.Black_footed_Albatross,1
2,3,001.Black_footed_Albatross/Black_Footed_Albatr...,3,1,001.Black_footed_Albatross,0
3,4,001.Black_footed_Albatross/Black_Footed_Albatr...,4,1,001.Black_footed_Albatross,1
4,5,001.Black_footed_Albatross/Black_Footed_Albatr...,5,1,001.Black_footed_Albatross,1


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, Subset
import torchvision.transforms as transforms
import torchvision.models as models
import torchvision.datasets as datasets
import os
import time
import psutil
import gc
import random
import numpy as np
import copy # To deepcopy datasets if needed, though Subset approach is better

# --- Configuration ---
# The image folder is resolved relative to the current working directory.
ROOT_DIR = os.path.join(
    os.getcwd(),
    "data",
    "raw",
    "CUB_200_2011",
    "images",
)
EPOCHS = 5               # kept small so each search trial finishes quickly
TEST_SPLIT_RATIO = 0.25  # fraction of all images held out for evaluation
NUM_WORKERS = 4          # DataLoader worker processes; tune for the machine
SEED = 42                # shared seed for every RNG used below

# --- Hyperparameter Search Space ---
NUM_TRIALS = 5  # how many random configurations are sampled

# Candidate values per hyperparameter; one value per key is drawn for each trial.
HP_SEARCH_SPACE = dict(
    lr=[1e-5, 1e-4, 5e-4, 1e-3],
    batch_size=[8, 16, 32],  # larger batches need more GPU memory
    optimizer=['Adam', 'SGD'],
    sgd_momentum=[0.9],      # consulted only when the optimizer is SGD
)

# --- Setup ---
# Seed Python's, NumPy's and PyTorch's RNGs (in that order) with the shared SEED.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

_cuda_ok = torch.cuda.is_available()
if _cuda_ok:
    torch.cuda.manual_seed_all(SEED)
    # Optional (slower) switches for deterministic convolutions:
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False

# Stop early with an actionable message if the image folder is not where we expect it.
if not os.path.exists(ROOT_DIR):
    _missing_dir_msg = (f"Dataset directory not found: {ROOT_DIR}. "
                        "Please ensure the CUB_200_2011 dataset is downloaded and extracted correctly.")
    raise FileNotFoundError(_missing_dir_msg)

# Prefer the GPU when one is visible; otherwise run on the CPU.
device = torch.device("cuda" if _cuda_ok else "cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

# --- Dataset Loading and Splitting (Done ONCE) ---
print("Loading dataset metadata...")

# Define transforms
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

# Training: random crop + flip augmentation, always yielding 224x224 tensors.
transform_train = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

# Evaluation: deterministic resize + centre crop. The CenterCrop is what makes
# every tensor the same size so the default collate can stack a batch.
transform_test = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])

# Two views over the same image folder that differ only in their transform.
# A separate metadata-only ImageFolder is unnecessary: class names and the
# dataset size are read from one of these, saving a full directory scan.
train_dataset_base = datasets.ImageFolder(root=ROOT_DIR, transform=transform_train)
test_dataset_base = datasets.ImageFolder(root=ROOT_DIR, transform=transform_test)

# The split below addresses both views with the same integer positions, so the
# two scans must have produced the same sample list in the same order.
assert train_dataset_base.samples == test_dataset_base.samples, \
    "train/test ImageFolder views disagree on the sample list"

class_names = train_dataset_base.classes
num_classes = len(class_names)

if num_classes == 0:
    raise ValueError(f"No classes found in {ROOT_DIR}. Check dataset structure.")
if num_classes != 200:
     print(f"Warning: Expected 200 classes, but found {num_classes}. Check dataset structure.")

total_size = len(train_dataset_base)
test_size = int(TEST_SPLIT_RATIO * total_size)
train_size = total_size - test_size
print(f"Total images: {total_size}. Splitting into Train ({train_size}) and Test ({test_size}).")

# Generate split indices. random_split returns Subset wrappers; unwrap them into
# plain lists of ints so the final Subsets index the image datasets directly
# instead of going through a Subset-inside-a-Subset.
generator = torch.Generator().manual_seed(SEED)
indices = list(range(total_size))
_train_split, _test_split = random_split(indices, [train_size, test_size], generator=generator)
train_indices = [indices[pos] for pos in _train_split.indices]
test_indices = [indices[pos] for pos in _test_split.indices]

# Sanity checks: the two index sets must partition the dataset without overlap.
assert len(train_indices) == train_size and len(test_indices) == test_size
assert set(train_indices).isdisjoint(test_indices), "train/test split overlaps"

# Create subsets using disjoint indices over the two differently-transformed views
train_subset = Subset(train_dataset_base, train_indices)
test_subset = Subset(test_dataset_base, test_indices)
print("Dataset split using Subsets complete.")

# Drop the temporary names only. The Subsets above keep the underlying
# ImageFolder objects alive, so this does not release any memory.
del train_dataset_base, test_dataset_base, _train_split, _test_split


# --- Training and Evaluation Function ---
def _bytes_to_mb(num_bytes):
    """Convert a byte count to mebibytes for log output."""
    return num_bytes / 1024**2


def _is_oom_error(err):
    """Return True if the RuntimeError message reports an out-of-memory condition."""
    return "out of memory" in str(err)


def _release_cached_memory():
    """Run the garbage collector and, on CUDA, release cached allocator blocks."""
    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()


def _build_loaders(batch_size):
    """Create the train and test DataLoaders for one trial.

    Worker processes are started when a loader is first iterated, not when it
    is constructed, so construction is not wrapped in a worker-error fallback
    (such a handler could never trigger here).

    Returns:
        (train_loader, test_loader, use_persistent_workers)
    """
    use_persistent_workers = NUM_WORKERS > 0
    shared_kwargs = dict(
        batch_size=batch_size,
        num_workers=NUM_WORKERS,
        # Pinned host memory only speeds up host->GPU copies; skip it on CPU.
        pin_memory=(device.type == "cuda"),
        persistent_workers=use_persistent_workers,
    )
    train_loader = DataLoader(train_subset, shuffle=True, **shared_kwargs)
    test_loader = DataLoader(test_subset, shuffle=False, **shared_kwargs)
    return train_loader, test_loader, use_persistent_workers


def _build_optimizer(model, config):
    """Instantiate the optimizer named in config['optimizer'] over trainable parameters.

    Raises:
        ValueError: if the optimizer name is neither 'Adam' nor 'SGD'.
    """
    optimizer_name = config['optimizer']
    trainable = [p for p in model.parameters() if p.requires_grad]
    if optimizer_name == 'Adam':
        return optim.Adam(trainable, lr=config['lr'])
    if optimizer_name == 'SGD':
        return optim.SGD(trainable, lr=config['lr'], momentum=config['sgd_momentum'])
    raise ValueError(f"Unsupported optimizer: {optimizer_name}")


def _train_one_epoch(model, train_loader, optimizer, criterion, batch_size):
    """Run one optimisation pass over train_loader.

    Batches that hit an out-of-memory RuntimeError are skipped. The returned
    value is the per-sample mean loss over the batches that completed, or 0.0
    if none did.
    """
    model.train()
    loss_sum = 0.0     # sum of per-sample losses over completed batches
    samples_seen = 0
    num_batches = len(train_loader)

    for i, batch in enumerate(train_loader):
        if batch is None:
            print(f"Warning: DataLoader yielded None for batch {i}. Skipping.")
            continue
        inputs, labels = batch
        start_time_batch = time.time()
        inputs = inputs.to(device, non_blocking=True)  # non_blocking pairs with pin_memory
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        outputs = loss = None
        hit_oom = False
        try:
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            batch_n = labels.size(0)
            loss_sum += batch_loss * batch_n
            samples_seen += batch_n

            if (i + 1) % 50 == 0:  # log sparsely during the search
                avg_loss_recent = loss_sum / samples_seen if samples_seen > 0 else 0
                batch_time = time.time() - start_time_batch
                print(f'  Batch [{i+1:>4}/{num_batches:>4}] Train Loss: {batch_loss:.4f} (Avg: {avg_loss_recent:.4f}) | Time/Batch: {batch_time:.3f}s')
        except RuntimeError as e:
            if not _is_oom_error(e):
                raise
            # Only record the failure here. While the exception is alive its
            # traceback keeps the failing tensors reachable, so freeing memory
            # has to wait until after this clause.
            hit_oom = True

        # Drop per-batch references before the next fetch (and before OOM recovery).
        inputs = labels = outputs = loss = None

        if hit_oom:
            print(f"\nCUDA out of memory during training batch {i+1}!")
            print(f"Batch size: {batch_size}, Model: ResNet18")
            if device.type == "cuda":
                print(f"Memory Allocated: {_bytes_to_mb(torch.cuda.memory_allocated(device)):.2f} MB")
                print(f"Memory Reserved: {_bytes_to_mb(torch.cuda.memory_reserved(device)):.2f} MB")
            print("Attempting to clear cache and skip batch...")
            optimizer.zero_grad(set_to_none=True)  # discard any partially computed grads
            _release_cached_memory()

    return loss_sum / samples_seen if samples_seen > 0 else 0.0


def _evaluate(model, test_loader, criterion):
    """Evaluate model on test_loader without gradient tracking.

    Batches that hit an out-of-memory RuntimeError are skipped.

    Returns:
        (mean_loss, accuracy_percent), both computed over the samples that were
        actually evaluated; both are 0 when no batch completed.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            if batch is None:
                print(f"Warning: DataLoader yielded None for validation batch {i}. Skipping.")
                continue
            inputs, labels = batch
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            outputs = loss = None
            hit_oom = False
            try:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                batch_n = labels.size(0)
                batch_correct = (outputs.argmax(dim=1) == labels).sum().item()

                loss_sum += loss.item() * batch_n
                correct += batch_correct
                total += batch_n
            except RuntimeError as e:
                if not _is_oom_error(e):
                    raise
                hit_oom = True  # recover outside the except clause (see training loop)

            inputs = labels = outputs = loss = None

            if hit_oom:
                print(f"\nCUDA out of memory during validation batch {i+1}!")
                print("Attempting to clear cache and skip batch...")
                _release_cached_memory()

    mean_loss = loss_sum / total if total > 0 else 0
    accuracy = 100 * correct / total if total > 0 else 0
    return mean_loss, accuracy


def train_evaluate(config, trial_num):
    """Trains and evaluates a model for one hyperparameter configuration.

    A fresh ImageNet-pretrained ResNet18 gets a new final layer sized to
    `num_classes`, is trained for EPOCHS epochs on `train_subset`, and is
    evaluated on `test_subset` after every epoch.

    Args:
        config: dict with keys 'lr', 'batch_size', 'optimizer' ('Adam' or
            'SGD') and 'sgd_momentum' (read only when the optimizer is 'SGD').
        trial_num: zero-based trial index, used only for log messages.

    Returns:
        dict with 'config' (the input dict), 'best_val_acc' (highest per-epoch
        accuracy in percent) and 'history' (per-epoch lists 'train_loss',
        'val_loss', 'val_acc').

    Raises:
        ValueError: if config['optimizer'] is not supported.
        RuntimeError: any non-out-of-memory runtime error from training or
            evaluation is propagated unchanged.
    """
    print("\n" + "="*60)
    print(f"--- Starting Trial {trial_num+1}/{NUM_TRIALS} ---")
    print(f"Config: {config}")
    print("="*60)

    batch_size = config['batch_size']
    optimizer_name = config['optimizer']
    on_cuda = device.type == "cuda"

    # Loaders are rebuilt per trial because the batch size is a searched hyperparameter.
    train_loader, test_loader, use_persistent_workers = _build_loaders(batch_size)
    print(f"DataLoaders created for trial {trial_num+1} (Batch Size: {batch_size}, Persistent Workers: {use_persistent_workers}).")

    # Fresh pretrained weights for every trial, so trials do not influence each other.
    model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    model.to(device)
    print("Model created and moved to device.")

    optimizer = _build_optimizer(model, config)
    print(f"Optimizer ({optimizer_name}) created.")

    criterion = nn.CrossEntropyLoss()

    print(f"\n--- Training Trial {trial_num+1} for {EPOCHS} Epochs ---")
    print(f"Initial RAM Usage: {psutil.virtual_memory().percent}% Used")
    if on_cuda:
        torch.cuda.reset_peak_memory_stats(device)  # peak is then reported per trial
        print(f"Initial GPU Memory: {_bytes_to_mb(torch.cuda.memory_allocated(device)):.2f} MB Allocated, {_bytes_to_mb(torch.cuda.memory_reserved(device)):.2f} MB Reserved")

    best_val_acc = 0.0
    history = {'train_loss': [], 'val_loss': [], 'val_acc': []}

    for epoch in range(EPOCHS):
        start_time_epoch = time.time()
        print(f"\nEpoch {epoch+1}/{EPOCHS}")

        # --- Training Phase ---
        epoch_start_mem_alloc = torch.cuda.memory_allocated(device) if on_cuda else 0
        epoch_loss_train = _train_one_epoch(model, train_loader, optimizer, criterion, batch_size)
        epoch_end_mem_alloc = torch.cuda.memory_allocated(device) if on_cuda else 0

        history['train_loss'].append(epoch_loss_train)
        print(f"Epoch {epoch+1} Training Phase Complete. Avg Train Loss: {epoch_loss_train:.4f}")
        if on_cuda:
             print(f"  GPU Mem Change (Train): {_bytes_to_mb(epoch_end_mem_alloc - epoch_start_mem_alloc):+.2f} MB")

        # --- Validation Phase ---
        epoch_start_mem_alloc_val = torch.cuda.memory_allocated(device) if on_cuda else 0
        epoch_loss_val, epoch_acc_val = _evaluate(model, test_loader, criterion)

        history['val_loss'].append(epoch_loss_val)
        history['val_acc'].append(epoch_acc_val)
        best_val_acc = max(best_val_acc, epoch_acc_val)

        epoch_duration = time.time() - start_time_epoch
        epoch_end_mem_alloc_val = torch.cuda.memory_allocated(device) if on_cuda else 0

        print("-" * 50)
        print(f"Epoch {epoch+1} Summary:")
        print(f"  Train Loss: {epoch_loss_train:.4f}")
        print(f"  Val Loss:   {epoch_loss_val:.4f}")
        print(f"  Val Acc:    {epoch_acc_val:.2f}% (Best: {best_val_acc:.2f}%)")
        print(f"  Duration:   {epoch_duration:.2f}s")
        print(f"  RAM Usage: {psutil.virtual_memory().percent}% Used")
        if on_cuda:
            current_mem_alloc = torch.cuda.memory_allocated(device)
            current_mem_reserv = torch.cuda.memory_reserved(device)
            peak_mem_alloc = torch.cuda.max_memory_allocated(device)  # peak since the trial started
            print(f"  GPU Mem Change (Val): {_bytes_to_mb(epoch_end_mem_alloc_val - epoch_start_mem_alloc_val):+.2f} MB")
            print(f"  GPU Mem Current: {_bytes_to_mb(current_mem_alloc):.2f} MB Allocated, {_bytes_to_mb(current_mem_reserv):.2f} MB Reserved")
            print(f"  GPU Mem Peak (Trial): {_bytes_to_mb(peak_mem_alloc):.2f} MB Allocated")
        print("-" * 50)

    print(f"\n--- Finished Trial {trial_num+1} ---")
    print(f"Best Validation Accuracy for this trial: {best_val_acc:.2f}%")

    # Release the model, optimizer state and loader workers before the next trial starts.
    del model, optimizer, criterion, train_loader, test_loader
    _release_cached_memory()
    if on_cuda:
        print(f"GPU Memory after trial cleanup: {_bytes_to_mb(torch.cuda.memory_allocated(device)):.2f} MB Allocated, {_bytes_to_mb(torch.cuda.memory_reserved(device)):.2f} MB Reserved")

    return {'config': config, 'best_val_acc': best_val_acc, 'history': history}


# --- Main Hyperparameter Search Loop ---
# Each iteration samples one configuration at random, runs a full trial and
# collects the result. A failing trial is logged and skipped so that the
# remaining trials still run.
results = []

for trial_idx in range(NUM_TRIALS):
    # Sample random hyperparameters
    config = {
        'lr': random.choice(HP_SEARCH_SPACE['lr']),
        'batch_size': random.choice(HP_SEARCH_SPACE['batch_size']),
        'optimizer': random.choice(HP_SEARCH_SPACE['optimizer']),
    }
    if config['optimizer'] == 'SGD':
        config['sgd_momentum'] = random.choice(HP_SEARCH_SPACE['sgd_momentum'])
    else:
        # Placeholder so every config carries the same set of keys.
        config['sgd_momentum'] = None

    # Run training and evaluation for the sampled config
    trial_crashed = False
    try:
         trial_result = train_evaluate(config, trial_idx)
         if trial_result and 'error' not in trial_result:
             results.append(trial_result)
         elif trial_result and 'error' in trial_result:
             print(f"Trial {trial_idx+1} failed with error: {trial_result['error']}")
             # Optionally store failed trials too:
             # results.append(trial_result)
    except Exception as e:
        trial_crashed = True
        print(f"\n!!!!!!!! CRITICAL ERROR IN TRIAL {trial_idx+1} !!!!!!!!")
        print(f"Config: {config}")
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    if trial_crashed:
        # Cleanup runs AFTER the except clause on purpose. While the exception
        # object is alive, its traceback references train_evaluate's stack
        # frame (model, optimizer, batch tensors), so collecting inside the
        # handler cannot release that memory. Python drops the exception when
        # the clause ends.
        gc.collect()
        if device.type == "cuda":
             torch.cuda.empty_cache()


# --- Post-Search Analysis ---
# Rank the successful trials by their best per-epoch accuracy and report the
# top configurations plus the overall winner.
print("\n\n" + "*"*70)
print("--- Hyperparameter Search Finished ---")
print(f"Ran {len(results)} successful trials.")

if not results:
    print("No trials completed successfully.")
else:
    # Sort in place, best first. Entries without an accuracy sort last.
    results.sort(key=lambda x: x.get('best_val_acc', -1), reverse=True)

    print("\nTop 5 Configurations (by Best Validation Accuracy):")
    for i, result in enumerate(results[:5]):
        # Format the number only when it exists: applying ':.2f' to an 'N/A'
        # fallback string would raise ValueError.
        rank_acc = result.get('best_val_acc')
        rank_acc_text = f"{rank_acc:.2f}%" if rank_acc is not None else "N/A"

        print(f"  Rank {i+1}:")
        print(f"    Config: {result['config']}")
        print(f"    Best Val Acc: {rank_acc_text}")
        # Final-epoch metrics, when a history was recorded
        if 'history' in result and result['history']['val_acc']:
             print(f"    Final Val Acc: {result['history']['val_acc'][-1]:.2f}%")
             print(f"    Final Train Loss: {result['history']['train_loss'][-1]:.4f}")
        print("-" * 20)

    best_config = results[0]['config']
    best_accuracy = results[0].get('best_val_acc')
    best_accuracy_text = f"{best_accuracy:.2f}%" if best_accuracy is not None else "N/A"
    print("\nBest Overall Configuration Found:")
    print(f"  Config: {best_config}")
    print(f"  Best Validation Accuracy: {best_accuracy_text}")

print("*"*70)

Using device: cuda
GPU Name: NVIDIA GeForce RTX 3090 Ti
Loading dataset using ImageFolder...
Found 11788 images in 200 classes.
Splitting into Train (8841) and Test (2947) sets...
Dataset split complete.
Creating DataLoaders with Batch Size: 8, Num Workers: 4
DataLoaders created.
Loading ResNet18 model...
Modified final layer for 200 classes.
Model moved to device.
Loss function and optimizer defined.

--- Starting Training for 10 Epochs ---
Initial RAM Usage: 40.1% Used
Initial GPU Memory: 155.43 MB Allocated, 508.00 MB Reserved

Epoch 1/10


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "C:\Users\Michal\AppData\Local\pypoetry\Cache\virtualenvs\meta-project-hyzyb4Sq-py3.12\Lib\site-packages\torch\utils\data\_utils\worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Michal\AppData\Local\pypoetry\Cache\virtualenvs\meta-project-hyzyb4Sq-py3.12\Lib\site-packages\torch\utils\data\_utils\fetch.py", line 55, in fetch
    return self.collate_fn(data)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Michal\AppData\Local\pypoetry\Cache\virtualenvs\meta-project-hyzyb4Sq-py3.12\Lib\site-packages\torch\utils\data\_utils\collate.py", line 398, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Michal\AppData\Local\pypoetry\Cache\virtualenvs\meta-project-hyzyb4Sq-py3.12\Lib\site-packages\torch\utils\data\_utils\collate.py", line 212, in collate
    collate(samples, collate_fn_map=collate_fn_map)
  File "C:\Users\Michal\AppData\Local\pypoetry\Cache\virtualenvs\meta-project-hyzyb4Sq-py3.12\Lib\site-packages\torch\utils\data\_utils\collate.py", line 155, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Michal\AppData\Local\pypoetry\Cache\virtualenvs\meta-project-hyzyb4Sq-py3.12\Lib\site-packages\torch\utils\data\_utils\collate.py", line 272, in collate_tensor_fn
    return torch.stack(batch, 0, out=out)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: stack expects each tensor to be equal size, but got [3, 256, 341] at entry 0 and [3, 256, 384] at entry 1
