In [1]:
from RunningMultipleModels_with_logging import *

In [2]:
# Location of the data files read by load_shared_data
load_path = "/storage/mxg1065/datafiles"

# Load once here; the resulting object is handed to the training call later on
shared_data = load_shared_data(load_path)

print(" Shared data loaded successfully!")

Loading shared data...
Loaded data shapes:
  scaled_data['data_0']: (187650, 3)
  unscaled_data['data_0']: (187650, 3)
  neighbor_pairs_list: (1250242, 2)
  labels_for_neighbor_pairs: (1000, 1250242)
 Shared data loaded successfully!


In [3]:
# Count the CUDA devices torch can see, then list each one with its name and total memory
available_gpus = torch.cuda.device_count()
print(f"\n Available GPUs: {available_gpus}")

for gpu_index in range(available_gpus):
    gpu_name = torch.cuda.get_device_name(gpu_index)
    # total_memory is divided by 1024**3 before being printed with the "GB" label
    total_mem = torch.cuda.get_device_properties(gpu_index).total_memory / 1024**3
    # Two separate print arguments are kept on purpose: print's default separator
    # supplies the second space between " - " and the size.
    print(f"   GPU {gpu_index}: {gpu_name} - ", f"{total_mem:.1f} GB")



 Available GPUs: 4
   GPU 0: NVIDIA A30 -  23.5 GB
   GPU 1: NVIDIA A30 -  23.5 GB
   GPU 2: NVIDIA A30 -  23.5 GB
   GPU 3: NVIDIA A30 -  23.5 GB


In [4]:
# Defaults common to every training job; the per-GPU configs start from this
# dict and override individual entries.
base_config = {
    # --- sizes ---
    'num_features': 3,
    'num_classes': 5,
    'hidden_dim': 128,

    # --- optimisation settings ---
    'lr': 0.001,
    'weight_decay': 0.0005,
    'epochs': 200,
    'batch_size': 1,

    # --- output location / resume flag ---
    'save_dir': "/storage/afarbin/training/models",
    'resume': True,

    # --- presumably early-stopping controls; confirm against the training code ---
    'patience': 20,
    'delta': 1e-4,

    # --- behaviour switches ---
    'debug': True,
    'weighted': False,
    'auto_convert_results': False,
    'save_comprehensive_table': True,

    # --- defaults for the data generator ---
    'generator_flags': {
        'is_bi_directional': True,
        'train_ratio': 0.7,
    },

    # --- defaults for the model ---
    'model_flags': {
        'num_layers': 6,
        'layer_weights': False,
        'softmax': False,
    },
}


In [5]:
# Define per-GPU configurations with small variations.
# Entry i of `configs` is the job intended for GPU i.

def _make_config(**overrides):
    """Return a new config dict: base_config defaults with `overrides` applied.

    A plain `{**base_config, ...}` is a shallow copy, so every config would
    share the very same nested dict objects ('generator_flags',
    'model_flags') with base_config and with each other. Nested dicts are
    copied here so that changing one job's flags cannot leak into another.

    Args:
        **overrides: top-level keys to set or replace (e.g. model_name,
            description, batch_size, model_flags).

    Returns:
        dict: an independent configuration for one training job.
    """
    config = {key: dict(value) if isinstance(value, dict) else value
              for key, value in base_config.items()}
    config.update(overrides)
    return config


configs = [
    # GPU 0
    _make_config(
        model_name="fixed_generator_bs2_model.pt",
        description="Model with two events per batch",
        batch_size=2,
    ),

    # Disabled variants (uncomment to train them on further GPUs):

    # # GPU 1
    # _make_config(
    #     model_name="nine_layer_model.pt",
    #     description="Model with nine layers",
    #     model_flags={**base_config['model_flags'], 'num_layers': 9},
    # ),

    # # GPU 2
    # _make_config(
    #     model_name="twelve_layer_model.pt",
    #     description="Model with twelve layers",
    #     model_flags={**base_config['model_flags'], 'num_layers': 12},
    # ),

    # # GPU 3
    # _make_config(
    #     model_name="fifteen_layer_model.pt",
    #     description="Model with fifteen layers",
    #     model_flags={**base_config['model_flags'], 'num_layers': 15},
    # ),
]

# Trim configs to match the number of detected GPUs, and say so when jobs are
# dropped instead of discarding them silently.
if len(configs) > available_gpus:
    print(f"\n Warning: {len(configs)} configs defined but only {available_gpus} GPU(s) detected; "
          f"skipping the last {len(configs) - available_gpus}")
configs = configs[:available_gpus]

# Display planned training jobs for confirmation
print(f"\n Configuring {len(configs)} models:")
for i, config in enumerate(configs):
    print(f"   GPU {i}: {config['description']}")
    print(f"      → Model: {config['model_name']}")
    print(f"      → Hidden dim: {config['hidden_dim']}, LR: {config['lr']}")
    print(f"      → Weighted: {config['weighted']}")
    print(f"      → Auto-convert: {config['auto_convert_results']}")



 Configuring 1 models:
   GPU 0: Model with two events per batch
      → Model: fixed_generator_bs2_model.pt
      → Hidden dim: 128, LR: 0.001
      → Weighted: False
      → Auto-convert: False


In [None]:
# Launch training for one GPU. `configs` is ordered per GPU, so the job to run
# is looked up with the same index as the device instead of a fixed configs[0].
gpu_id = 0

if gpu_id >= len(configs):
    raise IndexError(
        f"No config defined for GPU {gpu_id} (only {len(configs)} config(s) available)"
    )
config = configs[gpu_id]

print(f"   Clearing cache for GPU {gpu_id}...")
with torch.cuda.device(gpu_id):
    torch.cuda.empty_cache()
    torch.cuda.synchronize()  # ensure cleanup completes

    # Show available memory for that GPU (first element of mem_get_info, scaled by 1024**3)
    free_mem = torch.cuda.mem_get_info(gpu_id)[0] / 1024**3
    print(f"   GPU {gpu_id} free memory before start: {free_mem:.2f} GB")

    # Training is started inside the device context, as in the original cell
    result = train_single_gpu_with_logging(gpu_id, config, shared_data)


   Clearing cache for GPU 0...
   GPU 0 free memory before start: 23.27 GB


In [None]:


# Summarize training results by reading each GPU's metrics file.
# The metrics file is looked up next to the model file: same path with the
# extension replaced by ".pkl".
print("\n Training Summary:")
loaded_count = 0  # number of configs whose metrics were read and reported
for i, config in enumerate(configs):
    model_path = os.path.join(config['save_dir'], config['model_name'])
    metrics_path = os.path.splitext(model_path)[0] + ".pkl"

    if not os.path.exists(metrics_path):
        print(f"   GPU {i}: No results found")
        continue

    try:
        # NOTE: pickle.load can execute arbitrary code; only read metrics files
        # that come from a trusted source.
        with open(metrics_path, 'rb') as f:
            metrics = pickle.load(f)
        best_acc = metrics.get('best_test_acc', 0)
        total_time = metrics.get('total_time', 0)
        min_time, sec_time = divmod(total_time, 60)
        print(f"   GPU {i}: {config['description']}")
        print(f"      Best Accuracy: {best_acc:.4f}")
        print(f"      Total Time: {int(min_time)}m {sec_time:.1f}s")
        print(f"      Model: {model_path}")
        loaded_count += 1
    except Exception as exc:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt
        # and SystemExit still propagate, and report why loading failed.
        print(f"   GPU {i}: Could not load metrics ({type(exc).__name__}: {exc})")

# Only claim success when every configured model actually produced readable metrics
if configs and loaded_count == len(configs):
    print("\n Multi-GPU training completed successfully!")
else:
    print(f"\n Metrics loaded for {loaded_count} of {len(configs)} configured model(s); "
          f"see the messages above.")

