In [1]:
!nvidia-smi

Sun Sep 22 06:30:21 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:03:00.0 Off |                    0 |
| N/A   30C    P0    50W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
pwd

'/global/u2/s/ssshukla/Shashank'

In [3]:
%cd ./scripts/LinearProbing/

/global/u2/s/ssshukla/Shashank/scripts/LinearProbing


In [4]:
from util import *
from data import *
from model import get_model

In [5]:
from train import *


In [6]:
def get_args_parser():
    """Build the argument parser holding the model and training configuration.

    The parser is created with ``add_help=False`` and ``allow_abbrev=False``,
    so option names must be given in full.

    Returns:
        argparse.ArgumentParser: parser whose defaults describe the
        ``base_mae_depthwise_convolution`` setup (125x125 images, 8 channels,
        5x5 patches).
    """

    def _str_to_bool(value):
        """Parse a textual boolean.

        ``type=bool`` is a known argparse pitfall: ``bool("False")`` is True,
        so every non-empty string used to enable the flag.
        """
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        # The empty string stays falsy, as it was under ``type=bool``.
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

    def _resolve_norm_layer(name):
        """Map a layer name such as ``LayerNorm`` or ``nn.LayerNorm`` to the class in ``nn``.

        With ``type=str`` a value given on the command line stayed a plain
        string, while the default is the ``nn.LayerNorm`` class itself.
        """
        short_name = str(name).strip().rsplit('.', 1)[-1]
        layer = getattr(nn, short_name, None)
        if not (isinstance(layer, type) and issubclass(layer, nn.Module)):
            raise argparse.ArgumentTypeError(f'unknown normalization layer: {name!r}')
        return layer

    parser = argparse.ArgumentParser('Masked Autoencoder ViT', add_help=False, allow_abbrev=False)

    # Model related arguments
    parser.add_argument('--model_name', default="base_mae_depthwise_convolution",
                        choices=["base_mae_depthwise_convolution",
                                 "channel_former",
                                 "base_mae",
                                 "conv_mae",
                                 "cross_vit"],
                        type=str, help='Model architecture to train')
    parser.add_argument('--img_size', default=125, type=int, help='Image size')
    parser.add_argument('--patch_size', default=5, type=int, help='Patch size')
    parser.add_argument('--in_chans', default=8, type=int, help='Number of input channels')
    parser.add_argument('--embed_dim', default=128, type=int, help='Embedding dimension')
    parser.add_argument('--depth', default=16, type=int, help='Depth of the encoder')
    parser.add_argument('--num_heads', default=8, type=int, help='Number of attention heads')
    parser.add_argument('--k_factor', default=16, type=int, help='Factor for convolution projection')

    # Decoder related arguments
    parser.add_argument('--decoder_embed_dim', default=128, type=int, help='Decoder embedding dimension')
    parser.add_argument('--decoder_depth', default=8, type=int, help='Decoder depth')
    parser.add_argument('--decoder_num_heads', default=8, type=int, help='Number of decoder heads')

    # Other arguments
    parser.add_argument('--mask_ratio', default=0.75, type=float, help='Masking ratio')
    # argparse only applies ``type`` to string values, so the class default is used as-is.
    parser.add_argument('--norm_layer', default=nn.LayerNorm, type=_resolve_norm_layer,
                        help='Normalization layer')
    parser.add_argument('--mlp_ratio', default=4, type=float, help='MLP ratio')
    parser.add_argument('--batch_size', default=128, type=int, help='Batch size')
    parser.add_argument('--learning_rate', default=0.00001, type=float, help='learning rate')
    parser.add_argument('--epochs', default=1, type=int, help='epochs')
    parser.add_argument('--save_every', default=1, type=int, help='How often to save a snapshot')
    parser.add_argument('--train_samples', default=-1, type=int, help='-1 indicates use samples for training')
    parser.add_argument('--resume_training', default=False, type=_str_to_bool,
                        help='Whether to resume from a checkpoint')
    parser.add_argument('--data_path', default='/pscratch/sd/s/ssshukla/Boosted_Top.h5', type=str, help='Path to the dataset')
    parser.add_argument('--warmup', type=int, default=3,  # This should be ~5-10% of total epochs
                        help='number of warmup epochs before reaching base_lr')
    return parser

In [7]:
# Build the parser and read the configuration. parse_known_args() returns any
# arguments the parser does not declare in `unknown` instead of exiting with an
# error (presumably needed because the notebook kernel's own argv is not
# declared on this parser — confirm).
parser = get_args_parser()
args, unknown = parser.parse_known_args()

In [8]:
# Build the network from the parsed configuration; get_model is imported from
# the local `model` module.
model = get_model(args)

In [9]:
# Create the dataset from the HDF5 file at args.data_path. preload_size is set
# to the batch size (presumably the number of samples buffered per read —
# TODO confirm against the dataset class).
dataset = H5MaskedAutoEncoderDataset(h5_path = args.data_path, preload_size = args.batch_size)

In [10]:
# Batched loader over `dataset`. Sampling options come first, then the
# worker and memory options.
dataloader = DataLoader(
    dataset,
    batch_size=args.batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=8,
    pin_memory=True,
)

In [11]:
# Pull a single batch from the loader and stop immediately; this leaves
# `step` (0) and `batch` bound in the notebook namespace for inspection.
for step, batch in enumerate(dataloader):
    break

In [None]:
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
def model_train(model, epochs, train_dataloader, device, lr=1.5e-4, weight_decay=0.05):
    """Train ``model`` with AdamW and a cosine warm-restart schedule.

    After every epoch the mean batch loss is printed and appended to
    ``losses.txt``, and the model's ``state_dict`` is written to
    ``./full_model_epoch_<n>.pth`` (both relative to the working directory).

    Args:
        model: module whose forward pass returns ``(loss, outputs, mask)``
            when called on an image batch.
        epochs: number of passes over ``train_dataloader``.
        train_dataloader: sized iterable yielding mappings with an ``'img'`` entry.
        device: device the image batches are moved to.
        lr: AdamW learning rate (default keeps the previous hard-coded value).
        weight_decay: AdamW weight decay (default keeps the previous value).

    Returns:
        list[float]: one mean training loss per epoch.

    Raises:
        ValueError: if training is requested but the dataloader has no batches.
    """
    # Fail early with a clear message; previously an empty loader only surfaced
    # as a ZeroDivisionError when the epoch average was computed.
    if epochs > 0 and len(train_dataloader) == 0:
        raise ValueError('train_dataloader yields no batches; nothing to train on')

    # Define optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=40, T_mult=2)

    # One mean loss per completed epoch
    train_losses = []

    for epoch in range(epochs):
        train_loss = 0.0  # Running sum of the per-batch losses

        model.train()  # Set model to training mode
        for batch in tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs} (Train)', unit='batch'):
            images = batch['img'].to(device)  # Send images to device (GPU/CPU)

            optimizer.zero_grad()  # Zero out gradients
            loss, _, _ = model(images)  # Only the loss is needed for training
            loss = loss.sum()  # Reduce to a scalar in case the loss is not one already

            loss.backward()  # Backpropagation
            optimizer.step()  # Optimizer step

            train_loss += loss.item()  # Accumulate loss

        # Mean of the per-batch losses for this epoch
        train_loss /= len(train_dataloader)
        train_losses.append(train_loss)

        # The scheduler is stepped once per epoch, so T_0 is counted in epochs
        scheduler.step()

        # Logging training loss
        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}')

        # Append to the log file after every epoch so earlier epochs stay on disk
        with open('losses.txt', 'a') as f:
            f.write(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}\n')

        # Save the model after each epoch
        torch.save(model.state_dict(), f'./full_model_epoch_{epoch+1}.pth')

    # Return the list of training losses
    return train_losses

# Release unreferenced Python objects and cached CUDA memory before training
gc.collect()
torch.cuda.empty_cache()

# Use the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `model` and `dataloader` come from the earlier cells
model = model.to(DEVICE)  # Move the model to the appropriate device (GPU/CPU)

# Train for the configured number of epochs (--epochs, default 1) rather than a
# hard-coded count, so the parsed configuration actually takes effect
train_losses = model_train(model, args.epochs, dataloader, DEVICE)

Epoch 1/1 (Train):   0%|          | 16/24725 [01:20<9:28:51,  1.38s/batch] 