## Check running training jobs programmatically

In [2]:
import boto3

# SageMaker API client (uses the default boto3 credentials/region resolution)
sm_client = boto3.client('sagemaker')

# Query: only jobs whose status is InProgress, ordered by creation time with
# the newest first, capped at 10 entries.
in_progress_query = {
    'StatusEquals': 'InProgress',
    'SortBy': 'CreationTime',
    'SortOrder': 'Descending',
    'MaxResults': 10,
}
response = sm_client.list_training_jobs(**in_progress_query)

# Header first, then one "- <name> | Created: <timestamp>" line per job summary.
# Nothing is printed below the header when no job is in progress.
print("Currently running SageMaker training jobs:")
running_jobs = response['TrainingJobSummaries']
for job in running_jobs:
    print(f"- {job['TrainingJobName']} | Created: {job['CreationTime']}")


Currently running SageMaker training jobs:


## Manually check the AWS SageMaker training dashboard

https://console.aws.amazon.com/sagemaker/

## Check the saved model

In [4]:
import os
import tarfile
import boto3
import torch
!pip install einops
# Import your model class definition
from model import ViT_MultiTask  # Update this path if your model is elsewhere

# ---------------------------------------------
# Step 1: Set S3 bucket and model path
# ---------------------------------------------
bucket_name = 'rnd-sagemaker'
s3_key = 'Foundation_Model_Log/wifi-ssl-training-2025-04-10-20-18-05-201/output/model.tar.gz'

# Local temporary directory
local_model_tar = '/tmp/model.tar.gz'
extracted_model_dir = '/tmp/model'

# Create the directory if not exists
os.makedirs(extracted_model_dir, exist_ok=True)

# ---------------------------------------------
# Step 2: Download the model tar.gz from S3
# ---------------------------------------------
s3 = boto3.client('s3')
print(f"Downloading from s3://{bucket_name}/{s3_key} ...")
s3.download_file(bucket_name, s3_key, local_model_tar)
print("Download complete.")

# ---------------------------------------------
# Step 3: Extract the tar.gz file
# ---------------------------------------------
with tarfile.open(local_model_tar, 'r:gz') as tar:
    tar.extractall(path=extracted_model_dir)
print(f"Model extracted to: {extracted_model_dir}")

# ---------------------------------------------
# Step 4: Load the model checkpoint
# ---------------------------------------------
# Update this path if the checkpoint name is different
checkpoint_path = os.path.join(extracted_model_dir, 'test_ssl_run/ViT/best_model_checkpoint_ssl.pth.tar')
checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))

# Rebuild the model structure exactly as in training
model = ViT_MultiTask(
    emb_dim=128,
    encoder_heads=4,
    encoder_layers=6,
    encoder_ff_dim=512,
    encoder_dropout=0.1,
    recon_heads=4,
    recon_layers=3,
    recon_ff_dim=512,
    recon_dropout=0.1,
    num_classes=3,
    c_out=16,
    freq_out=10,
    max_len=512
)

# Load the trained weights
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # Set model to evaluation mode

print("Model loaded and ready for inference.")


Downloading from s3://rnd-sagemaker/Foundation_Model_Log/wifi-ssl-training-2025-04-10-20-18-05-201/output/model.tar.gz ...
Download complete.
Model extracted to: /tmp/model
✅ Model loaded and ready for inference.


  checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
