In [None]:
from torchgeo.trainers import PixelwiseRegressionTask
import torch
import pytorch_lightning as pl
import numpy as np
import rasterio
import cv2
import logging
from typing import List
import wandb
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
import torch.nn as nn
import os
from utils.model import LSTNowcaster
from utils.data.TiledLandsatDataModule import TiledLandsatDataModule
from utils.voice import notifySelf, PercentageProgressCallback
# from pytorch_lightning.loggers import TensorBoardLogger
# from pytorch_lightning.profilers import PyTorchProfiler
# Release cached GPU memory left over from earlier work in this kernel.
torch.cuda.empty_cache()

# Point every Weights & Biases location (logs, cache, config, artifacts) at ./wandb.
os.environ["WANDB_NOTEBOOK_NAME"] = "TrainUNet-Basic.ipynb"
os.environ["WANDB_DIR"] = "./wandb"
os.environ["WANDB_CACHE_DIR"] = "./wandb/.cache/wandb"
os.environ["WANDB_CONFIG_DIR"] = "./wandb/.config/wandb"
os.environ["WANDB_DATA_DIR"] = "./wandb/.cache/wandb-data"
os.environ["WANDB_ARTIFACT_DIR"] = "./wandb/artifacts"

# Hyperparameters for this run; the whole dict is logged to W&B below.
config = {
    "experiment_name": "Test normalization fixes",
    "debug": True,
    "by_city": False,
    "months_ahead": 0,
    "tile_size": 512,
    "tile_overlap": 0.0,
    "learning_rate": 1e-4,
    "model": "unet",
    "backbone": "resnet50",
    "dataset": "pure_landsat",
    "epochs": 25,
    "batch_size": 32,
    "pretrained_weights": True,
    "deterministic": True,
    "in_channels": 5
}

# Build the data module before any W&B run is started, and fail loudly on an
# unsupported dataset. Previously an unknown value left `data_module` unbound and
# only surfaced as a NameError in the later `trainer.fit` cell.
if config["dataset"] == "pure_landsat":
    data_module = TiledLandsatDataModule(
        data_dir="./Data",
        monthsAhead=config["months_ahead"],
        batch_size=config["batch_size"],
        num_workers=2,
        byCity=config["by_city"],
        debug=config["debug"],
        tile_size=config["tile_size"],
        tile_overlap=config["tile_overlap"]
    )
    data_module.setup()
else:
    raise ValueError(
        f"Unsupported dataset {config['dataset']!r}; this notebook only knows 'pure_landsat'."
    )

# logger = TensorBoardLogger("tb_logs", name="unet_profile")
# profiler = PyTorchProfiler(
#     on_trace_ready=torch.profiler.tensorboard_trace_handler("tb_logs/profiler0"),
#     schedule=torch.profiler.schedule(skip_first=10, wait=1, warmup=1, active=20)
# )

wandb_logger = WandbLogger(
    project="heat-island",  # your project name
    name=config['experiment_name'],  # name of this particular run
    log_model="best",  # log model checkpoints
    save_code=True,
    save_dir="./wandb",  # where to save the logs locally
)
wandb_logger.log_hyperparams(config)

percentage_callback = PercentageProgressCallback(total_epochs=config["epochs"], experiment_name=config["experiment_name"])
# Keep the three checkpoints with the lowest `val_rmse_F`, plus the most recent one.
checkpoint_callback = ModelCheckpoint(
    dirpath="./wandb/heat-island/checkpoints", monitor="val_rmse_F", mode="min", save_top_k=3, every_n_epochs=1, save_last=True
)

# Initialize trainer with explicit steps
trainer = pl.Trainer(
    max_epochs=config["epochs"],
    gradient_clip_val=0.5,
    log_every_n_steps=10,
    enable_progress_bar=True,
    enable_model_summary=False,
    deterministic=config["deterministic"],
    num_sanity_val_steps=2,
    reload_dataloaders_every_n_epochs=1,
    logger=wandb_logger,
    # profiler=profiler,
    # logger=logger,
    callbacks=[checkpoint_callback, percentage_callback]
)

In [None]:
from utils.model import LSTNowcaster
# Build the nowcasting model from the hyperparameters stored in `config`.
model = LSTNowcaster(
    model=config["model"],
    backbone=config["backbone"],
    in_channels=config["in_channels"],
    learning_rate=config["learning_rate"],
    pretrained_weights=config["pretrained_weights"],
)

In [None]:
# Run training with the trainer, model and data module built in the cells above.
trainer.fit(model=model, datamodule=data_module)

In [None]:
# Evaluate on the test split, preferring the best checkpoint recorded by the
# checkpoint callback and falling back to the in-memory model when there is none.
best_model_path = checkpoint_callback.best_model_path
print(f"Best model path: {best_model_path}")

if not best_model_path:
    print("No checkpoint found, testing with the current model state")
    trainer.test(model=model, datamodule=data_module)
else:
    # Constructor arguments passed alongside the checkpoint, taken from `config`.
    checkpoint_hparams = {
        key: config[key]
        for key in ("model", "backbone", "in_channels", "learning_rate", "pretrained_weights")
    }
    best_model = LSTNowcaster.load_from_checkpoint(
        checkpoint_path=best_model_path,
        **checkpoint_hparams,
    )

    # Run the test loop with the restored weights.
    trainer.test(model=best_model, datamodule=data_module)

In [None]:
from torchgeo.trainers import PixelwiseRegressionTask
import torch
import pytorch_lightning as pl
import numpy as np
import rasterio
import cv2
import logging
from typing import List
import wandb
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, Callback
import torch.nn as nn
import os
from datetime import datetime
from utils.model import LSTNowcaster
from utils.data.TiledLandsatDataModule import TiledLandsatDataModule
from utils.voice import notifySelf
# Release cached GPU memory and allow faster float32 matmul kernels.
torch.cuda.empty_cache()
torch.set_float32_matmul_precision('high')

# Point every Weights & Biases location (logs, cache, config, artifacts) at ./wandb.
os.environ["WANDB_NOTEBOOK_NAME"] = "TrainUNet-Basic.ipynb"
os.environ["WANDB_DIR"] = "./wandb"
os.environ["WANDB_CACHE_DIR"] = "./wandb/.cache/wandb"
os.environ["WANDB_CONFIG_DIR"] = "./wandb/.config/wandb"
os.environ["WANDB_DATA_DIR"] = "./wandb/.cache/wandb-data"
os.environ["WANDB_ARTIFACT_DIR"] = "./wandb/artifacts"

# Hyperparameters for the architecture being inspected in this cell.
config = {
    "experiment_name": "Testing OneFormer",
    "debug": True,
    "by_city": False,
    "months_ahead": 0,
    "tile_size": 128,
    "tile_overlap": 0.0,
    "learning_rate": 1e-4,
    "model": "oneformer",
    "backbone": "b5",
    "dataset": "pure_landsat",
    "augment": True,
    "epochs": 100,
    "batch_size": 64,
    "pretrained_weights": True,
    "deterministic": True,
    "random_seed_by_scene": 1,
    "in_channels": 6,
    "only_train": False,
    "skip_years": []
}

model = LSTNowcaster(
    model=config["model"], 
    backbone=config["backbone"], 
    in_channels=config["in_channels"], 
    learning_rate=config["learning_rate"], 
    pretrained_weights=config["pretrained_weights"]
)

# Smoke-test the forward pass with one random tile. The shape is derived from
# `config` (previously hard-coded as 1 x 6 x 128 x 128) so it cannot drift from
# the model's `in_channels` / the configured tile size.
X = torch.randn(1, config["in_channels"], config["tile_size"], config["tile_size"])  # Example input tensor
# No gradients are needed for a shape check, so skip building the autograd graph.
with torch.no_grad():
    y = model(X)  # Forward pass through the model

input_names = ["LST"]
output_names = ["LST Prediction"]
# torch.onnx.export(model, X, "model.onnx", input_names=input_names, output_names=output_names)

def print_model_structure(model):
    """Print every submodule of ``model`` as an indented tree.

    Each submodule yielded by ``model.named_modules()`` (the root excluded) is
    printed on its own line, indented two spaces per dot in its qualified name
    and labelled with the last name component and its class name. Modules that
    expose ``in_channels``/``out_channels`` get those two values printed
    underneath; otherwise modules exposing ``in_features``/``out_features``
    get those instead.

    Args:
        model: Object providing ``named_modules()`` that yields
            ``(qualified_name, module)`` pairs.
    """
    print("Model Structure:")
    for qualified_name, submodule in model.named_modules():
        # The root module is reported under the empty name; it is not listed.
        if not qualified_name:
            continue

        # Nesting depth equals the number of dots in the qualified name.
        name_parts = qualified_name.split('.')
        pad = '  ' * (len(name_parts) - 1)
        print(f"{pad}├─ {name_parts[-1]} ({type(submodule).__name__})")

        # Report the first size pair the module has: channels take priority
        # over features, and at most one pair is printed.
        for suffix in ("channels", "features"):
            in_attr, out_attr = f"in_{suffix}", f"out_{suffix}"
            if hasattr(submodule, in_attr) and hasattr(submodule, out_attr):
                print(f"{pad}│  ├─ {in_attr}: {getattr(submodule, in_attr)}")
                print(f"{pad}│  └─ {out_attr}: {getattr(submodule, out_attr)}")
                break

# Show the indented module tree for the model built above.
print_model_structure(model)

# Per-layer summary for an input of one full batch of square tiles.
from torchinfo import summary
summary_input_size = (
    config["batch_size"],
    config["in_channels"],
    config["tile_size"],
    config["tile_size"],
)
summary(model, input_size=summary_input_size)

In [None]:
# %%
from torchgeo.trainers import PixelwiseRegressionTask
import torch
import pytorch_lightning as pl
import numpy as np
import rasterio
import cv2
import logging
from typing import List
import wandb
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, Callback
import torch.nn as nn
import os
from datetime import datetime
from utils.model import LSTNowcaster
from utils.data.TiledLandsatDataModule import TiledLandsatDataModule
from utils.voice import notifySelf
# from pytorch_lightning.loggers import TensorBoardLogger
# from pytorch_lightning.profilers import PyTorchProfiler
# Release cached GPU memory and allow faster float32 matmul kernels.
torch.cuda.empty_cache()
torch.set_float32_matmul_precision('high')

# Point every Weights & Biases location (logs, cache, config, artifacts) at ./wandb.
os.environ.update({
    "WANDB_NOTEBOOK_NAME": "TrainUNet-Basic.ipynb",
    "WANDB_DIR": "./wandb",
    "WANDB_CACHE_DIR": "./wandb/.cache/wandb",
    "WANDB_CONFIG_DIR": "./wandb/.config/wandb",
    "WANDB_DATA_DIR": "./wandb/.cache/wandb-data",
    "WANDB_ARTIFACT_DIR": "./wandb/artifacts",
})

# Hyperparameters for this run; the whole dict is logged to W&B below.
config = {
    "experiment_name": "7 channels 3 months debug segformer",
    "debug": True,
    "by_city": False,
    "months_ahead": 1,
    "tile_size": 128,
    "tile_overlap": 0.0,
    "learning_rate": 1e-4,
    "model": "segformer",
    "backbone": "b5",
    "dataset": "pure_landsat",
    "augment": True,
    "epochs": 200,
    "batch_size": 128,
    "pretrained_weights": True,
    "deterministic": True,
    "random_seed_by_scene": 1,
    "in_channels": 7,
    "only_train": False,
    "skip_years": [],
}

# Placeholder index; it is read by the "Loop {i} completed" message at the end of the cell.
i = -1

wandb_logger = WandbLogger(
    project="heat-island",
    name=config['experiment_name'],
    log_model="best",
    save_code=True,
    save_dir="./wandb",
)
wandb_logger.log_hyperparams(config)

# Create model from the architecture-related entries of `config`.
model_kwargs = {
    key: config[key]
    for key in ("model", "backbone", "in_channels", "learning_rate", "pretrained_weights")
}
model = LSTNowcaster(**model_kwargs)

class PercentageProgressCallback(Callback):
    """Send a W&B alert reporting training progress every few epochs.

    Args:
        total_epochs: Number of epochs the run is scheduled for; used as the
            denominator of the reported percentage.
        experiment_name: Label placed at the start of the alert text.
        alert_every_n_epochs: An alert is sent when the zero-based epoch index
            is a multiple of this value. Defaults to 20, the interval that was
            previously hard-coded.

    Raises:
        ValueError: If ``alert_every_n_epochs`` is smaller than 1.
    """

    def __init__(self, total_epochs, experiment_name, alert_every_n_epochs=20):
        super().__init__()
        if alert_every_n_epochs < 1:
            raise ValueError("alert_every_n_epochs must be >= 1")
        self.total_epochs = total_epochs
        self.experiment_name = experiment_name
        self.alert_every_n_epochs = alert_every_n_epochs

    def on_train_epoch_end(self, trainer, pl_module):
        """Emit a progress alert from the rank-zero process on scheduled epochs."""
        # Only run on main process
        if not trainer.is_global_zero:
            return

        epoch_index = trainer.current_epoch
        if epoch_index % self.alert_every_n_epochs != 0:
            return

        # `current_epoch` is treated as the zero-based index of the epoch that
        # just ended, so the number of completed epochs is one more. Using the
        # raw index under-reported progress by one epoch.
        completed_epochs = epoch_index + 1

        if self.total_epochs and self.total_epochs > 0:
            # Keep the value as a float so the two-decimal format is meaningful
            # (the previous int() truncation always printed ".00").
            current_percentage = min(100.0, completed_epochs / self.total_epochs * 100)
            progress_text = f'{self.experiment_name} is at {current_percentage:.2f}%'
        else:
            # No positive epoch budget: a percentage is undefined (and used to
            # raise ZeroDivisionError for 0), so report the raw count instead.
            progress_text = f'{self.experiment_name} has completed {completed_epochs} epochs'

        wandb.alert(title="Training Update",
                    text=progress_text,
                    level=wandb.AlertLevel.INFO)

percentage_callback = PercentageProgressCallback(total_epochs=config["epochs"], experiment_name=config["experiment_name"])

# Checkpoints go into a folder named after the W&B run id and today's date
# (month name + day), so separate runs do not overwrite each other.
wandb_run_id = wandb_logger.experiment.id
current_date = datetime.now()
date_string = current_date.strftime("%B%d")
# NOTE(review): the filename embeds `val_rmse_F` while the checkpoint is selected
# on `val_rmse_p` -- confirm this difference is intentional.
checkpoint_callback = ModelCheckpoint(
    dirpath=f"./wandb/heat-island/checkpoints/{wandb_run_id}_{date_string}",
    filename= f"{wandb_run_id}_{date_string}_" + "{epoch:03d}_{val_rmse_F:.4f}",
    monitor="val_rmse_p",
    mode="min",
    save_top_k=1,
    every_n_epochs=1,
    save_last=False  # Keep only the single best checkpoint; no separate "last" checkpoint is written
)

# Years available for training, minus any listed in config["skip_years"].
allYears = ["2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]
# Compare as strings so integer years (e.g. 2015) are honoured too, and filter
# instead of list.remove() so an unknown year no longer aborts the cell.
skip_years = {str(year) for year in config["skip_years"]}
unknown_skip_years = sorted(skip_years - set(allYears))
if unknown_skip_years:
    print(f"Warning: skip_years entries not in the available years were ignored: {unknown_skip_years}")
allYears = [year for year in allYears if year not in skip_years]

# for subYears in [allYears[:5], allYears[5:]]:
trainer = pl.Trainer(
    max_epochs=config['epochs'],
    gradient_clip_val=0.5,
    log_every_n_steps=10,
    enable_progress_bar=True,
    enable_model_summary=False,
    # deterministic=config["deterministic"],
    num_sanity_val_steps=2,
    logger=wandb_logger,
    callbacks=[checkpoint_callback, percentage_callback],
    # devices=4,                         # Use all 4 GPUs
    accelerator="gpu",                 # Use GPU acceleration
    # strategy="ddp",                    # Use DistributedDataParallel
    precision="16-mixed"               # Add mixed precision for memory efficiency
)

data_module = TiledLandsatDataModule(
    data_dir="./Data",
    monthsAhead=config["months_ahead"],
    batch_size=config["batch_size"],
    num_workers=8,
    byCity=config["by_city"],
    debug=config["debug"],
    tile_size=config["tile_size"],
    tile_overlap=config["tile_overlap"],
    augment=config["augment"],
    seedForScene=config["random_seed_by_scene"],
    onlyTrain = config["only_train"],
    includeYears=allYears
)
data_module.setup()

# Train model
trainer.fit(model=model, datamodule=data_module)
# Drop the trainer and data module so their memory can be reclaimed.
del trainer
del data_module

# Force garbage collection and clear the CUDA cache on every visible GPU.
# A dedicated loop variable is used so the module-level `i` (read by the final
# status message) is not overwritten.
import gc
gc.collect()
for device_index in range(torch.cuda.device_count()):
    with torch.cuda.device(device_index):
        torch.cuda.empty_cache()

# Register the best model as a W&B artifact
best_model_path = checkpoint_callback.best_model_path
if best_model_path and os.path.exists(best_model_path):
    checkpoint_name = os.path.basename(best_model_path)
    artifact = wandb.Artifact(
        # '=' from the checkpoint filename is replaced with '.' in the artifact name.
        name=checkpoint_name.replace('=', '.'),
        type="model",
        description=f"Best model at {checkpoint_name}"
    )
    artifact.add_file(best_model_path)
    wandb_logger.experiment.log_artifact(artifact)

# End Experiment
wandb.finish()
notifySelf(f"Finished {config['experiment_name']}...")
del model
del wandb_logger
del checkpoint_callback

# Collect the objects released above, then return cached blocks to the driver
# and reset the memory counters on each GPU.
# The earlier `for obj in gc.get_objects(): del obj` sweep was removed: `del obj`
# only unbinds the loop variable and frees nothing. The 4 GiB scratch tensor
# allocated per GPU was removed too, since it could itself trigger an
# out-of-memory error and was freed immediately.
gc.collect()
for device_index in range(torch.cuda.device_count()):
    with torch.cuda.device(device_index):
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.reset_accumulated_memory_stats()

# Print memory stats for debugging
if torch.cuda.is_available():
    # Wait for outstanding GPU work before reading the counters; guarded so the
    # cell does not raise on a machine without CUDA.
    torch.cuda.synchronize()
    print(f"Loop {i} completed. CUDA memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    print(f"CUDA memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
notifySelf("Batch experiment ended.")


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjesus-guerrero[0m ([33mjesus-guerrero-ml[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
Gathering scenes (Sort by Random Scene)...: 100%|██████████| 35301/35301 [00:00<00:00, 1755633.72it/s]
Preparing scene by scene...: 100%|██████████| 405/405 [00:00<00:00, 5241.60it/s]


Dataset splits - Train: 324, Val: 40, Test: 41


Gathering scenes (Sort by Random Scene)...: 100%|██████████| 35301/35301 [00:00<00:00, 1738074.91it/s]
Preparing scene by scene...: 100%|██████████| 405/405 [00:00<00:00, 10052.87it/s]
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Dataset splits - Train: 324, Val: 40, Test: 41


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=False):


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:
Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x731c66967550>> (for post_run_cell), with arguments args (<ExecutionResult object at 731e1ff47f70, execution_count=1 error_before_exec=None error_in_exec=name 'exit' is not defined info=<ExecutionInfo object at 731e1ff47f40, raw_cell="# %%
from torchgeo.trainers import PixelwiseRegres.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ssh-remote%2Bjet/home/ubuntu/heat-island-test/TrainUNet-Basic.ipynb#W6sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

In [None]:
#Test here
# %%
# Test the trained model
import os
import torch
import wandb
from pytorch_lightning.loggers import WandbLogger
from utils.model import LSTNowcaster
from utils.data.TiledLandsatDataModule import TiledLandsatDataModule

# Define which model checkpoint to test
# You can either specify a specific checkpoint or use the best one from a previous run
# Imports used by the evaluation loop, done once instead of on every iteration.
import gc
from pytorch_lightning import Trainer

# Define which model checkpoint to test
# You can either specify a specific checkpoint or use the best one from a previous run
for checkpoint_path in [
    "/home/ubuntu/heat-island-test/wandb/heat-island/checkpoints/up47iayb_April15/up47iayb_April15_epoch=059_val_rmse_F=17.0594.ckpt"
]:

    # Initialize test configuration
    test_config = {
        "experiment_name": "Test OneFormer Debug",
        "debug": True,  # Set to False for full test
        "by_city": False,
        "months_ahead": 3,
        "tile_size": 128,
        "tile_overlap": 0.0,
        "model": "segformer",
        "backbone": "b5",
        "dataset": "pure_landsat",
        "batch_size": 1,  # Can be larger than training since no gradients are stored
        "in_channels": 6
    }

    # The checkpoint's parent folder is named "<run id>_<date>"; take the part
    # before the first underscore as the W&B run id.
    run_id = os.path.basename(os.path.dirname(checkpoint_path)).split('_')[0]

    # Initialize WandB logger that continues the same run
    test_logger = WandbLogger(
        project="heat-island",
        id=run_id,  # Use the same run ID to continue logging to the same run
        resume="must",  # Force resume the existing run
        save_dir="./wandb",
    )

    try:
        # Set up data module for testing
        data_module = TiledLandsatDataModule(
            data_dir="./Data",
            monthsAhead=test_config["months_ahead"],
            batch_size=test_config["batch_size"],
            num_workers=4,
            byCity=test_config["by_city"],
            debug=test_config["debug"],
            tile_size=test_config["tile_size"],
            tile_overlap=test_config["tile_overlap"],
            augment=False,  # No augmentation during testing
            seedForScene=1,  # Consistent seed for reproducibility
            includeYears=["2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]
        )
        data_module.setup()  # Explicitly prepare the test data

        # Initialize the model with the same architecture used during training
        model = LSTNowcaster.load_from_checkpoint(
            checkpoint_path,
            model=test_config["model"],
            backbone=test_config["backbone"],
            in_channels=test_config["in_channels"]
        )

        # Set model to evaluation mode
        model.eval()

        # Initialize trainer specifically for testing
        test_trainer = Trainer(
            logger=test_logger,
            enable_progress_bar=True,
            enable_model_summary=True,
            deterministic=True
        )

        # Run test
        test_results = test_trainer.test(model=model, datamodule=data_module)

        # Log detailed test metrics. Guard against an empty result list and only
        # log the headline metrics that were actually produced (previously a
        # missing metric was logged as None).
        test_metrics = test_results[0] if test_results else {}
        logged_metrics = {"test_results": test_metrics}
        for metric_name in ("test_rmse_F", "test_mae_F"):
            if metric_name in test_metrics:
                logged_metrics[metric_name] = test_metrics[metric_name]
        test_logger.experiment.log(logged_metrics)

        # Optional: Visualize some test predictions
        # This would require implementing a callback or method to save and log images
    finally:
        # Always close the resumed W&B run, even when loading or testing fails,
        # so a failure does not leave the run open for the next checkpoint.
        wandb.finish()

        # Clean up resources: drop the references, then collect and clear the CUDA cache.
        model = None
        test_trainer = None
        data_module = None
        test_logger = None
        gc.collect()
        torch.cuda.empty_cache()

    print(f"Test complete. Results: {test_results}")