In [1]:
from typing import Any, Optional, Tuple

from einops import rearrange
import torch
import torch.nn as nn
import torch.nn.functional as F

import hydra
from hydra.utils import instantiate
from omegaconf import DictConfig, OmegaConf
import torch
import tqdm
import torch.nn as nn
from tqdm import tqdm

from dataset import Batch
from models.kv_caching import KeysValues
from models.slicer import Embedder, Head
from models.tokenizer import Tokenizer
from models.transformer import Transformer, TransformerConfig
from utils import init_weights, LossWithIntermediateLosses

In [2]:
# Define dataset
import torch
import sys

import h5py
from PIL import Image
#import matplotlib.pyplot as plt
from datetime import datetime, timedelta
def eventGeneration(start_time, obs_time = 3 ,lead_time = 6, time_interval = 30):
    # Generate event based on starting time point, return a list: [[t-4,...,t-1,t], [t+1,...,t+72]]
    # Get the start year, month, day, hour, minute
    year = int(start_time[0:4])
    month = int(start_time[4:6])
    day = int(start_time[6:8])
    hour = int(start_time[8:10])
    minute = int(start_time[10:12])
    #print(datetime(year=year, month=month, day=day, hour=hour, minute=minute))
    times = [(datetime(year, month, day, hour, minute) + timedelta(minutes=time_interval * (x+1))) for x in range(lead_time)]
    lead = [dt.strftime('%Y%m%d%H%M') for dt in times]
    times = [(datetime(year, month, day, hour, minute) - timedelta(minutes=time_interval * x)) for x in range(obs_time)]
    obs = [dt.strftime('%Y%m%d%H%M') for dt in times]
    obs.reverse()
    return lead, obs

from torch.utils.data import Dataset, DataLoader
import h5py
import numpy as np
from torchvision.transforms import ToTensor, Compose, CenterCrop
class radarDataset(Dataset):
    def __init__(self, root_dir, event_times, obs_number = 3, pred_number = 6, transform=None):
        # event_times is an array of starting time t(string)
        # transform is the preprocessing functions
        self.root_dir = root_dir
        self.transform = transform
        self.event_times = event_times
        self.obs_number = obs_number
        self.pred_number = pred_number
    def __len__(self):
        return len(self.event_times)
    def __getitem__(self, idx):
        start_time = str(self.event_times[idx])
        time_list_pre, time_list_obs = eventGeneration(start_time, self.obs_number, self.pred_number)
        output = []
        time_list = time_list_obs + time_list_pre
        #print(time_list)
        for time in time_list:
            year = time[0:4]
            month = time[4:6]
            #path = self.root_dir + year + '/' + month + '/' + 'RAD_NL25_RAC_MFBS_EM_5min_' + time + '_NL.h5'
            path = self.root_dir + year + '/' + month + '/' + 'RAD_NL25_RAP_5min_' + time + '.h5'
            image = np.array(h5py.File(path)['image1']['image_data'])
            #image = np.ma.masked_where(image == 65535, image)
            image = image[264:520,242:498]
            image[image == 65535] = 0
            image = image.astype('float32')
            image = image/100*12
            image = np.clip(image, 0, 128)
            image = image/40
            #image = 2*image-1 #normalize to [-1,1]
            output.append(image)
        output = torch.permute(torch.tensor(np.array(output)), (1, 2, 0))
        output = self.transform(np.array(output))
        return output
#root_dir = '/users/hbi/data/RAD_NL25_RAC_MFBS_EM_5min/'
#dataset = radarDataset(root_dir, ["200808031600"], transform = Compose([ToTensor(),CenterCrop(256)]))

In [3]:
# develop dataset
from torch.cuda.amp import autocast
from torch.autograd import Variable
import pandas as pd
root_dir = '/home/hbi/RAD_NL25_RAP_5min/' 
batch_size=1

df_train = pd.read_csv('/space/zboucher/World_Model/catchment/training_Delfland08-14_20.csv', header = None)
event_times = df_train[0].to_list()
dataset_train = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))  

df_train_s = pd.read_csv('/space/zboucher/World_Model/catchment/training_Delfland08-14.csv', header = None)
event_times = df_train_s[0].to_list()
dataset_train_del = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))  

df_test = pd.read_csv('/space/zboucher/World_Model/catchment/testing_Delfland18-20.csv', header = None)
event_times = df_test[0].to_list()
dataset_test = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))

df_vali = pd.read_csv('/space/zboucher/World_Model/catchment/validation_Delfland15-17.csv', header = None)
event_times = df_vali[0].to_list()
dataset_vali = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))

df_train_aa = pd.read_csv('/space/zboucher/World_Model/catchment/training_Aa08-14.csv', header = None)
event_times = df_train_aa[0].to_list()
dataset_train_aa = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))  

df_train_dw = pd.read_csv('/space/zboucher/World_Model/catchment/training_Dwar08-14.csv', header = None)
event_times = df_train_dw[0].to_list()
dataset_train_dw = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))    

df_train_re = pd.read_csv('/space/zboucher/World_Model/catchment/training_Regge08-14.csv', header = None)
event_times = df_train_re[0].to_list()
dataset_train_re = radarDataset(root_dir, event_times, transform = Compose([ToTensor()]))   

data_list = [dataset_train_aa, dataset_train_dw, dataset_train_del, dataset_train_re]
train_aadedwre = torch.utils.data.ConcatDataset(data_list)

print(len(dataset_train), len(dataset_test), len(dataset_vali))
loaders = { 'train' :DataLoader(train_aadedwre, batch_size, shuffle=True, num_workers=8),
            'test' :DataLoader(dataset_test, batch_size, shuffle=False, num_workers=8), 
           'valid' :DataLoader(dataset_vali, batch_size, shuffle=False, num_workers=8),
          
          'train_aa5' :DataLoader(dataset_train_aa, batch_size, shuffle=False, num_workers=8),
          'train_dw5' :DataLoader(dataset_train_dw, batch_size, shuffle=False, num_workers=8),
          'train_del5' :DataLoader(dataset_train_del, batch_size, shuffle=True, num_workers=8),
          'train_re5' :DataLoader(dataset_train_re, batch_size, shuffle=False, num_workers=8),
          }

32183 3493 3560


In [4]:
from utils import configure_optimizer, EpisodeDirManager, set_seed
config =OmegaConf.load('/users/zboucher/world/World_Model/config/trainer.yaml')
cfg=config
# Access the configuration and perform further operations
# For example, print the contents of the configuration
print(config)
print()

if config.common.seed is not None:
        set_seed(config.common.seed)

print(set_seed)

{'defaults': ['_self_', {'tokenizer': 'default'}, {'world_model': 'default'}], 'wandb': {'mode': 'online', 'project': 'iris', 'entity': None, 'name': None, 'group': None, 'tags': None, 'notes': None}, 'initialization': {'path_to_checkpoint': None, 'load_tokenizer': False, 'load_world_model': False}, 'common': {'epochs': 1, 'device': 'cuda:1', 'do_checkpoint': True, 'seed': 0, 'sequence_length': '${world_model.max_blocks}', 'resume': False}, 'training': {'should': True, 'learning_rate': 0.0001, 'sampling_weights': [0.125, 0.125, 0.25, 0.5], 'tokenizer': {'batch_num_samples': 4, 'grad_acc_steps': 1, 'max_grad_norm': 10.0}, 'world_model': {'batch_num_samples': 4, 'grad_acc_steps': 1, 'max_grad_norm': 10.0, 'weight_decay': 0.01}}, 'evaluation': {'should': True, 'every': 5, 'tokenizer': {'batch_num_samples': '${training.tokenizer.batch_num_samples}', 'start_after_epochs': '${training.tokenizer.start_after_epochs}', 'save_reconstructions': True}, 'world_model': {'batch_num_samples': '${train

In [5]:
import hydra
from omegaconf import OmegaConf

@hydra.main(config_path='/space/zboucher/iris_1/config/', config_name='trainer.yaml')
def config_function(cfg):
    # Access the configuration
    config_file = OmegaConf.load(cfg)
  
    return config_file


cfg_worldmodel = config_function('/users/zboucher/world/World_Model/config/world_model/default.yaml')
cfg_tokenizer = config_function('/users/zboucher/world/World_Model/config/tokenizer/default.yaml')

In [6]:
device = torch.device(cfg.common.device)
print(device)

cuda:1


In [7]:
#from models.transformer import Transformer, TransformerConfig
##config = TransformerConfig(tokens_per_block=9, max_blocks=1, attention="causal", num_layers=6, num_heads=8, embed_dim=256, embed_pdrop=0.1, resid_pdrop=0.1, attn_pdrop=0.1)
#transformer = Transformer(config)
#head_observations = nn.Linear(256, 1024)
#from models.world_model import WorldModel
#tokenizer = instantiate(cfg_tokenizer)
            

#print(transformer)

In [8]:
from utils import configure_optimizer, EpisodeDirManager, set_seed
from models.world_model import WorldModel
tokenizer = instantiate(cfg_tokenizer)

world_model = WorldModel(obs_vocab_size=tokenizer.vocab_size,config=instantiate(cfg_worldmodel))
world_model.to(device)

optimizer_trans = configure_optimizer(world_model, cfg.training.learning_rate, cfg.training.world_model.weight_decay)

Tokenizer : shape of latent is (256, 16, 16).




In [9]:
checkpoint1 = torch.load('/space/zboucher/iris_1/src/checkpoint/transformer_14', map_location=device)
print(checkpoint1.keys())
world_model.load_state_dict(checkpoint1['model_state_dict'])

dict_keys(['model_state_dict', 'optimizer_state_dict'])


<All keys matched successfully>

In [12]:
# Load the latent space indices from the saved file
loaded_latent_space = torch.load('/space/zboucher/iris_1/src/checkpoint/latent_space.pt')
epoch=4


In [13]:

# Open a file in write mode
with open('/space/zboucher/iris_1/src/checkpoint/save_loss_tran.txt', 'w') as file:
    for epoch in range(10):
        save_epoch = epoch in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        #save_epoch = epoch in [2]
        loss_total_epoch=0.0
        optimizer_trans.zero_grad()
        intermediate_losses = {}
        print("epoch {}".format(epoch)) 
        loss_total_step=0
        
            
        
        i=0
        for i, latent_space in enumerate(loaded_latent_space):
            
            #print(f"Batch {i} - Latent Space Shape:", latent_space.size())
        
            input_image = latent_space.to('cuda:1')
            #print(input_image.size())

            losses = world_model.compute_loss(input_image[:, :])
            
            
            loss_total_step += losses.loss_total 
            #print(loss_total_step)
            # loss_total_epoch += loss_total_step.item()
            if (i) % 16 == 0:
                (loss_total_step/16).backward(retain_graph=True)
                optimizer_trans.step()
                optimizer_trans.zero_grad()
                
                print("Losses: Total = {:.4f}".format((loss_total_step/16).item()))
                torch.cuda.empty_cache()
                file.write("{}\n".format(loss_total_step/16))
                
            
                #for loss_name, loss_value in losses.intermediate_losses.items():
                   # intermediate_losses[f"{str(world_model)}/train/{loss_name}"] = loss_value/16
                   # file.write("{}\n".format(loss_total_step.item()))
                
                loss_total_step=0
        
            
            # metrics = {f'{str(world_model)}/train/total_loss': loss_total_epoch, **intermediate_losses}
        #print("Epoch {}: Total Loss = {:.4f}".format(epoch, metrics[f'{str(world_model)}/train/total_loss']))

        if save_epoch:
            torch.save({
            'model_state_dict': world_model.state_dict(),
            'optimizer_state_dict': optimizer_trans.state_dict(),
            }, '/space/zboucher/iris_1/src/checkpoint/transformer_{}'.format(epoch+11))



epoch 0
Losses: Total = 0.0668


OutOfMemoryError: CUDA out of memory. Tried to allocate 162.00 MiB (GPU 1; 47.54 GiB total capacity; 46.88 GiB already allocated; 100.81 MiB free; 47.12 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF