# PEATMAP NETWORK TRAINING NOTEBOOK

In [1]:
# save code to exec_peatnet.py
#%%writefile exec_peatnet_2.0.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import datetime
import logging
import argparse

from scipy.stats import boxcox

from numba import jit

import torch
import torch.nn as nn
import torch.utils.data as Data
from torch.utils.data import TensorDataset
import torch.optim.lr_scheduler as lr_scheduler

from sklearn.model_selection import train_test_split

from peatnet import *
from utils import *
from libCarbonFootprint import *


2024-07-11 18:17:13.285234: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-11 18:17:13.287746: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-11 18:17:13.317908: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Training cell

In [10]:
# save code to exec_peatnet.py
#%%writefile exec_peatnet_2.0.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import datetime
import logging
import argparse

from scipy.stats import boxcox

from numba import jit

import torch
import torch.nn as nn
import torch.utils.data as Data
from torch.utils.data import TensorDataset
import torch.optim.lr_scheduler as lr_scheduler

from sklearn.model_selection import train_test_split

from peatnet import *
from utils import *
from libCarbonFootprint import *





# --------------------------------------------------------------------------
# Run configuration
# --------------------------------------------------------------------------
learn_rate = 0.001          # Learning rate
num_epochs = 5              # Number of epochs
nb_file2merge = 5           # Number of files to merge
frac_samples = 0.10         # Fraction of the data to extract
normalize = False           # Normalize the data
verbose = True              # Verbose mode (DEBUG level on the module logger)

model_dir = "../peatnet_models"


carbon_estimation = True    # Estimate the carbon footprint
carbon_log_file = "carbon_footprint.log"
training_log_file = "peatnet_training"   # Prefix of the timestamped training log file

# The data location depends on the host the notebook runs on.
data_dir = "/home/gsainton/CALER/PEATMAP/1_NN_training/training_data" if os.uname().nodename == 'ares6' else "/data/gsainton/PEATLAND_DATA"
# --------------------------------------------------------------------------

# One log file per run: "<training_log_file>_<YYYYmmdd-HHMMSS>.log".
# The previous version embedded the literal text "training_log_file_" in the name
# instead of the value of the `training_log_file` variable.
training_log_path = f"{training_log_file}_{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}.log"

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler(training_log_path),
                        logging.StreamHandler()
                    ])
logger = logging.getLogger(__name__)

# The root logger stays at INFO; only this module's logger is made more verbose.
logger.setLevel(logging.DEBUG if verbose else logging.INFO)

def setup_device(mydevice:str) -> torch.device:
    """Resolve the requested device reference to a ``torch.device``.

    When CUDA is not available the function falls back to the CPU. Otherwise it
    logs the name of every visible GPU and checks that ``mydevice`` is one of
    ``cuda:0`` ... ``cuda:N-1`` (N = number of visible GPUs).

    Args:
        mydevice: GPU reference such as ``'cuda:0'``. A ``torch.device`` is also
            accepted; it is compared through its string form.

    Returns:
        ``torch.device('cpu')`` when no GPU is available, otherwise the
        requested CUDA device.

    Raises:
        SystemExit: with status 1 when a GPU is available but ``mydevice`` is
            not a valid reference.
    """
    # Local import: the exit path needs ``sys``, which the notebook never
    # imports explicitly (it would otherwise fail with NameError).
    import sys

    if not torch.cuda.is_available():
        logger.error("GPU is not available -> device = 'cpu'...")
        return torch.device('cpu')

    logger.info("GPU found...")
    num_gpus = torch.cuda.device_count()
    for i in range(num_gpus):
        logger.info(f"Device {i} name: {torch.cuda.get_device_name(i)}")

    valid_devices = [f'cuda:{i}' for i in range(num_gpus)]
    logger.info(f"Valid GPU references: {valid_devices}")

    # A single membership test covers both the single- and multi-GPU cases:
    # with one GPU the only valid reference is 'cuda:0'.
    requested = str(mydevice)
    if requested not in valid_devices:
        logger.error(f"Invalid GPU reference: {mydevice}. Exiting...")
        sys.exit(1)

    if num_gpus > 1:
        logger.info(f"Using GPU - {mydevice}")
    else:
        logger.info("Using a single GPU : cuda:0")
    return torch.device(requested)

if __name__ == '__main__':

    # Command-line interface, kept for the script version of this cell.
    # Parsing is disabled below, so the module-level configuration values
    # are the ones actually used when the cell is run in the notebook.
    parser = argparse.ArgumentParser(description='Train a neural network to predict peatland')
    parser.add_argument('--num_epochs', type=int, default=2, help='Number of epochs')
    parser.add_argument('--nb_file2merge', type=int, default=2, help='Number of files to merge')
    parser.add_argument('--frac_samples', type=float, default=0.10, help='Fraction of the data to extract')
    parser.add_argument('--gpu_ref', type=str, default='cuda:0', help='GPU reference')
    #args = parser.parse_args()

    # num_epochs = args.num_epochs
    # nb_file2merge = args.nb_file2merge
    # frac_samples = args.frac_samples
    # mydevice = torch.device(args.gpu_ref)
    mydevice = 'cuda:0'

    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 <= frac_samples <= 1:
        raise ValueError("frac_samples must be between 0 and 1")

    # Example of command line:
    # python exec_peatnet.py --num_epochs 2 --nb_file2merge 2 --frac_samples 0.10

    carbon_log_dir = "/home/gsainton/CARBON_LOG" if os.uname().nodename == 'ares6' else "/obs/gsainton/PEATLAND_DATA"

    # Start of the time window used for the carbon footprint estimate.
    if carbon_estimation:
        start = datetime.datetime.now()

    # As before, the directory is created whether or not carbon estimation is
    # enabled; exist_ok avoids the check-then-create race.
    os.makedirs(carbon_log_dir, exist_ok=True)

    device = setup_device(mydevice)

    


2024-07-12 15:12:06,650 - __main__ - INFO - GPU found...
2024-07-12 15:12:06,651 - __main__ - INFO - Device 0 name: NVIDIA RTX 3000 Ada Generation Laptop GPU
2024-07-12 15:12:06,651 - __main__ - INFO - Valid GPU references: ['cuda:0']
2024-07-12 15:12:06,652 - __main__ - ERROR - Invalid GPU reference: cuda:12. Exiting...


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:

    # --- Data loading -----------------------------------------------------
    peatmat_data_proc = PeatNetDataProc(data_dir=data_dir, frac_samples=frac_samples)

    # Choose the tiles to merge and keep the resulting list of file names.
    # NOTE(review): presumably `nb_file2merge` files are drawn at random - confirm in PeatNetDataProc.
    peatmat_data_proc.set_list_rdn_files(nb_file2merge)
    sub_sampled_data = peatmat_data_proc.get_list_rdn_files()

    X, y = peatmat_data_proc.load_data()
    # Use the module logger (not the root logger) like the rest of the script.
    logger.info("Number of tiles to merge : {}".format(nb_file2merge))
    logger.info("Fraction of samples to extract : {}".format(frac_samples))

    X_fields = ['dist0005', 'dist0100', 'dist1000', 'hand0005',
        'hand0100', 'hand1000', 'slope',
        'elevation', 'wtd', 'landsat_1', 'landsat_2',
        'landsat_3', 'landsat_4', 'landsat_5', 'landsat_6',
        'landsat_7', 'NDVI']
    # On reflection, the latS and lonS columns were removed: in the author's view
    # they should not be used to predict the presence of peatland.

    X.columns = X_fields
    y_fields = ['peatland']
    y.columns = y_fields

    if normalize:
        logger.info("Normalizing the data...")
        # Each field appears exactly once ('hand0005' used to be listed twice).
        # NOTE(review): 'elevation', 'landsat_5' and 'landsat_6' are not in this
        # list - confirm that leaving them untransformed is intended.
        fields_to_transform = ['dist0005', 'dist0100', 'dist1000', 'hand0005',
            'hand0100', 'hand1000', 'slope', 'wtd',
            'landsat_1', 'landsat_2', 'landsat_3', 'landsat_4',
            'landsat_7', 'NDVI']
        X = peatmat_data_proc.normalize_data(X, fields_to_transform)

    # --- Train / validation / test split (70 % / 15 % / 15 %) -------------
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

    logger.info("Data splitted into train, validation and test datasets")
    logger.info(f"Train dataset size: {len(X_train)}")
    logger.info(f"Validation dataset size: {len(X_val)}")
    logger.info(f"Test dataset size: {len(X_test)}")

    # Define model parameters
    input_size = list(X_train.shape)[1]
    output_size = list(y_train.shape)[1] if len(y_train.shape) > 1 else 1

    # Convert to tensors
    X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)
    X_val_tensor = torch.tensor(X_val.to_numpy(), dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val.to_numpy(), dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32)

    # Create TensorDataset
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

    # Create DataLoader (only the training set is shuffled)
    train_loader = Data.DataLoader(train_dataset, batch_size=128, shuffle=True)
    validate_loader = Data.DataLoader(val_dataset, batch_size=128, shuffle=False)
    test_loader = Data.DataLoader(test_dataset, batch_size=128, shuffle=False)

    # Drop the local names; the datasets above still reference the tensors.
    del X_train, X_val, X_test, y_train, y_val, y_test
    del X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor, X_test_tensor, y_test_tensor

    # --- Model ------------------------------------------------------------
    model = PeatNet(input_size, output_size).to(device)

    logger.info(f"Number of parameters: {sum(p.numel() for p in model.parameters())}")
    logger.debug(model)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate, weight_decay=1e-5)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

    # --- Training ---------------------------------------------------------
    # Single training run on the configured device, epoch count and scheduler.
    # The previous version trained a second time with a hard-coded
    # num_epochs=10 and device='cuda', ignoring the selected device and the
    # configured number of epochs.
    logger.info("Training the model...")
    total_time = train_model(model, train_loader, validate_loader,
                             criterion, optimizer, num_epochs=num_epochs,
                             device=device, scheduler=scheduler)
    logger.info(f"Total training time: {total_time}")

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    fullname_model = save_model(model, model_dir, current_time)
    logger.info(f"Model saved to {fullname_model}")

    # --- Final evaluation on the held-out test set ------------------------
    # The test loader was previously built but never used (the validation
    # loader was evaluated a second time instead).
    logger.info("Testing the model...")
    model.eval()  # inference mode for layers such as dropout / batch-norm
    test_loss = 0.0
    with torch.no_grad():
        for inputs, targets in tqdm(test_loader, desc='Final Test', leave=False):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()
    avg_test_loss = test_loss / len(test_loader)
    logger.info(f"Test loss: {avg_test_loss:.4f}")

    # --- Carbon footprint -------------------------------------------------
    if carbon_estimation:
        carbon_footprint_calculator = CarbonFootprintCalculator(device)
        carbon_logger = CarbonFootprintLogger(carbon_log_dir, carbon_log_file)
        end, total_energy_kwh, total_carbon_footprint = carbon_footprint_calculator.calculate(start)
        carbon_logger.log_carbon_footprint(end, total_energy_kwh, total_carbon_footprint)

    logger.info("End of the script")


## Test of the program 



In [2]:
# Location of the training tiles, chosen from the host name.
if os.uname().nodename == 'ares6':
    data_dir = "/home/gsainton/CALER/PEATMAP/1_NN_training/training_data"
else:
    data_dir = "/data/gsainton/PEATLAND_DATA"

# Same sampling settings as the training cell.
frac_samples = 0.10
nb_file2merge = 5

peatmat_data_proc = PeatNetDataProc(data_dir=data_dir, frac_samples=frac_samples)

# Select the subset of tiles and keep the list of their file names.
peatmat_data_proc.set_list_rdn_files(nb_file2merge)
sub_sampled_data = peatmat_data_proc.get_list_rdn_files()

def get_lat_lon_from_filename(filename):
    """Parse the signed latitude/longitude encoded in a tile file name.

    The tile code is the text after the last ``_`` and before the first ``.``,
    e.g. ``n55w95`` in ``trainingData_n55w95.mat``. ``n``/``e`` give positive
    values, ``s``/``w`` negative ones. Letters are matched case-insensitively.

    Args:
        filename: Tile file name (optionally with a directory prefix).

    Returns:
        Tuple ``(lat, lon)`` of signed integers.

    Raises:
        ValueError: if no ``<n|s><digits><e|w><digits>`` code ends the tile code.
    """
    import re

    # Same tokenisation as before: last "_"-separated part, extension removed.
    tile_code = filename.split("_")[-1].split(".")[0]

    # A single pattern replaces the find()-based index arithmetic, which
    # silently sliced with -1 when a hemisphere letter was missing.
    match = re.search(r"([ns])(\d+)([ew])(\d+)$", tile_code, flags=re.IGNORECASE)
    if match is None:
        raise ValueError(f"Cannot parse tile coordinates from filename: {filename!r}")

    ns, lat_digits, ew, lon_digits = match.groups()
    lat = int(lat_digits) * (1 if ns.lower() == "n" else -1)
    lon = int(lon_digits) * (1 if ew.lower() == "e" else -1)
    return lat, lon


def get_list_lat_lon_from_filename(filename):
    """Build a catalog DataFrame with one row per tile file name.

    Args:
        filename: Collection of tile file names; each one is parsed with
            ``get_lat_lon_from_filename``.

    Returns:
        ``pandas.DataFrame`` with the columns ``lat``, ``lon`` and ``filename``.
    """
    # Parse every name once, then split the (lat, lon) pairs into two columns.
    coords = [get_lat_lon_from_filename(name) for name in filename]
    latitudes = [pair[0] for pair in coords]
    longitudes = [pair[1] for pair in coords]

    return pd.DataFrame({"lat": latitudes, "lon": longitudes, "filename": filename})

# Catalog (lat, lon, filename) built from `peatmat_data_proc.list_all_files`.
# NOTE(review): presumably this attribute lists every tile found in `data_dir` - confirm in PeatNetDataProc.
tiles_catalog = get_list_lat_lon_from_filename(peatmat_data_proc.list_all_files)

# Rich display of the whole catalog
display(tiles_catalog)



Unnamed: 0,lat,lon,filename
0,55,-95,trainingData_n55w95.mat
1,50,-95,trainingData_n50w95.mat
2,50,-100,trainingData_n50w100.mat
3,55,-100,trainingData_n55w100.mat
4,55,-105,trainingData_n55w105.mat
5,50,-105,trainingData_n50w105.mat
6,55,-90,trainingData_n55w90.mat
7,60,-110,trainingData_n60w110.mat
8,60,-90,trainingData_n60w90.mat
9,60,-95,trainingData_n60w95.mat


In [3]:
import folium # the folium library
world_map = folium.Map(zoom_start=10)

# Palette kept for reference; the rectangles below use fixed colours.
colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'lightred', 'darkblue',
          'darkgreen', 'cadetblue', 'darkpurple', 'white', 'pink', 'lightblue',
          'lightgreen', 'black', 'lightgray', 'beige']

# Each tile is drawn as a square of this many degrees starting at (lat, lon).
# NOTE(review): assumes the file name encodes the tile's minimum lat/lon corner - confirm.
TILE_SIZE_DEG = 5


def _add_tile_rectangles(fmap, catalog, tooltip_from_filename=False, **style):
    """Draw one folium rectangle per catalog row on ``fmap``.

    Args:
        fmap: folium map receiving the rectangles.
        catalog: DataFrame with ``lat``, ``lon`` and ``filename`` columns.
        tooltip_from_filename: when True, the file name is shown as tooltip.
        **style: extra keyword arguments forwarded to ``folium.Rectangle``.
    """
    for tile in catalog.itertuples(index=False):
        bounds = [[tile.lat, tile.lon],
                  [tile.lat + TILE_SIZE_DEG, tile.lon + TILE_SIZE_DEG]]
        extra = {"tooltip": tile.filename} if tooltip_from_filename else {}
        folium.Rectangle(bounds=bounds, name=tile.filename, **extra, **style).add_to(fmap)


# All available tiles, in purple. The previous loop also looked up
# `colors[i]` without using it, which raised IndexError beyond 18 tiles.
_add_tile_rectangles(world_map, tiles_catalog, tooltip_from_filename=True,
                     color='purple', fill=True)

tiles_rdn_catalog = get_list_lat_lon_from_filename(sub_sampled_data)

# Overplot the tiles selected for training, in dashed red.
_add_tile_rectangles(world_map, tiles_rdn_catalog,
                     color="red", fill=True, line_join="round", popup="Set for training",
                     dash_array="5, 5", fill_color="red", fill_opacity=0.5)

# Fit the view once every layer has been added.
world_map.fit_bounds(world_map.get_bounds())

display(world_map)


## Applying the model to a complete tile

In [4]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from peatnet import *
from utils import *
from libCarbonFootprint import *

# Must match the preprocessing used when the checkpoint below was trained.
normalize = False            # Normalize the data
model_dir = "../peatnet_models"
input_size = 17              # Number of predictors (see X_fields, without lat/lon)
output_size = 1

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Apply the model to a complete tile
fullname_model = "../peatnet_models/model_peatnet_20240710-155928.ckpt"
# Load the model; map_location lets a checkpoint saved on GPU load on a CPU-only host.
model = PeatNet(input_size, output_size).to(device)
model.load_state_dict(torch.load(fullname_model, map_location=device))
model.eval()

# Pick one tile that was not part of the sub-sampled (training) selection.
candidate_files = sorted(set(peatmat_data_proc.list_all_files) - set(sub_sampled_data))
if not candidate_files:
    raise ValueError("No tile left to predict: every available file is in sub_sampled_data")
file_to_predict = str(np.random.choice(candidate_files, 1, replace=False)[0])
print(f"File to predict: {file_to_predict}")

# Load the data
X, y = peatmat_data_proc.load_dataset_mat(file_to_predict, outfmt="pandas", with_coord=True)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X_fields = ['dist0005', 'dist0100', 'dist1000', 'hand0005',
    'hand0100', 'hand1000', 'slope',
    'elevation', 'wtd', 'landsat_1', 'landsat_2',
    'landsat_3', 'landsat_4', 'landsat_5', 'landsat_6',
    'landsat_7', 'NDVI', "lat", "lon"]

X.columns = X_fields

# Keep lat and lon apart from X: they are used for plotting, not as predictors
lat = X["lat"]
lon = X["lon"]
X = X.drop(columns=["lat", "lon"])

y_fields = ['peatland']
y.columns = y_fields

# Normalize only when requested. The `normalize` flag used to be ignored
# here, so the inputs were always transformed.
if normalize:
    # Each field appears exactly once ('hand0005' used to be listed twice).
    fields_to_transform = ['dist0005', 'dist0100', 'dist1000', 'hand0005',
            'hand0100', 'hand1000', 'slope', 'wtd',
            'landsat_1', 'landsat_2', 'landsat_3', 'landsat_4',
            'landsat_7', 'NDVI']
    X = peatmat_data_proc.normalize_data(X, fields_to_transform)

# Convert to tensor
X_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32).to(device)

# Apply the model
with torch.no_grad():
    y_pred = model(X_tensor)


File to predict: trainingData_n50w100.mat
X shape: (17490682, 19)
y shape: (17490682, 1)


In [5]:
import h5py
import numpy as np
import torch

def save_dataset_mat(output_dir, filename, X, y_pred_rs, outfmt="mat"):
    """Write the inputs and predictions to an HDF5 file.

    The file contains two datasets: ``input`` (``X`` transposed) and
    ``target`` (``y_pred_rs`` transposed).
    NOTE(review): the file is plain HDF5 written through h5py; confirm that the
    intended ".mat" consumer can read it.

    Args:
        output_dir: Destination directory; created if it does not exist.
        filename: Name of the file to create inside ``output_dir``.
        X: Array exposing ``.T`` (e.g. a NumPy array) with the model inputs.
        y_pred_rs: Predictions, either a torch tensor (any device) or array-like.
        outfmt: Currently unused; kept for interface compatibility.
    """
    import os

    # Make sure the destination exists instead of failing inside h5py.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    file_path = f"{output_dir}/{filename}"

    # Tensors are detached and moved to host memory; other inputs are used as arrays.
    if hasattr(y_pred_rs, "detach"):
        targets = y_pred_rs.detach().cpu().numpy()
    else:
        targets = np.asarray(y_pred_rs)

    # The context manager closes the file; no explicit close() is needed.
    with h5py.File(file_path, 'w') as f:
        f.create_dataset('input', data=X.T)
        f.create_dataset('target', data=targets.T)
    print(f"Dataset saved to {file_path}")

# Save the inputs (as a NumPy array) and the predictions (torch tensor) for the predicted tile.

# Output name: tile name without directory and extension, suffixed with "_pred.mat"
filename = file_to_predict.split("/")[-1]
filename = filename.split(".")[0]
outfilename = f"{filename}_pred.mat"

save_dataset_mat(output_dir="../outputs", filename=outfilename,
                 X=X.to_numpy(), y_pred_rs=y_pred, outfmt="mat")

Dataset saved to ../outputs/trainingData_n50w100_pred.mat


In [6]:

# Convert the predictions to a DataFrame with the same column name as the ground truth
y_pred_df = pd.DataFrame(y_pred.cpu().numpy(), columns=y_fields)

# Ground truth and prediction side by side
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
panels = [
    (ax[0], y['peatland'], "Ground truth"),
    (ax[1], y_pred_df['peatland'], "Prediction"),
]
for axis, values, title in panels:
    # Draw each scatter once and reuse its handle for the colorbar. The
    # previous version plotted every panel twice, doubling the rendering time.
    points = axis.scatter(lat, lon, c=values, cmap='viridis')
    axis.set_title(title)
    axis.set_xlabel("Latitude")
    axis.set_ylabel("Longitude")
    axis.grid(True)
    cbar = fig.colorbar(points, ax=axis)
    cbar.set_label('Peatland')
fig.tight_layout()

# Save before plt.show(): once the figure has been shown, pyplot's savefig
# would write an empty canvas. The tile name is used without its extension and
# `file_to_predict` itself is left untouched so the cell can be re-run.
figure_stem = file_to_predict.split(".")[0]
fig.savefig(f"prediction_ground_truth_{figure_stem}.png")

plt.show()

AttributeError: 'Colorbar' object has no attribute 'clim'

Error in callback <function _draw_all_if_interactive at 0x707370bb2e80> (for post_execute), with arguments args (),kwargs {}:


KeyboardInterrupt: 