## Modules

In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Get the parent directory of the current notebook
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "../src"))

# Add the parent directory to sys.path
sys.path.insert(0, parent_dir)

from scripting import (
    load_composites, 
    load_dems,
    calculate_aspect_from_dems,
    calculate_slope_from_dems,
    load_points,
    load_lc_points,
    load_features,
    create_geodf_for_lc_map,
    logged_main
)

from generate_dataset import (
    generate_maps,
    extract_features_for_points,
    enhance_dataset,
    inspect_class_distribution,
    apply_erosion_and_report,
    visual_match_verification_l1,
    visual_match_verification_l2,
    plot_distance_histogram,
    verify_coordinates,
    convert_sav_to_csv
    )

import matplotlib.pyplot as plt
import dask
import dask.distributed
from pyproj import CRS
import plotly.express as px
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import yaml
import pandas as pd
import numpy as np
def read_yaml(file_path: str) -> dict:
    """Load a YAML file and return its parsed content.

    The document is parsed with ``yaml.safe_load``.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document; the notebook
        uses the result as a dict of parameters.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's
    # default locale encoding, which differs between machines.
    with open(file_path, "r", encoding="utf-8") as yaml_file:
        return yaml.safe_load(yaml_file)
    
def fix_paths_for_nb(
    input_dict,
    old_substring="/home/hrlcuser/media",
    new_substring="/media/datapart/lucazanolo",
):
    """Return a copy of ``input_dict`` with a path fragment replaced.

    Only top-level string values are rewritten; values of any other type
    (including nested containers) are copied over unchanged.

    Args:
        input_dict: Mapping whose string values may contain ``old_substring``.
        old_substring: Text to look for in each string value.
        new_substring: Text that replaces every occurrence of
            ``old_substring``.

    Returns:
        A new dict with the same keys as ``input_dict``.
    """
    remapped = {}
    for name, entry in input_dict.items():
        if isinstance(entry, str):
            entry = entry.replace(old_substring, new_substring)
        remapped[name] = entry
    return remapped
    
def plot_map(array : np.ndarray, title = ""):
    """Display ``array`` as an image with a colour bar.

    Args:
        array: Data passed to ``plt.imshow``.
        title: Figure title. When empty (the default) the generic
            "Data Visualization" title is used.
    """
    plt.figure(figsize=(10, 8))
    plt.imshow(array, cmap="viridis")
    plt.colorbar(label="Value")
    # Honour the caller-supplied title; it used to be silently ignored.
    plt.title(title or "Data Visualization")
    plt.xlabel("X Coordinate")
    plt.ylabel("Y Coordinate")
    plt.show()


## Parameters

In [None]:
# Load the run configuration and remap its paths for this notebook.
parameters = fix_paths_for_nb(read_yaml("/home/lucazanolo/luca-zanolo/scripts/config_files/5.generate_dataset.yaml"))

# Last "_"-separated token of the points file name, minus its last four
# characters (assumes a three-letter extension such as ".shp").
points_dataset_name = os.path.basename(parameters['points_dataset_path']).split("_")[-1][:-4]

dataset_type = parameters['dataset_type']
id_prefix = f"T{parameters['tile_id']}_f{parameters['features_date']}_{points_dataset_name}"

if dataset_type == 'enhanced':
    dataset_id = f"{id_prefix}_{parameters['enhanced_type']}_{dataset_type}"
elif dataset_type == 'fullLC':
    dataset_id = f"{id_prefix}_{parameters['samples_per_class']}_{dataset_type}"
elif dataset_type == 'std':
    dataset_id = f"{id_prefix}_{dataset_type}"
else:
    # Fail with a clear message instead of a NameError on `dataset_id` below.
    raise ValueError(
        f"Unsupported dataset_type {dataset_type!r}; expected 'enhanced', 'fullLC' or 'std'."
    )

# The "_eroded" suffix only applies to the two dataset types that use the
# land-cover map. `== True` is kept so the suffix matches the check used
# where erosion is actually applied.
if dataset_type != 'std' and parameters["apply_erosion"] == True:
    dataset_id = f"{dataset_id}_eroded"

dataset_path = f"{parameters['output_path']}/{dataset_id}.csv"
features_path = f"{parameters['features_path']}/{parameters['features_date']}/*.tif"

# Per-dataset report directory and the report files written into it.
os.makedirs(parameters['report_path'], exist_ok=True)
curr_reports_path = f"{parameters['report_path']}/{dataset_id}_reports"

os.makedirs(curr_reports_path, exist_ok=True)
lc_points_map_path = f"{curr_reports_path}/lcmap_points.html"
points_map_path = f"{curr_reports_path}/shapefile_points.html"
erosion_report_path = f"{curr_reports_path}/erosion_report.png"

os.makedirs(parameters['output_path'], exist_ok=True)

# Notebook display of the main output locations.
dataset_path, lc_points_map_path, points_map_path

## Convert existing pipeline datasets

In the "official" script, if this option is enabled, no other datasets are generated.

In [None]:
# Stays None when conversion is disabled, so the display expression at the
# end of the cell does not raise NameError on a fresh kernel.
dt = None

if parameters["convert_sav_dataset"] == True:
    # The .csv is written to the output folder under the .sav file's name.
    sav_name = os.path.basename(parameters["sav_dataset_path"])
    dt = convert_sav_to_csv(
        sav_path=parameters["sav_dataset_path"],
        csv_path=f"{parameters['output_path']}/{sav_name.replace('.sav', '.csv')}",
    )

dt

## Load Composites and DEMs - Compute Aspect and Slope

In [None]:

# Run the loading step under an in-process Dask client.
with dask.distributed.Client(
    processes=False,
    threads_per_worker=(os.cpu_count() or 2),
) as client:

    print(f"Dask dashboard: {client.dashboard_link}")

    # Pre-computed composites for the configured year and tile.
    composites = load_composites(
        parameters["composites_path"],
        year=parameters["composites_year"],
        tile=parameters["tile_id"],
    )

    # Elevation data and the two terrain layers derived from it.
    dems = load_dems(
        parameters["dems_path"],
        year=parameters["dems_year"],
        tile=parameters["tile_id"],
    )
    elevation = dems.band_data
    slope = calculate_slope_from_dems(elevation)
    aspect = calculate_aspect_from_dems(elevation)

    # Extra feature rasters matched by the `features_path` glob.
    features = load_features(features_path)

    # Attach the terrain layers to the composites, then keep the configured tile.
    terrain_layers = {"dems": elevation, "slopes": slope, "aspects": aspect}
    features_dataset = composites.assign(terrain_layers)
    features_dataset = features_dataset.sel(tile=parameters["tile_id"])

    # Only the first time step of each extra feature is kept
    # (January for the GLCM features, per the original author's note).
    first_step_features = {
        f_name: feature.isel(time=0) for f_name, feature in features.items()
    }
    features_dataset = features_dataset.assign(first_step_features)

    print(f"Dems:\n{dems}\n\n")
    print(f"Aspect:\n{aspect}\n\n")
    print(f"Slope:\n{slope}\n\n")
    print(f"Composites:\n{composites}\n\n")
    print(f"Features:\n{features_dataset}\n\n")

## Load points from shape file

In [None]:

# EPSG code of the CRS stored in the feature stack's `spatial_ref` attributes.
crs_wkt = features_dataset.spatial_ref.attrs["crs_wkt"]
composites_crs = CRS.from_wkt(crs_wkt).to_epsg()

points_df, labels_df, tile_geometry = load_points(parameters, features_dataset, composites_crs)

# Lookup tables built from the labels table:
#   LC code -> description, and internal code -> LC code.
lccode2label = {
    label_row['LC_code']: label_row['description']
    for _row_idx, label_row in labels_df.iterrows()
}
id2lccode = {
    label_row['internal_code']: label_row['LC_code']
    for _row_idx, label_row in labels_df.iterrows()
}

# Re-express the points' class ids as LC codes.
points_df['class_id'] = points_df['class_id'].map(id2lccode)

class_histogram = points_df['class'].value_counts().sort_index()
class_histogram.plot(kind='bar', figsize=(10, 6), title="Class Distribution in Points Dataset")

points_df, labels_df, tile_geometry

## Load LC map points and optionally apply erosion

In [None]:
# Load the land-cover map and build one mask per class, optionally eroded.
with dask.distributed.Client(
    processes=False,
    threads_per_worker=(os.cpu_count() or 2),
) as client:

    print(f"Dask dashboard: {client.dashboard_link}")

    # Nothing is loaded here for the 'std' dataset type.
    if parameters["dataset_type"] != 'std':

        print("Loading land cover map points ...")

        lc_map_xr = load_lc_points(parameters)

        if parameters["apply_erosion"] == True:

            print("Applying erosion ...")
            config = {
                'kernel': cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2, 2)),  # erosion structuring element
                'iterations': 1,  # number of erosion iterations
            }

            classes_masks = apply_erosion_and_report(lc_map_xr, lccode2label, config, output_path = erosion_report_path)
        else:
            # One uint8 mask per LC code that occurs in the map. The
            # comparison is evaluated once per class and reused for both the
            # presence check and the mask itself.
            classes_masks = {}
            for lc_code in lccode2label:
                class_mask = lc_map_xr == lc_code
                if class_mask.mean() > 0:
                    classes_masks[lc_code] = class_mask.astype(np.uint8)

    # The per-class sample target and `points_lc_df` are computed by the
    # following "Selecting dataset points" cell. This cell used to repeat that
    # work, but its result was unconditionally overwritten there, so the
    # costly create_geodf_for_lc_map call ran twice.


In [None]:
# Choose the per-class sample target and draw the land-cover map points.
with dask.distributed.Client(
    processes=False,
    threads_per_worker=(os.cpu_count() or 2),
) as client:

    print(f"Dask dashboard: {client.dashboard_link}")
    print("Selecting dataset points ...")

    # Remains None for the 'std' dataset type.
    points_lc_df = None
    needs_lc_points = parameters["dataset_type"] != 'std'

    if needs_lc_points:

        class_counts = points_df['class_id'].value_counts()
        max_class_samples = class_counts.max()
        avg_class_samples = int(class_counts.mean())

        # For 'fullLC' the configured samples_per_class is used as is; any
        # other type derives it from the shapefile class counts.
        if parameters["dataset_type"] != 'fullLC':
            enhanced_type = parameters["enhanced_type"]
            if enhanced_type not in ("max", "avg"):
                raise ValueError("Invalid value for enhanced_type. Choose 'max' or 'avg'.")
            parameters["samples_per_class"] = (
                max_class_samples if enhanced_type == "max" else avg_class_samples
            )

        points_lc_df = create_geodf_for_lc_map(
            classes_masks,
            parameters,
            points_df,
            features_dataset,
            lccode2label,
            composites_crs,
        )
    

In [None]:
# Print the CRS of each layer and the class ids found in each point source.
print("Aligned shapefile points CRS:\n", points_df.crs)

lc_points_loaded = points_lc_df is not None
if lc_points_loaded:
    print("Loaded LC map points:\n", points_lc_df.crs)

    shapefile_class_ids = points_df['class_id'].unique()
    lcmap_class_ids = points_lc_df['class_id'].unique()
    ids_shp = set(shapefile_class_ids)
    ids_lcmap = set(lcmap_class_ids)

    print(f"Classes in shapefile: {ids_shp}")
    print(f"Classes in lcmap: {ids_lcmap}")

print("Features and composites CRS:\n", features_dataset.spatial_ref)

## Generate final .csv dataset

In [None]:
# Assemble the point set for the configured dataset type, save its class
# distribution report(s), then extract the features for every point.
with dask.distributed.Client(
    processes=False,
    threads_per_worker=(os.cpu_count() or 2),
) as client:

    print(f"Dask dashboard: {client.dashboard_link}")

    dataset_type = parameters["dataset_type"]
    if dataset_type not in ("enhanced", "fullLC", "std"):
        # Fail with a clear message instead of a NameError on `dataset` below.
        raise ValueError(
            f"Unsupported dataset_type {dataset_type!r}; expected 'enhanced', 'fullLC' or 'std'."
        )

    if dataset_type in ("enhanced", "fullLC"):
        # Keep only the classes present in both the shapefile points and the
        # land-cover map points.
        ids_shp = set(points_df['class_id'].unique())
        ids_lcmap = set(points_lc_df['class_id'].unique())
        common_ids = ids_shp & ids_lcmap
        points_df = points_df[points_df['class_id'].isin(common_ids)]
        points_lc_df = points_lc_df[points_lc_df['class_id'].isin(common_ids)]

    if dataset_type == "enhanced":

        dataset = enhance_dataset(points_df, points_lc_df, target_class_col="class_id", samples_per_class=parameters["samples_per_class"])
        inspect_class_distribution(dataset,
                                   class_column='class',
                                   output_path = f"{curr_reports_path}/classes_distribution.png",
                                   title = f"Enhanced ({parameters['enhanced_type']}) dataset class distribution - Total points: {len(dataset)}")

    elif dataset_type == "fullLC":

        dataset = pd.concat([points_df, points_lc_df[["x","y","class_id","class","split"]]])

        # Each split is selected once and reused for the plot and its title.
        train_points = dataset[dataset['split'] == 'train']
        test_points = dataset[dataset['split'] == 'test']

        inspect_class_distribution(train_points,
                                   class_column='class',
                                   output_path = f"{curr_reports_path}/train_classes_distribution.png",
                                   title = f"FullLC dataset train split class distribution - Total points: {len(train_points)}")

        inspect_class_distribution(test_points,
                                   class_column='class',
                                   output_path = f"{curr_reports_path}/test_classes_distribution.png",
                                   title = f"FullLC dataset test/val split class distribution - Total points: {len(test_points)}")

    else:  # "std": shapefile points only
        dataset = points_df[["x","y","class_id","class","split"]]
        inspect_class_distribution(dataset,
                                   class_column='class',
                                   output_path = f"{curr_reports_path}/classes_distribution.png",
                                   title = f"Standard dataset class distribution - Total points: {len(dataset)}")

    print("Generating final dataset ...")

# NOTE(review): this call runs after the Dask client above has been closed,
# unlike the other cells. Confirm whether that is intentional before moving it
# inside the `with` block.
dt = extract_features_for_points(features_dataset, dataset, dataset_path)



## Plot UniTN/(UniTN + PoliMI) points

In [None]:
# The LC-map points are only plotted for the two dataset types that use them.
if parameters["dataset_type"] in ("fullLC", "enhanced"):
    folium_map1 = generate_maps(
        points=points_lc_df,
        tile_geometry=tile_geometry,
        output_file=lc_points_map_path,
    )

# The shapefile points are always plotted.
folium_map2 = generate_maps(
    points=points_df,
    tile_geometry=tile_geometry,
    output_file=points_map_path,
)
        


## Inspect matching

In [None]:
# verify_coordinates receives a copy, so `points_df` itself is not handed over.
verified_points = verify_coordinates(points_df.copy(), features_dataset)

preview_columns = ["geometry", "x", "y", "recalc_lon", "recalc_lat", "diff_lon", "diff_lat"]
print(verified_points[preview_columns].head())

# Summary statistics of the longitude and latitude differences.
lon_diff = verified_points['diff_lon']
lat_diff = verified_points['diff_lat']
print(f"Average longitude difference: {lon_diff.mean()}")
print(f"Min/Max longitude difference: {lon_diff.min()} - {lon_diff.max()}")
print(f"Average latitude difference: {lat_diff.mean()}")
print(f"Min/Max latitude difference: {lat_diff.min()} - {lat_diff.max()}")

# Each report helper gets the verified points and the reports folder.
for make_report in (
    plot_distance_histogram,
    visual_match_verification_l1,
    visual_match_verification_l2,
):
    make_report(verified_points, save_path=curr_reports_path)