# Cross-val predict on seen data

# Prediction on unseen data

In [None]:
# Create empty target raster from shape with NaN outside of shape
from ltm.data import shapefile2raster
from pathlib import Path

# Shapefile of the unseen area and the raster file it is converted into.
# Both stay plain strings because later cells pass them on as-is.
shapefile_path = "../data/raw/Forst/Forst.shp"
target_path = "../data/processed/generalization/Forst.tif"

# Make sure the output folder exists before anything is written into it
target_file = Path(target_path)
target_file.parent.mkdir(parents=True, exist_ok=True)

# The conversion is skipped when the target raster is already on disk
if not target_file.exists():
    shapefile2raster(
        target_path,
        shapefile_path,
    )

In [None]:
# Get optimal composite values
import pandas as pd

# Report with one row per (reducer, number of composites) combination and
# the metric columns used to compare them
metric = "Root Mean Squared Error"
df = pd.read_csv("../reports/reducer_composites.csv")

# Reducers to keep, in the order they should appear in the result
reducers = [
    "median",
    "mean",
    "mode",
    "min",
    "max",
    "sampleVariance",
    "kendallsCorrelation",
    "skew",
    "kurtosis",
]

# Row label of the lowest-error entry within each reducer group
optimal_idx = df.groupby("Reducer")[metric].idxmin()

# Keep only those best rows, key them by reducer name and select the
# reducers listed above
optimal_df = (
    df.loc[optimal_idx]
    .set_index("Reducer")
    .loc[reducers]
)

# Map each reducer to the value of its "Composites" column
composite_dict = optimal_df["Composites"].to_dict()

composite_dict

In [None]:
# List the optimal bands
from ltm.models import bands_from_importance

# Derive the Sentinel bands and indices to use from the band importance report
band_importance_path = "../reports/band_importance.csv"
sentinel_bands, indices = bands_from_importance(band_importance_path)

sentinel_bands, indices

In [None]:
# Create composite
from ltm.data import sentinel_composite
from ltm.features import load_raster, interpolate_data, to_float32, save_raster
from datetime import datetime
from tqdm.notebook import tqdm
from time import sleep

# Define start year, starting at July 1st until June 30th of the next year
YEAR = 2020

# Seconds to wait before retrying a failed composite request (five minutes)
RETRY_DELAY_SECONDS = 300

# Create the composite folder next to the target raster, named after it
stem = Path(target_path).stem
composite_folder = Path(target_path).parent / stem
composite_folder.mkdir(exist_ok=True)

# Handle each reducer separately. The per-reducer frames are collected in a
# list and concatenated once after the loop, which avoids re-copying an
# ever-growing DataFrame on every iteration.
reducer_frames = []
for reducer, num_composites in tqdm(composite_dict.items()):
    composite_path = f"{composite_folder}/{stem}_{reducer}_{num_composites}.tif"

    # Create the composite if it does not exist; keep retrying until the
    # file is on disk
    while not Path(composite_path).exists():
        try:
            sentinel_composite(
                target_path,
                composite_path,
                time_window=(datetime(YEAR, 7, 1), datetime(YEAR + 1, 7, 1)),
                num_composites=num_composites,
                temporal_reducers=[reducer],
                indices=indices,
                sentinel_bands=sentinel_bands,
            )
        except ValueError as err:
            # Report the failure instead of waiting silently, so a permanent
            # error is distinguishable from a temporary one, then retry
            print(
                f"Composite for reducer '{reducer}' failed ({err}); "
                f"retrying in {RETRY_DELAY_SECONDS} s"
            )
            sleep(RETRY_DELAY_SECONDS)

    # Prepare the composite for stacking
    data = load_raster(composite_path)
    data = interpolate_data(data)
    # Drop columns that still contain NaN after interpolation; per the
    # original note this targets the Kendall's Correlation P-Value column
    data = data.dropna(axis=1)
    data = to_float32(data)
    reducer_frames.append(data)

# Combine into one raster (an empty frame if there was nothing to process)
total_data = pd.concat(reducer_frames, axis=1) if reducer_frames else pd.DataFrame()

# Save the total data unless a previous run already wrote it
total_data_path = f"{composite_folder}/{stem}.tif"
if not Path(total_data_path).exists():
    save_raster(total_data, target_path, total_data_path)

# Load the data back from disk so later cells work on the saved raster
data = load_raster(total_data_path)

In [None]:
# Train best model on labeled data of the study area
import dill
from ltm.features import drop_nan_rows

# The first column of the tuning report is used as the index, so idxmin()
# returns the index label of the entry with the lowest error; that label is
# also the file name of the stored model
df = pd.read_csv("../reports/hyperparameter_tuning.csv", index_col=0)
rmse_scores = df["Root Mean Squared Error"]
best_model = rmse_scores.idxmin()

# Restore the stored model object
model_path = f"../models/{best_model}.pkl"
with open(model_path, "rb") as f:
    model = dill.load(f)

# Load features and target of the study area and clean them with
# drop_nan_rows before fitting
study_area_data, study_area_target = drop_nan_rows(
    load_raster("../data/processed/data.tif"),
    load_raster("../data/processed/target.tif"),
)

# Fit the restored model on the study area
model.fit(study_area_data, study_area_target)

model

In [None]:
# Predict on the composite data loaded for the unseen area; nothing is
# written to disk in this cell
# NOTE(review): the `xgb_` prefix suggests XGBoost, but `model` is whichever
# entry has the lowest RMSE in hyperparameter_tuning.csv — confirm the name
xgb_pred = model.predict(data)

xgb_pred

In [None]:
import rasterio
import matplotlib.pyplot as plt

# Only the raster's dimensions are needed, so take them from the dataset
# metadata instead of reading every band into memory. The tuple matches the
# (bands, rows, columns) layout of the array returned by src.read().
with rasterio.open(target_path) as src:
    shape = (src.count, src.height, src.width)

# Bring the prediction into the layout of the target raster
xgb_reshaped = xgb_pred.reshape(shape)

# imshow expects the band axis last: (rows, columns, bands)
plt.imshow(xgb_reshaped.transpose(1, 2, 0), cmap="viridis")

In [None]:
# Plot the prediction
import matplotlib.pyplot as plt
import scienceplots

# Switch matplotlib to the "science" style for the plots that follow.
# NOTE(review): `scienceplots` is never referenced by name; presumably
# importing it is what makes the "science" style available — confirm before
# removing the import.
plt.style.use("science")