# Data Augmentation

## Bar Plot of RMSEs

In [None]:
from ltm.features import load_raster
from ltm.models import bands_from_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import pandas as pd
from sklearn.dummy import DummyRegressor
from pathlib import Path
import numpy as np

# Cached summary of the RMSE reached at every experiment stage
report_path = "../reports/generalization.csv"

if Path(report_path).exists():
    # Reuse the stored report instead of re-running the cross-validation
    df = pd.read_csv(report_path)
else:
    target_path = "../data/processed/target.tif"
    feature_paths = {
        "Baseline": "../data/processed/ground_truth/data_2A.tif",
        "Band Importance": "../data/processed/band_importance/data.tif",
        "Compositing": "../data/processed/data.tif",
    }

    # Load the feature rasters of the three experiments, then the target
    features = {label: load_raster(path) for label, path in feature_paths.items()}
    target = load_raster(target_path)

    # Restrict the band-importance experiment to the selected bands
    sentinel_bands, index_bands = bands_from_importance(
        "../reports/band_importance.csv"
    )
    selected = [f"1 {band} mean" for band in sentinel_bands + index_bands]
    features["Band Importance"] = features["Band Importance"][selected]

    # Keep only the rows that have a label
    labelled = target.notna()
    features = {label: frame[labelled] for label, frame in features.items()}
    target = target[labelled]

    scoring = "neg_root_mean_squared_error"

    def mean_cv_rmse(estimator, X):
        """Return the mean cross-validated RMSE of estimator on X against target."""
        # The scorer returns negated RMSE, hence the sign flip
        return -cross_val_score(
            estimator, X, target, scoring=scoring, n_jobs=-1
        ).mean()

    # The dummy regressor only predicts the mean, so constant zero features suffice
    scores = {
        "Dummy Regressor": mean_cv_rmse(
            DummyRegressor(strategy="mean"),
            np.zeros_like(target).reshape(-1, 1),
        )
    }

    rf = RandomForestRegressor(n_jobs=-1, random_state=42)
    for label, frame in features.items():
        scores[label] = mean_cv_rmse(rf, frame)

    # The tuned score is taken from the hyperparameter tuning report
    ht_df = pd.read_csv("../reports/hyperparameter_tuning.csv")
    scores["Hyperparameter Tuning"] = ht_df["Root Mean Squared Error"].min()

    df = pd.DataFrame(
        {
            "Experiment": list(scores),
            "Root Mean Squared Error": list(scores.values()),
        }
    )

    df.to_csv(report_path, index=False)

In [None]:
# Bar plot of the RMSE of every experiment, with the dummy regressor as reference
import matplotlib.pyplot as plt
import seaborn as sns
import scienceplots  # not referenced by name; imported so the "science" style is available

plt.style.use("science")

rmse = df["Root Mean Squared Error"]
experiments = df["Experiment"]

# Row 0 holds the dummy regressor; it is drawn as a dashed reference line so the
# experiments (rows 1 onwards) can be read against it. Positional access via
# .iloc keeps this independent of the index labels of df.
plt.bar(experiments.iloc[1:], rmse.iloc[1:])
plt.axhline(
    rmse.iloc[0],
    color="red",
    linestyle="--",
    label=experiments.iloc[0],
)
# Experiment names are long, so tilt them to avoid overlapping tick labels
plt.xticks(rotation=45, ha="right")
plt.xlabel("Experiment")
plt.ylabel("RMSE")
plt.title("Generalization of Experiments")
plt.ylim(0, 0.5)
plt.legend()
plt.show()

## Regular Generalization

Plan: Predict on 2018 with the regular model (fitted on 2017). The predictions are expected to be poor, so the next step is to train the model on either 2017 and 2019, or on 2017–2023 (excluding 2018).

In [None]:
# Rasterize the study-area shapefile into an empty target raster (NaN outside of the shape)
from pathlib import Path

from ltm.data import shapefile2raster

# Study area, year and batch size picked up by the following cells
name = "Freisinger Forst"
year = 2018
# Batch size per area: 25 for Freisinger Forst, 100 for Peterfecking, 200 for Traunstein, None for Brunnstube
batch_size = 25

shapefile_path = f"../data/raw/{name}/{name}.shp"
target_path = f"../data/processed/generalization/{name}.tif"

# Make sure the output folder exists, then rasterize only if the raster is missing
raster_file = Path(target_path)
raster_file.parent.mkdir(parents=True, exist_ok=True)
raster_missing = not raster_file.exists()
if raster_missing:
    shapefile2raster(raster_path=target_path, shapefile_path=shapefile_path)

In [None]:
# Build the data for the target area and year
from ltm.models import create_data

# The data folder sits next to the target raster and is named after its stem
stem = Path(target_path).stem
data_folder = str(Path(target_path).with_name(stem))
create_data(year, target_path, data_folder, batch_size=batch_size)

In [None]:
# Run the best tuned model on the newly created data
import dill

# The tuning report is indexed by its first column; the index label of the row
# with the lowest RMSE is used as the model file name.
# Note: this rebinds df, which earlier held the generalization report.
df = pd.read_csv("../reports/hyperparameter_tuning.csv", index_col=0)
best_model = df["Root Mean Squared Error"].idxmin()

model_file = f"../models/{best_model}.pkl"
with open(model_file, "rb") as handle:
    model = dill.load(handle)

# Data raster created for the selected year and target area
data_path = Path(data_folder, f"{year}/{stem}.tif")
data = load_raster(str(data_path))

xgb_pred = model.predict(data)

# NEXT

In [None]:
import rasterio
from rasterio.plot import show

from ltm.data import download_dlt_2018

# download_dlt_2018 receives the Freisinger Forst raster and a scratch output path
# (presumably it fetches the 2018 DLT layer for that raster — confirm in ltm.data)
reference_raster = "../data/processed/generalization/Freisinger Forst.tif"
dlt_raster = "tmp.tif"
download_dlt_2018(reference_raster, dlt_raster)

# Display the downloaded raster without interpolating between pixels
with rasterio.open(dlt_raster) as dataset:
    show(dataset, interpolation="nearest", cmap="viridis")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from ltm.features import load_raster

freising = load_raster("tmp.tif")
prediction = load_raster("../data/processed/generalization/Freisinger Forst.tif")

# Keep only pixels that are present in both rasters and whose reference value is not 240
has_prediction = prediction.notna()
has_reference = freising.notna()
not_240 = freising != 240
mask = has_prediction & has_reference & not_240
freising, prediction = freising[mask], prediction[mask]

# Optional: enable the science style
# import scienceplots
# plt.style.use("science")

# Violin plot of the prediction grouped by reference value, limited to [0, 1]
ax = sns.violinplot(x=freising, y=prediction, inner="quart")
ax.set_ylim(0, 1)
plt.show()

In [None]:
# Create data for each year
# tqdm is used for the progress bars below but is not imported anywhere else in the notebook
from tqdm import tqdm

# Note: this cell rebinds target_path, data_folder, year and stem, which earlier
# cells set for the Freisinger Forst area.
target_path = "../data/processed/target.tif"
data_folder = "../data/processed/generalization/data/"
years = [2017, 2018, 2019, 2020, 2021, 2022]  # , 2023]

for year in tqdm(years, desc="Years"):
    create_data(year, target_path, data_folder)

# Combine all data into one dataframe
print("Combining data...")
# NOTE(review): the file stem is taken from the name of data_folder ("data");
# confirm this matches the file name that create_data writes for target_path.
stem = Path(data_folder).stem
# Load every year first and concatenate once, instead of growing a frame per iteration
yearly_data = [
    load_raster(str(Path(data_folder) / f"{year}/{stem}.tif"))
    for year in tqdm(years, desc="Years")
]
total_data = pd.concat(yearly_data)

# Create target data: the same target raster is repeated once per year
target = load_raster(target_path)
total_target = pd.concat([target] * len(years))

# Drop rows with NaN label
labelled = total_target.notna()
data, target = total_data[labelled], total_target[labelled]

In [None]:
# Refit the best tuned model on the new data of the study area
import dill
from sklearn.base import clone

# The index label of the lowest-RMSE row of the tuning report names the model file
df = pd.read_csv("../reports/hyperparameter_tuning.csv", index_col=0)
best_model = df["Root Mean Squared Error"].idxmin()

model_file = f"../models/{best_model}.pkl"
with open(model_file, "rb") as handle:
    pickled_model = dill.load(handle)

# Fit a clone rather than the loaded object, in case warm_start=True
model = clone(pickled_model)
model.fit(data, target)

# Last expression of the cell: shows the fitted model in the notebook
model

# Cross-val predict on seen data

# Prediction on unseen data

In [None]:
# Save the prediction by overwriting the shape raster
import numpy as np
import rasterio

# NOTE(review): target_path is also rebound by the "Create data for each year"
# cell; confirm it still points at the shape raster when this cell runs.
with rasterio.open(target_path) as src:
    profile = src.profile
    # Read the raster once; both the shape and the NaN mask derive from it
    raster = src.read()

shape = raster.shape
nan_mask = np.isnan(raster)

# Prepare the masked prediction before the file is opened for writing, so a
# failure here (e.g. a shape mismatch) cannot leave a truncated raster behind.
xgb_reshaped = xgb_pred.reshape(shape)
xgb_reshaped[nan_mask] = np.nan  # keep NaN wherever the original raster had NaN

with rasterio.open(target_path, "w", **profile) as dst:
    dst.write(xgb_reshaped)
    dst.descriptions = ("Conifer Proportion",)

In [None]:
# Use science style
# Plot setup for the following cells: SVG inline figures and the "science" style.
import matplotlib.pyplot as plt
# scienceplots is not referenced by name below; presumably it is imported so that
# the "science" style is registered with matplotlib — confirm.
import scienceplots

# IPython magic (not plain Python): sets the inline backend's figure format to SVG
%config InlineBackend.figure_format = 'svg'

plt.style.use("science")

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

# Move the first axis of the prediction array to the end before plotting it as an image
image = xgb_reshaped.transpose(1, 2, 0)

# Fix the colour scale to [0, 1] and show raw pixels without interpolation
norm = Normalize(vmin=0, vmax=1)
imshow_options = {"cmap": "viridis", "norm": norm, "interpolation": "nearest"}
plt.imshow(image, **imshow_options)
plt.colorbar(shrink=0.8)
plt.axis("off")

plt.show()

In [None]:
import matplotlib.dates as mdates
import seaborn as sns

# Kernel density estimate over all values of the prediction array
values = xgb_reshaped.flatten()
ax = sns.kdeplot(values)

plt.show()