# DEM Analysis
In this notebook, DEM features are extracted and analyzed to determine whether they are related to the clusters that were found in our AOI.

In [None]:
import sys
sys.path.append("./notebooks")
sys.path.append("../src")


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from data_processing.dem_processor import DEMProcessor
from scipy.stats import kruskal
from sklearn.preprocessing import RobustScaler
from mord import LogisticAT

## DEM-Feature Workflow

In [None]:
# Create the DEM processor with its default settings. The cells below use it to
# extract the feature table and to reconstruct 2-D arrays for plotting.
dp = DEMProcessor()

In [None]:
# Extract the DEM features as a DataFrame. Later cells read the terrain feature
# columns and the cluster "label" column from it.
dem_feature_df = dp.extract_features_as_df()
# Bare expression: display the resulting table as the cell output.
dem_feature_df

In [None]:
# 97.5 % quantile of the upstream contributing area column ("uca").
dem_feature_df["uca"].quantile(0.975)

**TODO:** Explain that this value was the reason for my stream threshold.

## Visualizations of DEM Features

In [None]:
# Visualization settings
#
# One row per plotted column: (column, title, colormap, vmax).
# The lower colour limit (vmin) is None for every column, and "uca" is the only
# column with an explicit upper colour limit.
_feature_specs = [
    ('height', "Height", 'cividis', None),
    ('slope_deg', "Slope Angle in Degree", 'viridis', None),
    ('aspect_deg', "Aspect Angle in Degree", 'twilight', None),
    ('northness', "Northness of Aspect", 'RdBu_r', None),
    ('eastness', "Eastness of Aspect", 'RdBu_r', None),
    ('tpi', "Topographic Position Index (Radius 250m)", 'RdYlGn', None),
    ('twi', "Topographic Wetness Index", 'viridis', None),
    ('uca', "Upstream Contributing Area", 'viridis', 1e6),
    ('log_uca', "Upstream Contributing Area Log10", 'viridis', None),
    ('distance_to_stream', "Distance to the next stream", 'plasma', None),
    ('label', "Cluster label", 'viridis', None),
]

# Expand the table into the {column: {vmin, vmax, title, cmap}} mapping used by the plots.
dem_features = {
    column: {'vmin': None, 'vmax': vmax, 'title': title, 'cmap': cmap}
    for column, title, cmap, vmax in _feature_specs
}

In [None]:
# All configured columns except the cluster label are treated as DEM features.
features = [name for name in dem_features if name != "label"]

In [None]:
# Draw one map per configured column (the cluster label included).
for feature, style in dem_features.items():
    # Turn the flat column back into a 2-D array so it can be shown as an image.
    grid = dp.geometry_processor.reconstruct_2d(dem_feature_df[feature])

    fig, ax = plt.subplots(figsize=(10, 8))
    im = ax.imshow(
        grid,
        cmap=style["cmap"],
        vmin=style["vmin"],
        vmax=style["vmax"],
    )
    ax.set_title(style["title"])

    # Shrunken colour bar, labelled with the raw column name.
    cbar = fig.colorbar(im, ax=ax, shrink=0.5)
    cbar.set_label(feature)

    plt.tight_layout()

**TODO:** Analyze the visualizations.

#### Features per Cluster

In [None]:
# Violin plot of every DEM feature grouped by cluster label, one panel per feature.
fig, axes = plt.subplots(3, 4, figsize=(20, 12), sharex=True)
panels = axes.flatten()

for ax, col in zip(panels, features):
    sns.violinplot(data=dem_feature_df, x="label", y=col, ax=ax)
    ax.set_title(dem_features[col]["title"])

# The 3x4 grid can offer more panels than there are features; hide the leftover
# axes so that no empty frames are drawn.
for unused_ax in panels[len(features):]:
    unused_ax.set_visible(False)

# With sharex=True only the bottom row shows x tick labels. If a bottom panel was
# hidden, re-enable the labels on the lowest panel that is still visible in that column.
for column_axes in axes.T:
    visible_axes = [panel for panel in column_axes if panel.get_visible()]
    if visible_axes:
        visible_axes[-1].tick_params(labelbottom=True)

plt.tight_layout()
plt.show()

**TODO:** Describe the violin plots and note that there are no large differences between the clusters.

In [None]:
# Box plot of every DEM feature grouped by cluster label, one panel per feature.
fig, axes = plt.subplots(3, 4, figsize=(20, 12), sharex=True)
panels = axes.flatten()

for ax, col in zip(panels, features):
    sns.boxplot(data=dem_feature_df, x="label", y=col, ax=ax)
    ax.set_title(dem_features[col]["title"])

# The 3x4 grid can offer more panels than there are features; hide the leftover
# axes so that no empty frames are drawn.
for unused_ax in panels[len(features):]:
    unused_ax.set_visible(False)

# With sharex=True only the bottom row shows x tick labels. If a bottom panel was
# hidden, re-enable the labels on the lowest panel that is still visible in that column.
for column_axes in axes.T:
    visible_axes = [panel for panel in column_axes if panel.get_visible()]
    if visible_axes:
        visible_axes[-1].tick_params(labelbottom=True)

plt.tight_layout()
plt.show()

## Statistical Tests

#### Kruskal-Wallis Test

In [None]:
# Kruskal-Wallis H-test per DEM feature: does the feature's distribution differ
# between the four cluster labels? An effect size is stored next to each p-value.
results = []

for feat in features:
    # One sample per cluster label (0-3), with missing values removed.
    groups = [
        dem_feature_df.loc[dem_feature_df['label'] == i, feat].dropna().values
        for i in range(4)
    ]
    stat, p = kruskal(*groups)

    # Effect size (H - k + 1) / (n - k). n has to be the number of observations that
    # actually entered the test (after dropping NaNs and rows with other labels);
    # len(dem_feature_df) would understate the effect whenever rows were excluded.
    # NOTE(review): this expression is usually published as the H-based eta squared,
    # whereas epsilon squared is commonly given as H / (n - 1). The 'epsilon_sq' name
    # is kept so existing references keep working - confirm which estimator is intended.
    n_groups = len(groups)
    n_obs = sum(len(group) for group in groups)
    epsilon_sq = (stat - n_groups + 1) / (n_obs - n_groups)

    results.append({'feature': feat, 'H_stat': stat, 'p_value': p, 'epsilon_sq': epsilon_sq})

# Strongest effect first; the bare expression displays the table.
results_df = pd.DataFrame(results).sort_values('epsilon_sq', ascending=False)
results_df

**TODO:** Explain that every one of these tests is "significant" because of the 660k instances, but that even the best feature explains only 5% of the variance of the ranks in the Kruskal-Wallis test, which is quite low.

#### Logistic Regression

In [None]:
# Recoding of the cluster labels into the classes used as regression target:
# cluster 0 -> 3, cluster 1 -> 2, clusters 2 and 3 -> 1 (merged into one class).
# NOTE(review): the name suggests these are ordered health classes - confirm the ordering.
health_mapping = {0: 3, 1: 2, 2: 1, 3: 1}

In [None]:
# Work on a private copy so the recoded labels do not leak into dem_feature_df.
logistical_regression_df = dem_feature_df.copy(deep=True)
logistical_regression_df["label"] = logistical_regression_df["label"].replace(health_mapping)

# Target: the recoded label as integers.
y = logistical_regression_df["label"].astype(int)
# Predictors: everything except the target, the raw aspect angle and the raw "uca".
# NOTE(review): presumably dropped because northness/eastness and log_uca already
# carry that information - confirm.
X = logistical_regression_df.drop(columns=["label", "aspect_deg", "uca"])

# Scale the predictors before fitting so the coefficients are comparable.
scaler = RobustScaler()
X_norm = scaler.fit_transform(X)

logistic_model = LogisticAT()
logistic_model.fit(X_norm, y)

# One line per predictor with its fitted coefficient.
for name, weight in zip(X.columns, logistic_model.coef_):
    print(f"{name}: {weight!s}")

In [None]:
# Bar chart of the fitted coefficients, ordered by absolute magnitude (largest first).
#
# The predictor names get their own variable. Rebinding the notebook-wide `features`
# list here would silently change which columns the violin/box-plot and Kruskal
# cells iterate over when they are re-run after this cell.
model_features = list(X.columns)
coefficients = np.ravel(logistic_model.coef_)

# Indices that sort the coefficients by descending absolute value.
sorted_idx = np.argsort(np.abs(coefficients))[::-1]

features_sorted = np.array(model_features)[sorted_idx]
coefficients_sorted = coefficients[sorted_idx]

plt.figure()
plt.bar(features_sorted, coefficients_sorted)

plt.xticks(rotation=90)
plt.ylabel("Coefficient Value")
plt.title("Logistic Regression Coefficients")

plt.tight_layout()
plt.show()

**TODO:** Analyze the coefficients; east–west exposure appears to be more relevant than north–south exposure.

In [None]:
# McFadden's pseudo-R^2: 1 - LL(fitted model) / LL(null model).
proba = logistic_model.predict_proba(X_norm)

# The classes are 1..3, so the probability of class c is read from column c - 1.
true_class_col = y - 1
row_idx = np.arange(len(y))
log_likelihood = np.log(proba[row_idx, true_class_col]).sum()

# Null model: every observation gets the overall relative frequency of its own class.
class_probs = np.array([np.mean(y == cls) for cls in (1, 2, 3)])
log_likelihood_null = np.log(class_probs[true_class_col]).sum()

mcFadden_r2_score = 1 - log_likelihood / log_likelihood_null
mcFadden_r2_score

This does not mean that topography is irrelevant — rather, the dominant drivers lie elsewhere. Bark beetles, drought periods, and stand age are probably the actual causes, and topography only weakly modulates the risk.