In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import numpy as np
import json
import os
from shapely import geometry
from sklearn import metrics

In [None]:
# Overview table of the input layers: one row per layer, giving its name,
# its data type (categorical / numeric) and its grid resolution.
pd.DataFrame(
    [
        ("Vegetáció/területhasználat jellege", "kategorikus", "30m x 30m"),
        ("Tengerszint feletti magasság", "numerikus", "30m x 30m"),
        ("Terület lejtése (fokban)", "numerikus", "30m x 30m"),
        ("Úthálózat", "kategorikus", "15m x 15m"),
    ],
    columns=["Adat neve", "Adat típusa", "Felbontás"],
)

In [None]:
# Hyperparameters of the three model families, keyed by their display name.
# The logistic-regression entry uses `class_weight` (singular), the keyword
# that is actually passed to LogisticRegression in the downscaling cell.
k_dict = {
    'Logisztikus regresszió': {'class_weight': {0: 1, 1: 1}, 'solver': 'newton-cholesky'},
    'Random Forest': {'n_estimators': 250, 'min_samples_leaf': 500},
    'XGBRegressor': {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1},
}

# Print every model name followed by its parameter dict, separated by blank lines.
for model_name, params in k_dict.items():
    print(model_name)
    print('\t - ', params)
    print('\n')

## Topography variables

In [None]:
# Per-pixel static variables; the first CSV column is used as the index.
# Later cells read lon/lat, ELEVATION, SLOPE, Water, Urban and the distance columns from it.
static_df = pd.read_csv(
    "data/datasets/static_variables.csv",
    index_col=[0],
)

In [None]:
# Two-panel figure of the topography variables: elevation (left) and slope (right).
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
fig.suptitle("LANDFIRE adathalmaz topografikus változók", fontsize=24)
ax = ax.flatten()
for panel in ax:
    panel.set_axis_off()

# `plt.get_cmap` is used instead of `plt.cm.get_cmap`, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cm = plt.get_cmap("terrain")

# Truncate the colormap to its [a, b] fraction, resampled to n colours.
n = 100
a = 0.3
b = 1

new_cmap = colors.LinearSegmentedColormap.from_list(f'trunc({n},{a:.2f},{b:.2f})',
                                                    cm(np.linspace(a, b, n)))

# Left panel: elevation, with water pixels painted over in blue.
sc = ax[0].scatter(static_df["lon"], static_df["lat"], c=static_df["ELEVATION"], s=0.1, cmap=new_cmap)

is_water = static_df["Water"] == 1
ax[0].scatter(static_df.loc[is_water, "lon"],
              static_df.loc[is_water, "lat"],
              c="blue", s=0.1)
ax[0].set_title("Tengerszint feletti magasság", fontsize=18)
cbar = plt.colorbar(sc, ax=ax[0])
# Draw first, presumably so that the tick label texts are populated before a unit is appended.
plt.draw()
cbar.ax.set_yticklabels([tick.get_text() + " m" for tick in cbar.ax.get_yticklabels()])

# Right panel: slope in degrees.
sc = ax[1].scatter(static_df["lon"], static_df["lat"], c=static_df["SLOPE"], s=0.1, cmap="cividis")
ax[1].set_title("Lejtő dőlésszöge (°)", fontsize=18)
cbar = plt.colorbar(sc, ax=ax[1])
plt.draw()
cbar.ax.set_yticklabels([tick.get_text() + "°" for tick in cbar.ax.get_yticklabels()])

plt.savefig("data/output/landfire_plots.png")
plt.show()

In [None]:
# Two-panel figure of the anthropogenic variables: distance from roads (left)
# and distance from urban areas (right).
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
fig.suptitle("LANDFIRE adathalmaz antropogén hatások", fontsize=24)
for panel in ax:
    panel.set_axis_off()

road_df = pd.read_csv("data/input/ROADS/roads_data.csv", index_col=[0])

# Left panel: per-pixel distance from roads with the road points overlaid.
sc = ax[0].scatter(static_df["lon"], static_df["lat"], c=static_df["DISTANCE_FROM_ROADS"], s=0.1)
ax[0].plot(road_df["lon"].values, road_df["lat"].values,
           'o', markersize=0.36, c="grey", label="főút")
legend = ax[0].legend(loc="upper right", fontsize=14)
# Enlarge the legend markers through the public `Legend.get_lines()` accessor;
# `legend.legendHandles` / `_legmarker` no longer exist in current Matplotlib.
for handle in legend.get_lines():
    handle.set_markersize(12)
ax[0].set_title("Pixelek távolsága az utaktól", fontsize=18)
cbar = plt.colorbar(sc, ax=ax[0])
plt.draw()
cbar.ax.set_yticklabels([tick.get_text() + " km" for tick in cbar.ax.get_yticklabels()])

# Right panel: per-pixel distance from urban areas.
# Both coordinates come from `static_df`; the original took "lat" from
# `latlon_df`, which is not defined anywhere in the visible notebook.
sc = ax[1].scatter(static_df["lon"], static_df["lat"], c=static_df["DISTANCE_FROM_URBAN_AREA"], s=0.1)
ax[1].plot(road_df["lon"].values, road_df["lat"].values, 'o', markersize=0.36, c="grey", label="főút")
is_urban = static_df["Urban"] == 1
ax[1].plot(static_df.loc[is_urban, "lon"].values,
           static_df.loc[is_urban, "lat"].values,
           'o', markersize=0.36, c="red", label="lakott terület")
legend = ax[1].legend(loc="upper right", fontsize=14)
for handle in legend.get_lines():
    handle.set_markersize(12)
ax[1].set_title("Pixelek távolsága a lakott területektől", fontsize=18)
cbar = plt.colorbar(sc, ax=ax[1])
plt.draw()
cbar.ax.set_yticklabels([tick.get_text() + " km" for tick in cbar.ax.get_yticklabels()])

plt.savefig("data/output/road_urban_data_plot.png")
plt.show()

## Land use

In [None]:
# Lookup of the (lon, lat) pairs that belong to the study grid.
# NOTE(review): the original built this from `latlon_df`, which is not defined
# anywhere in the visible notebook. `static_df` is assumed to carry the same
# lon/lat grid — confirm before relying on the result.
in_tuples = dict.fromkeys(zip(static_df["lon"], static_df["lat"]), 1)

# Keep only the fuel/land-use points whose coordinates lie on that grid.
# Zipping the two columns avoids a slow row-wise DataFrame.apply and also
# works when the frame is empty.
fuel_df = pd.read_csv("data/input/FUEL/coord_df.csv", index_col=[0])
on_grid = [(lon, lat) in in_tuples for lon, lat in zip(fuel_df["lon"], fuel_df["lat"])]
fuel_df = fuel_df.loc[on_grid].reset_index(drop=True)

In [None]:
# Display name and plot colour for each fuel / land-use code.
# These lookups are defined before the summary table because the table reads
# `fuel_num_map`; in the original the definitions sat in a later cell, so a
# fresh Restart & Run All failed with a NameError.
fuel_num_map = {i: "FBFM növényzet" for i in range(13)}
fuel_num_map.update({91: "lakott terület", 92: "hó-/jégtakaró", 93: "mezőgazdasági", 98: "vízfelszín", 99: "kopár/sziklás"})

fuel_color_map = {i: "forestgreen" for i in range(13)}
fuel_color_map.update({91: "red", 92: "lightgrey", 93: "yellow", 98: "blue", 99: "grey"})

# Summary table: one column per code present in `fuel_df["val"]`, holding its
# land-use name and the number of grid points with that code.
fuel_counts = fuel_df["val"].value_counts().to_dict()
fuel_one_hot = {code: [fuel_num_map[code], count] for code, count in fuel_counts.items()}

pd.DataFrame(fuel_one_hot, index=['területhasználat', 'rácspontok száma'])

In [None]:
# Map of the vegetation / land-use classes, one colour per class group.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_axis_off()

# Codes 0-12 are drawn as a single group; the remaining codes get one group each.
# The first code of a group selects its colour and legend label.
for codes in [[*range(13)], [91], [92], [93], [98], [99]]:
    group = fuel_df.loc[fuel_df["val"].isin(codes)]
    ax.plot(group["lon"].values, group["lat"].values, 'o',
            c=fuel_color_map[codes[0]], markersize=0.36, label=fuel_num_map[codes[0]])

legend = ax.legend(loc="upper right", fontsize=18)
# Enlarge the legend markers through the public `Legend.get_lines()` accessor;
# `legend.legendHandles` / `_legmarker` no longer exist in current Matplotlib.
for handle in legend.get_lines():
    handle.set_markersize(12)
ax.set_title("LANDFIRE adathalmaz \n vegetáció / területhasználat", fontsize=24)
plt.savefig("data/output/fuel_data_plot.png")
plt.show()

## Temperature and PDSI plot

In [None]:
# Monthly kriged grid read from the DROUGHT input folder (file 2020-08).
kriged_data = pd.read_csv(
    "data/input/DROUGHT/monthly_data/2020-08.csv",
    index_col=[0],
)
kriged_data

# Monthly kriged grid read from the WEATHER/TAVG input folder (file 2020-8).
kriged_temp_df = pd.read_csv(
    "data/input/WEATHER/TAVG/monthly_data/2020-8.csv",
    index_col=[0],
)
kriged_temp_df

In [None]:
# Truncated "Spectral" colormap: only the [a, b] fraction of the original,
# resampled to n colours.
# `plt.get_cmap` is used instead of `plt.cm.get_cmap`, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cm = plt.get_cmap("Spectral")

n = 100
a = 0
b = 0.6

new_cmap = colors.LinearSegmentedColormap.from_list(
    f'trunc({n},{a:.2f},{b:.2f})',
    cm(np.linspace(a, b, n)),
)

In [None]:
# Two-panel figure: kriged average temperature with the measuring stations (left)
# and kriged Palmer drought index with the grid points that have data (right).
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
for panel in ax:
    panel.set_axis_off()

# NOTE(review): `temp_df` (station coordinates) is not defined in any visible
# cell; it must already exist in the kernel for this cell to run.
sc = ax[0].scatter(kriged_temp_df["lon"], kriged_temp_df["lat"], c=kriged_temp_df["kriged_val"], s=0.1, cmap='YlOrRd')
ax[0].plot(temp_df["LONGITUDE"].values,
           temp_df["LATITUDE"].values,
           marker='x', linestyle='', c='black', label="mérőállomások")
ax[0].set_title("Átlaghőmérséklet", fontsize=24)
cbar = plt.colorbar(sc, ax=ax[0])
legend = ax[0].legend(fontsize=14)
# Resize the legend markers through the public `Legend.get_lines()` accessor;
# `legend.legendHandles` / `_legmarker` no longer exist in current Matplotlib.
for handle in legend.get_lines():
    handle.set_markersize(8)

# Grid points with data, restricted to those inside the `CA` geometry.
# NOTE(review): `xx`, `yy` and `CA` are not defined in any visible cell either.
df = pd.DataFrame({"lon": xx, "lat": yy})
df = df.dropna().reset_index(drop=True)
mask = np.array([CA.contains(geometry.Point(lon, lat)) for lon, lat in zip(df["lon"], df["lat"])],
                dtype=bool)

df = df.loc[mask].reset_index(drop=True)

sc = ax[1].scatter(kriged_data["lon"], kriged_data["lat"], c=kriged_data["kriged_val"], s=0.1, cmap=new_cmap)
ax[1].plot(df["lon"].values, df["lat"].values,
           marker='x', linestyle='', c="black", label="adatokkal rendelkező \n rácspontok")
ax[1].set_title("Átlag Palmer szárazságindex", fontsize=24)
cbar = plt.colorbar(sc, ax=ax[1])
legend = ax[1].legend(loc='upper right', fontsize=14)
for handle in legend.get_lines():
    handle.set_markersize(8)

plt.savefig("data/output/mérőállomások_krigeléssel.png")
plt.show()

## Downscaling

In [None]:
# Validation set; the first CSV column is the index. The downscaling cell
# splits it into the "TARGET" column and the remaining feature columns.
validation_df = pd.read_csv(
    "data/datasets/validation_2020.csv",
    index_col=[0],
)

In [None]:
# The 20 training samples (files 0.csv ... 19.csv), keyed by their sample number.
sample_datasets = {
    sample_id: pd.read_csv(f"data/datasets/final_datasets/{sample_id}.csv", index_col=[0])
    for sample_id in range(20)
}

In [None]:
def _optimal_scale(preds, actual, upper, steps):
    """Grid-search the multiplier c that minimises sum((c * preds - actual) ** 2).

    The candidates are `steps` evenly spaced values in [0.0001, upper]. The first
    candidate reaching the smallest error is returned; if no candidate is strictly
    better than leaving the predictions unscaled, 1 is returned.
    """
    baseline_error = np.sum((preds - actual) ** 2)
    candidates = np.linspace(0.0001, upper, steps)
    # One row per candidate: squared error of the rescaled predictions.
    errors = np.sum((candidates[:, None] * preds - actual) ** 2, axis=1)
    best = np.argmin(errors)
    return candidates[best] if errors[best] < baseline_error else 1


def _fit_and_rescale(model, score_fn, X_train, y_train, X_valid, y_valid, upper, steps):
    """Fit `model`, score it on the validation set and find its optimal prediction scale.

    `score_fn` is expected to return a 3-tuple whose last element is a sequence of
    (actual, predicted) pairs. Returns (actual, preds, scale).
    """
    model.fit(X_train, y_train)
    _, _, output_pairs = score_fn(model, X_valid, y_valid)
    pairs = np.array(output_pairs)
    actual, preds = pairs[:, 0], pairs[:, 1]
    return actual, preds, _optimal_scale(preds, actual, upper, steps)


y_valid = validation_df["TARGET"]
X_valid = validation_df.drop(columns="TARGET")

optimal_scale_params = {'logistic': [], 'random_forest': [], 'xgboost': []}

months = [*range(1, 13)]

# NOTE(review): LogisticRegression, RandomForestRegressor, XGBRegressor,
# scores_logistic and scores_tree are not imported or defined in any visible
# cell; they must already be available in the kernel.
for i, train_dataset in sample_datasets.items():

    y_train = train_dataset["TARGET"].copy()
    X_train = train_dataset.drop(columns="TARGET")

    # (result key, estimator, scoring helper, upper bound and size of the scale grid,
    #  marker colour, legend label) — plotted in this order.
    candidates = [
        ('logistic',
         LogisticRegression(class_weight={0: 1, 1: 1}, solver="newton-cholesky"),
         scores_logistic, 2, 20000, 'red', 'logisztikus regresszió'),
        ('random_forest',
         RandomForestRegressor(n_estimators=250, min_samples_leaf=500),
         scores_tree, 1, 10000, 'green', 'Random Forest'),
        ('xgboost',
         XGBRegressor(objective='binary:logistic', n_estimators=100, max_depth=5, learning_rate=0.1),
         scores_tree, 1, 10000, 'black', 'XGBRegressor'),
    ]

    fig, ax = plt.subplots(figsize=(7.5, 7.5))
    ax.set_title("Predikció átskálázás", fontsize=20)
    ax.set_xlabel("hónap index")
    ax.set_ylabel("valószínűség-profil")

    for key, model, score_fn, upper, steps, color, label in candidates:
        actual, preds, opt_c = _fit_and_rescale(model, score_fn, X_train, y_train,
                                                X_valid, y_valid, upper, steps)
        optimal_scale_params[key].append(opt_c)
        ax.scatter(months, opt_c * preds, c=color, label=label)

    # Observed values, taken from the last model's output pairs (as in the original).
    ax.scatter(months, actual, c='orange', s=75, label='valódi értékek', marker='x')
    ax.set_xticks(months)
    ax.set_xlim(0, 13)
    ax.plot([0, 13], [0, 0], color='black')

    ax.legend(loc='upper left')
    fig.savefig('data/output/downscale.png')
    plt.show()

    # Only the first training sample is fitted and plotted.
    break

## XGBoost variable importance

In [None]:
# Load the 20 saved models of each family from disk.
# NOTE(review): `joblib` is not imported in any visible cell; it must already be
# available in the kernel. joblib files are pickles — load trusted files only.
model_file_prefix = {"logistic": "logreg", "random_forest": "rf", "xgboost": "xgboost"}

best_models = {family: [] for family in model_file_prefix}

for i in range(20):
    for family, prefix in model_file_prefix.items():
        best_models[family].append(joblib.load(f"best_models/{prefix}_{i}.model"))

In [None]:
# Average the feature importances over all loaded XGBoost models.
xgb_models = best_models['xgboost']

# Feature names are taken from the first model.
# NOTE(review): assumes every model was trained on the same feature columns — confirm.
xgb_f_names = xgb_models[0].feature_names_in_

# Use the actual number of models rather than a hard-coded 20, and build a new
# array instead of accumulating in place into the array obtained from model 0.
k = len(xgb_models)

xgb_features = np.mean([model.feature_importances_ for model in xgb_models], axis=0)

In [None]:
# Positions of the features whose name contains 'prev'.
# Despite its name, `static_vars` therefore holds the 'prev' features, and
# `static_importance` is their summed importance.
static_vars = [idx for idx, feature in enumerate(xgb_f_names) if 'prev' in feature]

static_importance = sum(xgb_features[idx] for idx in static_vars)

# Displayed value: the importance share of all remaining (non-'prev') features.
1 - static_importance

In [None]:
# Horizontal bar chart of the 15 features with the largest averaged importance
# (ascending order, so the most important feature is drawn last).
indices = np.argsort(xgb_features)
top_idx = indices[-15:]
bar_positions = range(15)

fig, ax = plt.subplots(figsize=(10, 5))

ax.barh(bar_positions, xgb_features[top_idx], align='center')
ax.set_yticks(bar_positions, [xgb_f_names[j] for j in top_idx], fontsize=10)
plt.savefig("data/output/xgboost_feature_importance.png")
plt.show()

## Projection vs 2020-2021

In [None]:
# Number of days in each month (1 = January ... 12 = December); February is 28.
month_dict = dict(zip(
    range(1, 13),
    [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31],
))

In [None]:
# Side-by-side comparison of the historical PDSI and its projection.
# `plt` and `colors` come from the import cell at the top of the notebook, so
# they are not imported again here. `plt.get_cmap` replaces `plt.cm.get_cmap`,
# which was deprecated in Matplotlib 3.7 and removed in 3.9.
cm = plt.get_cmap("YlOrRd_r")

# Truncate the colormap to its [a, b] fraction, resampled to n colours.
n = 100
a = 0.2
b = 0.9

new_cmap = colors.LinearSegmentedColormap.from_list(f'trunc({n},{a:.2f},{b:.2f})',
                                                    cm(np.linspace(a, b, n)))

static_df = pd.read_csv("data/datasets/static_variables.csv", index_col=[0])

lons = static_df['lon'].values
lats = static_df['lat'].values

df = pd.read_csv("data/datasets/raw_datasets/2020-6.csv", index_col=[0])

drought_vals = pd.read_csv("data/input/DROUGHT/SSP5/monthly_data/2034-6.csv")["kriged_val"].values

fig, ax = plt.subplots(1, 2, figsize=(16, 8))

# NOTE(review): the titles say August ("augusztus") while the input files are
# named 2020-6 / 2034-6 — confirm which month is actually shown.
panels = [
    (ax[0], df['PDSI_prev1'], '2020 augusztus PDSI'),
    (ax[1], drought_vals, '2034 augusztus PDSI\nprojekció'),
]

for panel, values, title in panels:
    panel.set_axis_off()
    # Two dummy points at (0, 0) carry the values 1.2 and -5.5 so that both
    # panels' colour scales cover at least that range; the axis limits below
    # keep the dummy points out of view.
    sc = panel.scatter(list(lons) + [0, 0],
                       list(lats) + [0, 0],
                       c=list(values) + [1.2, -5.5],
                       s=0.1, cmap=new_cmap)

    panel.set_xlim(-124.997, -113.623)
    panel.set_ylim(32.067, 42.473)

    panel.set_title(title, fontsize=20)
    plt.colorbar(sc, ax=panel)

plt.savefig('data/output/drought_future_krig_comparison.png')
plt.show()

In [None]:
# Side-by-side comparison of the historical precipitation and its projection.
# `plt.get_cmap` replaces `plt.cm.get_cmap`, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cm = plt.get_cmap("GnBu")

# Truncate the colormap to its [a, b] fraction, resampled to n colours.
n = 100
a = 0.4
b = 0.9

new_cmap = colors.LinearSegmentedColormap.from_list(f'trunc({n},{a:.2f},{b:.2f})',
                                                    cm(np.linspace(a, b, n)))

fig, ax = plt.subplots(1, 2, figsize=(16, 8))

# NOTE(review): `prcp_vals` is not defined in any visible cell; it must already
# exist in the kernel. `lons`, `lats` and `df` come from the PDSI comparison cell.
panels = [
    (ax[0], df['PRCP_prev1'], '2020 augusztus csapadékmennyiség'),
    (ax[1], prcp_vals, '2034 augusztus csapadékmennyiség\nprojekció'),
]

for panel, values, title in panels:
    panel.set_axis_off()
    # Two dummy points at (0, 0) with value 0 make sure the colour scale
    # includes 0; the axis limits below keep them out of view.
    sc = panel.scatter(list(lons) + [0, 0],
                       list(lats) + [0, 0],
                       c=list(values) + [0, 0],
                       s=0.1, cmap=new_cmap)

    panel.set_xlim(-124.997, -113.623)
    panel.set_ylim(32.067, 42.473)

    panel.set_title(title, fontsize=20)
    plt.colorbar(sc, ax=panel)

plt.savefig('data/output/prcp_future_krig_comparison.png')
plt.show()