# Data fusion model training

This notebook is used to train the logistic regression models used to predict surface water.

The outputs of this notebook are the coefficients that should be used within the Earth Engine scripts to apply the logistic regressions.

In [None]:
from pathlib import Path
import datetime
import numpy as np
import xarray as xr
import rioxarray
import pandas as pd
import matplotlib.pyplot as plt
from scipy import ndimage

In [None]:
from sklearn import metrics
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold

## Read in Sen1Floods11 data

In [None]:
# root directory of the local Sen1Floods11 download;
# change this to the path where the Sen1Floods11 data is located
parent_dir = Path("sen1floods11/v1.1/data")

In [None]:
# sub-directories of the dataset root that are read by this notebook
flood_dir = parent_dir.joinpath("flood_events")
handlabel_dir = flood_dir.joinpath("HandLabeled")

In [None]:
# one folder per product inside the hand-labeled directory
s1_dir, s2_dir, label_dir = (
    handlabel_dir / folder for folder in ("S1Hand", "S2Hand", "LabelHand")
)

In [None]:
# list every S1 GeoTIFF; glob() returns a one-shot generator in arbitrary order,
# so materialise it as a sorted list to make the file order deterministic and to
# let the following cells be re-run without getting an empty result
s1_files = sorted(s1_dir.glob("*.tif"))

In [None]:
# image identifier = first two underscore-separated tokens of each S1 file name
ids = ["_".join(tif.name.split("_", 2)[:2]) for tif in s1_files]

In [None]:
# build the (S1, S2, label) file path triplet for every image identifier
img_pairs = [
    (
        s1_dir / f"{scene_id}_S1Hand.tif",
        s2_dir / f"{scene_id}_S2Hand.tif",
        label_dir / f"{scene_id}_LabelHand.tif",
    )
    for scene_id in ids
]

In [None]:
# function to efficiently calculate the std deviation for a moving window
def window_stdev(X, window_size):
    """Return the moving-window (population) standard deviation of an array.

    The result is computed from the identity ``Var[x] = E[x^2] - E[x]^2``, where
    both expectations are box-filter means from ``ndimage.uniform_filter`` with
    ``mode='reflect'`` at the edges.

    Args:
        X: array-like of numeric values. It is never modified; the computation
            runs on a float64 view/copy so integer input is accepted too.
        window_size: window size forwarded to ``ndimage.uniform_filter``.

    Returns:
        float64 ``numpy.ndarray`` with the same shape as ``X``.
    """
    values = np.asarray(X, dtype=np.float64)
    local_mean = ndimage.uniform_filter(values, window_size, mode='reflect')
    local_mean_sq = ndimage.uniform_filter(values * values, window_size, mode='reflect')
    # floating-point cancellation can push the variance slightly below zero in
    # flat regions; clamp it so the square root never produces spurious NaNs
    # (NaNs already present in the input still propagate)
    variance = np.maximum(local_mean_sq - local_mean * local_mean, 0.0)
    return np.sqrt(variance)

In [None]:
# window size for the moving-window SAR statistics (mean and std deviation)
window_size = 9

# per-image dataframes; they are concatenated once after the loop instead of
# re-copying the growing dataframe on every iteration
frames = []

# loop over all image pairs
for s1_img, s2_img, label_img in img_pairs:
    # read the data
    s1_da = rioxarray.open_rasterio(s1_img)
    s2_da = rioxarray.open_rasterio(s2_img)
    label_da = rioxarray.open_rasterio(label_img)

    # label the band dimension with the names stored in the "long_name" attribute
    s1_da["band"] = list(s1_da.attrs["long_name"])

    s2_da["band"] = list(s2_da.attrs["long_name"])
    # keep six S2 bands (selected by position) and divide by 10000
    # NOTE(review): presumably this converts scaled integers to reflectance — confirm
    s2_da = s2_da.isel(band=[1,2,3,7,11,12]) / 10000.

    # one data variable per band
    s1_ds = s1_da.to_dataset(dim="band")
    s2_ds = s2_da.to_dataset(dim="band")
    label_ds = label_da[0,:,:].to_dataset(name="label")

    # overwrite the label coordinates with the S1 ones so that the merge
    # aligns on identical x/y values
    label_ds["x"] = s1_ds["x"]
    label_ds["y"] = s1_ds["y"]

    # merge all of the individual sensors into one dataset
    ds = xr.merge([s1_ds, s2_ds, label_ds])

    # calculate moving window statistics for SAR bands
    ds['VV_mean'] = (('y', 'x'), ndimage.uniform_filter(ds['VV'].values, size=window_size))
    ds['VH_mean'] = (('y', 'x'), ndimage.uniform_filter(ds['VH'].values, size=window_size))
    # hand window_stdev a copy so the band values stored in ds can never be
    # altered in place by the helper
    ds['VV_stdDev'] = (('y', 'x'), window_stdev(ds['VV'].values.copy(), window_size))
    ds['VH_stdDev'] = (('y', 'x'), window_stdev(ds['VH'].values.copy(), window_size))

    # convert to a dataframe 1-D structure, dropping rows with missing values
    frames.append(ds.to_dataframe().dropna())

if not frames:
    raise RuntimeError("img_pairs is empty: no images were found to process")

# stack the per-image dataframes
df = pd.concat(frames, axis=0)


In [None]:
# display the combined dataframe to check that the data is what is expected
df

In [None]:
# shuffle the rows: attach a uniform random key to every row and order by it
df["random"] = np.random.uniform(size=len(df))
df = df.sort_values("random")

In [None]:
# optional: cache the dataframe as a parquet file so the images
# do not have to be processed again
df.to_parquet(
    handlabel_dir / "s1_s2_label.parq",
    engine="pyarrow",
)

In [None]:
# reload the dataframe from the parquet file in the hand-labeled directory
df = pd.read_parquet(handlabel_dir / "s1_s2_label.parq",engine="pyarrow")

In [None]:
# select rows where the label is valid (greater than or equal to 0)
# take an explicit copy so that columns can later be added to df_sel without
# pandas raising SettingWithCopyWarning on a slice of df
df_sel = df.loc[df["label"] >= 0].copy()

In [None]:
# calculate additional features used for model
# assign() returns a new dataframe, which avoids writing into a slice of df
df_sel = df_sel.assign(
    VVVH=df_sel["VV"] / df_sel["VH"],
    mndwi=(df_sel["B3"] - df_sel["B11"]) / (df_sel["B3"] + df_sel["B11"]),
    ndvi=(df_sel["B8"] - df_sel["B4"]) / (df_sel["B8"] + df_sel["B4"]),
)

# a zero denominator yields +/-inf, which dropna() alone would keep and which
# the model fitting cannot handle; turn those into NaN so the rows are dropped
df_sel = df_sel.replace([np.inf, -np.inf], np.nan).dropna()

In [None]:
# count the samples per label class; the bin edges give the half-open
# intervals [-1, 0) and [0, 1) plus the closed interval [1, 2]
label_bin_edges = [-1, 0, 1, 2]
hist, bins = np.histogram(df_sel["label"], bins=label_bin_edges)

In [None]:
# extract the land (label == 0) and water (label == 1) samples
land_rows = df_sel.query('label == 0')
water_rows = df_sel.query('label == 1')

# balance the classes: randomly subsample down to the size of the smaller class
# (sampling more rows than a class contains would raise a ValueError)
n_per_class = min(len(land_rows), len(water_rows))
df_land = land_rows.sample(n_per_class)
df_water = water_rows if len(water_rows) == n_per_class else water_rows.sample(n_per_class)

# merge the two dataframes back together
df_all = pd.concat([df_land, df_water], axis=0)

In [None]:
# display the class-balanced dataframe
df_all

## Model fitting

In [None]:
# generate a logistic regression object
# multi_class is intentionally not passed: the argument is deprecated in recent
# scikit-learn releases, and for a binary target (label 0 / 1) the default
# already resolves to the same one-vs-rest formulation
clf = LogisticRegression(
    max_iter=1000,
    tol=1e-6,
    n_jobs=-1,
)

In [None]:
# target vector: the label column of the balanced dataframe
y = df_all.loc[:, "label"]

### SAR model

In [None]:
# predictor columns of the SAR model
sar_features = ["VV", "VH", "VV_mean", "VV_stdDev", "VH_mean", "VH_stdDev", "VVVH"]
X_sar = df_all[sar_features]

In [None]:
# hold out 33% of the SAR samples for testing (fixed seed)
X_train_sar, X_test_sar, y_train_sar, y_test_sar = train_test_split(
    X_sar,
    y,
    test_size=0.33,
    random_state=33,
)

In [None]:
# stratified K-fold splitter (5 splits) used by the cross-validation loops below
skf = StratifiedKFold(n_splits=5)

In [None]:
# per-fold results: fitted model, validation accuracy and the row indices used
sar_kfold_training = dict()

# loop through the KFolds
# train and assess accuracy per iteration
for k, (train, test) in enumerate(skf.split(X_train_sar, y_train_sar)):
    # fit() returns the estimator itself, so fitting clf directly would store the
    # same object for every fold (each fit overwriting the previous one);
    # clone() gives every fold its own unfitted copy with the same parameters
    clf_fitted = clone(clf).fit(X_train_sar.iloc[train], y_train_sar.iloc[train])
    acc = clf_fitted.score(X_train_sar.iloc[test], y_train_sar.iloc[test])

    sar_kfold_training[k] = dict(model=clf_fitted, score=acc, training_idx=train, testing_idx=test)

In [None]:
# display the per-fold SAR results
sar_kfold_training

In [None]:
# determine the iteration with best score
# iterate over the stored folds instead of a hard-coded range(5) so this keeps
# working when the number of splits is changed
scores = np.array([sar_kfold_training[k]['score'] for k in sorted(sar_kfold_training)])
best_idx = int(np.argmax(scores))

In [None]:
# pick the model stored for the best-scoring fold
sar_best_model = sar_kfold_training[best_idx]['model']

In [None]:
# report the mean and standard deviation of the per-fold accuracies
mean_acc, std_acc = scores.mean(), scores.std()
print("%0.4f accuracy with a standard deviation of %0.4f" % (mean_acc, std_acc))

In [None]:
# predict labels for the hold-out SAR test features with the selected model
y_pred_sar = sar_best_model.predict(X_test_sar)


In [None]:
# hold-out evaluation: accuracy, weighted F1 and a confusion matrix
# normalised over the true labels
print("Accuracy:", metrics.accuracy_score(y_true=y_test_sar, y_pred=y_pred_sar))
print("F1-score:", metrics.f1_score(y_true=y_test_sar, y_pred=y_pred_sar, average='weighted'))
cm = metrics.confusion_matrix(
    y_true=y_test_sar,
    y_pred=y_pred_sar,
    normalize="true",
)

In [None]:
# display the normalised confusion matrix of the SAR model
cm

In [None]:
# display the raw coefficient array of the SAR model
# (the original cell ended in an unclosed "[" and could not be executed)
sar_best_model.coef_

In [None]:
# map each SAR feature name to its fitted coefficient, then add the intercept
sar_coeffs = dict(zip(X_sar.columns, sar_best_model.coef_[0]))
sar_coeffs['constant'] = sar_best_model.intercept_[0]

# display the coefficients
sar_coeffs

### Optical model

In [None]:
# predictor columns of the optical model
opt_features = ["B2", "B3", "B4", "B8", "B11", "B12", "mndwi"]
X_opt = df_all[opt_features]

In [None]:
# hold out 33% of the optical samples for testing (fixed seed)
X_train_opt, X_test_opt, y_train_opt, y_test_opt = train_test_split(
    X_opt,
    y,
    test_size=0.33,
    random_state=33,
)

In [None]:
# per-fold results: fitted model, validation accuracy and the row indices used
opt_kfold_training = dict()

# loop through the KFolds
# train and assess accuracy per iteration
for k, (train, test) in enumerate(skf.split(X_train_opt, y_train_opt)):
    # fit() returns the estimator itself, so fitting clf directly would store the
    # same object for every fold and would also refit any model object kept from
    # an earlier loop; clone() gives every fold its own unfitted copy
    clf_fitted = clone(clf).fit(X_train_opt.iloc[train], y_train_opt.iloc[train])
    acc = clf_fitted.score(X_train_opt.iloc[test], y_train_opt.iloc[test])

    opt_kfold_training[k] = dict(model=clf_fitted, score=acc, training_idx=train, testing_idx=test)

In [None]:
# display the per-fold optical results
opt_kfold_training

In [None]:
# determine the iteration with best score
# iterate over the stored folds instead of a hard-coded range(5) so this keeps
# working when the number of splits is changed
scores = np.array([opt_kfold_training[k]['score'] for k in sorted(opt_kfold_training)])
best_idx = int(np.argmax(scores))

In [None]:
# pick the model stored for the best-scoring fold
opt_best_model = opt_kfold_training[best_idx]['model']

In [None]:
# report the mean and standard deviation of the per-fold accuracies
mean_acc, std_acc = scores.mean(), scores.std()
print("%0.4f accuracy with a standard deviation of %0.4f" % (mean_acc, std_acc))

In [None]:
# predict labels for the hold-out optical test features with the selected model
y_pred_opt = opt_best_model.predict(X_test_opt)

In [None]:
# hold-out evaluation: accuracy, weighted F1 and a confusion matrix
# normalised over the true labels
print("Accuracy:", metrics.accuracy_score(y_true=y_test_opt, y_pred=y_pred_opt))
print("F1-score:", metrics.f1_score(y_true=y_test_opt, y_pred=y_pred_opt, average='weighted'))
cm = metrics.confusion_matrix(
    y_true=y_test_opt,
    y_pred=y_pred_opt,
    normalize="true",
)

In [None]:
# display the normalised confusion matrix of the optical model
cm

In [None]:
# map each optical feature name to its fitted coefficient, then add the intercept
opt_coeffs = dict(zip(X_opt.columns, opt_best_model.coef_[0]))
opt_coeffs['constant'] = opt_best_model.intercept_[0]

# display the coefficients
opt_coeffs