# Data fusion validation figures

This notebook computes the accuracy metrics and creates the figures used for validation.

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import metrics
from pathlib import Path
import seaborn as sns

import matplotlib.pyplot as plt

In [None]:
# directory where the validation samples were downloaded
# change to what is needed
datadir = Path('PATH-TO-DATA')

In [None]:
# read in the CSV of validation samples
# the cells below read the columns 'wat', 'water_f', 'region' and 'month_ID' from it
df = pd.read_csv(datadir / 'all_water_val.csv')

In [None]:
df

In [None]:
# function to calculate multiple accuracy metrics from a dataframe
def accurracy_assessment(df):
    """Compute accuracy metrics between the 'wat' and 'water_f' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'wat' column (passed to scikit-learn as ``y_true``) and
        a 'water_f' column (passed as ``y_pred``).

    Returns
    -------
    dict
        Keys, in this order: ``acc``, ``tn``, ``fp``, ``fn``, ``tp``,
        ``far``, ``pod``, ``csi``.

        * ``acc`` is the overall accuracy score.
        * ``tn``, ``fp``, ``fn``, ``tp`` are the cells of the confusion
          matrix normalized over the true (row) class, so each row sums to 1.
        * ``far`` is ``fp / (fp + tn)`` and ``pod`` is ``tp / (tp + fn)``.
        * ``csi`` is ``tp / (tp + fn + fp)`` computed from the raw counts.

    Notes
    -----
    NOTE(review): ``confusion_matrix`` infers the labels from the data, so a
    subset that contains a single class yields a 1x1 matrix and the 4-way
    unpacking below raises ``ValueError``. If the classes are coded 0/1,
    passing ``labels=[0, 1]`` would avoid that -- confirm the coding first.
    """
    y_true = df['wat']
    y_pred = df['water_f']

    acc = metrics.accuracy_score(y_true, y_pred)

    # row-normalized confusion matrix; these cells are what gets returned
    # and plotted, so they are kept exactly as before
    cm = metrics.confusion_matrix(y_true, y_pred, normalize='true')
    tn, fp, fn, tp = cm.ravel()

    # both ratios only combine cells of a single row, so they are the same
    # whether computed from counts or from the row-normalized matrix
    far = fp / (fp + tn)
    pod = tp / (tp + fn)

    # the critical success index mixes cells from both rows, so it has to be
    # computed from raw counts, and its numerator is the hits (tp), not tn
    _, n_fp, n_fn, n_tp = metrics.confusion_matrix(y_true, y_pred).ravel()
    csi = n_tp / (n_tp + n_fn + n_fp)

    return dict(
        acc = acc,
        tn = tn,
        fp = fp,
        fn = fn,
        tp = tp,
        far = far,
        pod = pod,
        csi = csi
    )

# function to recreate a confusion matrix from the different components
def cm_from_components(components):
    """Arrange four confusion-matrix cells into a nested 2x2 list.

    Parameters
    ----------
    components : iterable
        Exactly four values in the order ``tn, fp, fn, tp``.

    Returns
    -------
    list of list
        ``[[tn, fp], [fn, tp]]``.
    """
    tn, fp, fn, tp = components

    first_row = [tn, fp]
    second_row = [fn, tp]
    return [first_row, second_row]

## Accuracy for all samples

In [None]:
# get total accuracy assessment for all samples
all_results = accurracy_assessment(df)

In [None]:
all_results

In [None]:
# pick the four confusion-matrix cells by name, in the order cm_from_components expects
all_cm = cm_from_components([all_results[key] for key in ('tn', 'fp', 'fn', 'tp')])

In [None]:
# draw the confusion matrix for all samples as an annotated heatmap
class_names = ['No Water','Water']

ax = sns.heatmap(
    all_cm,
    vmin=0,
    vmax=1,
    annot=True,
    fmt=".2f",
    cmap="YlGnBu",
    cbar_kws={'label': ''},
)

# columns are the estimated class, rows the true class
ax.set_xticklabels(class_names)
ax.set_yticklabels(class_names)
ax.set_xlabel('Estimated')
ax.set_ylabel('True')

plt.savefig('confusion_matrix.png',bbox_inches='tight',dpi=200)
plt.show()

## Accuracy for each region

In [None]:
# compute the accuracy metrics separately for every region
regions = ['Colombia','Gabon','Mexico','Zambia','Cambodia','Myanmar']

region_results = []

for region in regions:
    # keep only the samples that belong to this region
    region_df = df[df['region'] == region]

    # metrics first, then the bookkeeping columns
    region_acc = {
        **accurracy_assessment(region_df),
        'region': region,
        'n_records': len(region_df),
    }
    region_results.append(region_acc)

# one row per region, indexed by region name
region_acc_df = pd.DataFrame(region_results).set_index('region')

In [None]:
region_acc_df

## Accuracy in time

In [None]:
# loop over each region and month to get the accuracy
months = range(1,13)

# keys returned by accurracy_assessment; used to build an all-NaN row for
# months without samples so that every row has the same set of columns
metric_keys = ('acc', 'tn', 'fp', 'fn', 'tp', 'far', 'pod', 'csi')

# one dataframe per region (same order as `regions`), indexed by month
region_dfs = []
for region in regions:
    region_df = df.loc[df['region'] == region]
    month_results = []
    for month in months:
        # build the mask from region_df itself rather than from the full
        # table, so the selection does not depend on index alignment
        month_df = region_df.loc[region_df['month_ID'] == month]
        n_records = month_df.shape[0]

        if n_records > 0:
            month_acc = accurracy_assessment(month_df)
        else:
            # no samples for this month: keep the row but mark metrics as missing
            month_acc = dict.fromkeys(metric_keys, np.nan)

        month_acc['month'] = month
        month_acc['n_records'] = n_records
        month_results.append(month_acc)

    month_acc_df = pd.DataFrame(month_results).set_index('month')
    region_dfs.append(month_acc_df)

In [None]:
# read in a table with the time series of clear ratios for each region
# (the plotting cell below reads its 'region', 'month' and 'mean' columns and
# plots 1 - 'mean' as cloud cover)
clear_region_df = pd.read_csv(datadir / 'region_clear_ratios.csv')

In [None]:
clear_region_df.head()

In [None]:
import matplotlib.patches as mpatches

In [None]:
# one panel per region: monthly accuracy as bars (left axis) and
# cloud cover as a line (right axis)
f, ax = plt.subplots(nrows=len(region_dfs), sharey=True,figsize=(12,15))
abcs = ['a','b','c','d','e', 'f']

# per-region arrays collected for the correlation computed further below
acc = []
cc = []

for i, (region, month_acc_df) in enumerate(zip(regions, region_dfs)):
    cloud_df = clear_region_df.loc[clear_region_df['region']==region].set_index('month')
    # cloud cover is taken as the complement of the 'mean' column
    # NOTE(review): assumes the rows are in month order and that there is one
    # row per month, as the correlation below pairs them with `acc` by position
    cloud_cover = 1 - cloud_df['mean']

    ax[i].set_title(f'{abcs[i]}) {region}',loc='left')
    month_acc_df[['acc']].plot.bar(ax=ax[i],legend=False,label='Accuracy')

    axr = ax[i].twinx()
    # pandas draws the bars at positions 0..n-1, not at the index values, so
    # the line is plotted at the same positions to stay aligned with the bars
    axr.plot(np.arange(len(cloud_cover)), cloud_cover.values, color='k', label='Cloud Cover')

    # limits match the 0-1 ticks set below
    axr.set_ylim(0,1)
    axr.set_yticks([0,0.25,0.5,0.75, 1])

    ax[i].set_ylim(0.7,1)
    ax[i].set_ylabel('')
    ax[i].set_xlabel('')
    # month names are only shown on the bottom panel (set after the loop)
    ax[i].set_xticklabels([''] * len(month_acc_df))

    acc.append(month_acc_df[['acc']].values.ravel())
    cc.append(cloud_cover.values.ravel())

    if i == 0:
        # the line plotted on the right axis already provides a legend handle
        handles, labels = axr.get_legend_handles_labels()

        # manually define a new patch for the bars drawn on the left axis
        patch = mpatches.Patch(color='C0', label='Accuracy')

        # handles is a list, so append manual patch
        handles.append(patch)

        # plot the legend
        axr.legend(handles=handles, loc='upper right')

ax[-1].set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec',],rotation=45)

# shared axis labels for the whole figure
# NOTE(review): both axes are limited to the 0-1 range, so the plotted values
# look like fractions while the labels say [%] -- confirm the intended unit
f.text(0.05, 0.5, 'Accuracy [%]', va='center', rotation=90)
f.text(0.95, 0.5, 'Average Cloud Cover [%]', va='center', rotation=270)

plt.savefig('accuracy_cloud_timeseries.png',bbox_inches='tight',dpi=200)

plt.show()

In [None]:
# concatenate the values for accuracy and cloud cover into arrays
acc_arr = np.concatenate(acc)
cc_arr = np.concatenate(cc)

In [None]:
# get the indices where both accuracy and cloud cover are finite, so that a
# missing value in either array is excluded from the correlation
valid_idx = np.where(np.isfinite(acc_arr) & np.isfinite(cc_arr))

In [None]:
# print the correlation between accuracy and cloud cover
stats.pearsonr(acc_arr[valid_idx], cc_arr[valid_idx])