Statistical Analysis of Results
===============================
This will be an examnination of results. Just statistical analysis nothing else.

# Table of Contents
1. [Preliminaries](#1.-preliminaries)
2. [Kling-Gupta Efficiency](#2.-kling-gupta-efficiency)
   * [isoNet](#isoNet-KGE)
   * [isoP](#isoP-KGE)
3. [Root Mean Square Error](#3.-root-mean-square-error)
   * [isoNet](#isoNet-RMSE)
   * [isoP](#isoP-RMSE)
4. [Combining Results](#4.-combining-results)

# 1. Preliminaries
This is the setup for the rest of the analysis.

In [1]:
# Library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load in the isotope data that it was trained on, and extract the lat lon of the individual stations
isotope_data = pd.read_csv('Isoscape_Data.csv')
isotope_data.drop_duplicates(subset=['Station'], inplace=True)
isotope_data.reset_index()

station_coord = isotope_data[['Station', 'Lat', 'Long']]
station_coord.set_index('Station', inplace=True)
station_coord = station_coord.to_dict(orient='index')
station_coord

{'OTT': {'Lat': 45.32, 'Long': -75.67},
 'RES': {'Lat': 74.43, 'Long': -94.59},
 'HAL': {'Lat': 68.47, 'Long': -81.15},
 'ALR': {'Lat': 82.31, 'Long': -62.17},
 'EUR': {'Lat': 79.59, 'Long': -85.56},
 'CAM': {'Lat': 69.6, 'Long': -105.8},
 'BAB': {'Lat': 47.98, 'Long': -55.82},
 'SNA': {'Lat': 63.52, 'Long': -116.0},
 'SKT': {'Lat': 52.1, 'Long': -106.43},
 'ELA': {'Lat': 49.67, 'Long': -93.72},
 'SAT': {'Lat': 48.78, 'Long': -123.13},
 'HAB': {'Lat': 46.29, 'Long': -64.15},
 'CPA': {'Lat': 49.82, 'Long': -74.97},
 'BON': {'Lat': 49.38, 'Long': -82.12},
 'EGB': {'Lat': 44.23, 'Long': -79.77},
 'GOB': {'Lat': 53.32, 'Long': -60.42},
 'EST': {'Lat': 51.67, 'Long': -110.2}}

In [3]:
# Load in results into a pandas dataframe
results = pd.read_csv('results_test.csv')

# Convert the day of year column and year column into a datetime object
results['date'] = pd.to_datetime(results['Year'].astype(str) + '-' + results['Day'].astype(str), format='%Y-%j')

results = results.drop(columns=['Year', 'Day'])

# Create new column for the station name and fill it with the station name based off the lat lon and the station_coord dataframe
for stat in station_coord:
    results.loc[results['Lat'] == station_coord[stat]['Lat'], 'Station'] = stat

# Change the date to start on the first of each month, instead of the second
results.date = results.date - pd.Timedelta('1D')

results.head()

Unnamed: 0,Lat,Long,Alt,Precipitation (kg/m^2/s),Temperature (K),Predictions,Actual,date,Station
0,82.31,-62.17,30,0.0,240.15315,-32.329876,-34.07,2003-11-01,ALR
1,63.52,-116.0,241,0.0,267.63196,-27.996933,-23.35,2003-11-01,SNA
2,46.29,-64.15,45,5.6e-05,274.3795,-13.96246,-8.56,2003-11-01,HAB
3,79.59,-85.56,10,0.0,240.76923,-32.99603,-33.2,2003-11-01,EUR
4,45.32,-75.67,114,4.3e-05,274.7887,-15.856003,-11.1,2003-11-01,OTT


# 2. Kling-Gupta Efficiency
In this I will be breaking down the Kling-Gupta Efficiency (KGE) into its components and then analyzing the results. Finally also storing them in a dataframe for later examination.

## isoNet KGE

In [4]:
# Create a new dataframe consisting of the mean and standard deviation of the isotope values for each station, and for the Prediction column and Actual column
station_stats = results["Predictions"].groupby(results["Station"]).agg(['mean', 'std'])
station_stats.rename(columns={'mean': 'Pred_Mean', 'std': 'Pred_Std'}, inplace=True)
station_stats['Actual_Mean'] = results["Actual"].groupby(results["Station"]).mean()
station_stats['Actual_Std'] = results["Actual"].groupby(results["Station"]).std()

# Create a new column consisting of the pearson correlation coefficient between the Predictions and the Actual values for each station
station_stats['Corr'] = results["Predictions"].groupby(results["Station"]).corr(results["Actual"])
station_stats

Unnamed: 0_level_0,Pred_Mean,Pred_Std,Actual_Mean,Actual_Std,Corr
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALR,-30.059634,4.343389,-29.480897,6.405524,0.65897
BAB,-12.067058,4.113867,-9.901625,3.18038,0.734426
BON,-17.999346,5.233123,-15.68775,6.22946,0.830302
CAM,-24.703905,3.897933,-24.989722,6.3878,0.87686
CPA,-17.947773,5.092825,-15.81675,5.421716,0.81894
ELA,-18.842542,5.74387,-15.259167,5.625312,0.848231
EUR,-29.020843,4.758014,-29.439054,6.699601,0.76116
GOB,-18.367158,4.636298,-15.539875,3.850453,0.824792
HAB,-12.503292,3.909451,-11.1175,4.066346,0.575944
OTT,-13.281228,3.678216,-10.772545,4.058309,0.657954


In [5]:
# Create new dataframe just for the KGE components and values
kge = pd.DataFrame(columns=['alpha', 'beta', 'r', 'kge'], index=station_stats.index)

# Fill in the KGE dataframe with the alpha value (variablility ratio)
kge['alpha'] = station_stats['Pred_Std'] / station_stats['Actual_Std']

# Fill in the KGE dataframe with the beta value (bias ratio)
kge['beta'] = (station_stats['Pred_Mean'] - station_stats['Actual_Mean']) / station_stats['Actual_Mean']

# Fill in the KGE dataframe with the r value (correlation coefficient)
kge['r'] = station_stats['Corr']

# Fill in the KGE dataframe with the KGE value
kge['kge'] = 1 - np.sqrt((kge['alpha'] - 1)**2 + (kge['beta'])**2 + (1 - kge['r'])**2)

kge

Unnamed: 0_level_0,alpha,beta,r,kge
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ALR,0.678069,0.019631,0.65897,0.530611
BAB,1.293514,0.218695,0.734426,0.547775
BON,0.84006,0.14735,0.830302,0.724155
CAM,0.610215,-0.011437,0.87686,0.591067
CPA,0.939338,0.134732,0.81894,0.766301
ELA,1.021076,0.234834,0.848231,0.719598
EUR,0.710194,-0.014206,0.76116,0.624189
GOB,1.204092,0.181937,0.824792,0.675265
HAB,0.961416,0.12465,0.575944,0.556322
OTT,0.906342,0.232877,0.657954,0.575737


## isoP KGE
Performing the same calculations but this time just for the isoP results

In [6]:
# Load in Data from isoP
isoP = pd.read_csv('isoP_Output.csv')

# Change longitudes to be positive
isoP['Lon'] = isoP['Lon'] * -1

# Add in the station name to the isoP dataframe with the station_coord dataframe
for stat in station_coord:
    isoP.loc[isoP['Lat'] == station_coord[stat]['Lat'], 'Station'] = stat

# Combine Month and Year into a single column and convert to datetime object
isoP['date'] = pd.to_datetime(isoP['Year'].astype(str) + '-' + isoP['Month'].astype(str), format='%Y-%m')
isoP.drop(columns=['Year', 'Month'], inplace=True)

# Reorder the columns
isoP = isoP[['date', 'Station', 'Lat', 'Lon', 'isoP']]

# Remove any rows in isoP that do not have a corresponding row in the results dataframe
isoP = isoP[isoP['date'].isin(results['date'])]

# Merge the isoP dataframe with the results dataframe on the date and station columns, keeping only the actual and isoP columns
isoP = isoP.merge(results, on=['date', 'Station'], how='inner')

isoP.drop(columns=['Lat_y', 'Long', 'Precipitation (kg/m^2/s)', 'Temperature (K)', 'Predictions'], inplace=True)
isoP.rename(columns={'Lat_x': 'Lat'}, inplace=True)
isoP

Unnamed: 0,date,Station,Lat,Lon,isoP,Alt,Actual
0,2004-01-01,OTT,45.32,75.67,-20.112738,114,-18.920
1,2004-02-01,OTT,45.32,75.67,-15.957769,114,-15.080
2,2004-03-01,OTT,45.32,75.67,-15.134004,114,-10.890
3,2004-04-01,OTT,45.32,75.67,-12.415049,114,-10.990
4,2004-05-01,OTT,45.32,75.67,-9.781096,114,-7.080
...,...,...,...,...,...,...,...
345,2006-11-01,GOB,53.32,60.42,-15.551108,46,-13.550
346,2006-12-01,GOB,53.32,60.42,-18.446399,46,-22.410
347,2007-01-01,GOB,53.32,60.42,-19.344636,46,-23.070
348,2007-02-01,GOB,53.32,60.42,-19.597808,46,-20.530


In [7]:
# Now we can perform the same analysis as we did with the isoNet data
isoP_stats = isoP["isoP"].groupby(isoP["Station"]).agg(['mean', 'std'])
isoP_stats.rename(columns={'mean': 'isoP_Mean', 'std': 'isoP_Std'}, inplace=True)
isoP_stats['Actual_Mean'] = isoP["Actual"].groupby(isoP["Station"]).mean()
isoP_stats['Actual_Std'] = isoP["Actual"].groupby(isoP["Station"]).std()
isoP_stats['Corr'] = isoP["isoP"].groupby(isoP["Station"]).corr(isoP["Actual"])

isoP_stats

Unnamed: 0_level_0,isoP_Mean,isoP_Std,Actual_Mean,Actual_Std,Corr
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BAB,-9.492269,2.532441,-9.628816,3.001733,0.598408
BON,-15.47724,5.197228,-15.565263,6.353531,0.835785
CPA,-15.619627,4.416323,-15.722564,5.459338,0.912512
ELA,-16.149326,5.225621,-14.96075,5.598884,0.802588
GOB,-15.450212,3.688603,-15.465256,3.871378,0.851669
HAB,-10.316499,2.967529,-11.248875,4.124456,0.693154
OTT,-11.081726,3.678253,-10.701609,4.118469,0.833848
SAT,-8.835244,2.683577,-9.769615,2.046188,0.78667
SNA,-22.278962,4.271085,-22.721376,4.58436,-0.201335


In [8]:
# Now the KGE values
isoP_kge = pd.DataFrame(columns=['alpha', 'beta', 'r', 'kge'], index=isoP_stats.index)

# Fill in the KGE dataframe with the alpha value (variablility ratio)
isoP_kge['alpha'] = isoP_stats['isoP_Std'] / isoP_stats['Actual_Std']

# Fill in the KGE dataframe with the beta value (bias ratio)
isoP_kge['beta'] = (isoP_stats['isoP_Mean'] - isoP_stats['Actual_Mean']) / isoP_stats['Actual_Mean']

# Fill in the KGE dataframe with the r value (correlation coefficient)
isoP_kge['r'] = isoP_stats['Corr']

# Fill in the KGE dataframe with the KGE value
isoP_kge['kge'] = 1 - np.sqrt((isoP_kge['alpha'] - 1)**2 + (isoP_kge['beta'])**2 + (1 - isoP_kge['r'])**2)

isoP_kge

Unnamed: 0_level_0,alpha,beta,r,kge
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BAB,0.84366,-0.014181,0.598408,0.568816
BON,0.818006,-0.005655,0.835785,0.754806
CPA,0.808948,-0.006547,0.912512,0.789768
ELA,0.933333,0.079446,0.802588,0.777003
GOB,0.952788,-0.000973,0.851669,0.844334
HAB,0.719496,-0.082886,0.693154,0.576081
OTT,0.893112,0.03552,0.833848,0.799268
SAT,1.3115,-0.095641,0.78667,0.610527
SNA,0.931664,-0.019471,-0.201335,-0.203434


In [9]:
# Combine the isoNet and isoP KGE dataframes into a single dataframe
kge = pd.merge(isoP_kge, kge, left_index=True, right_index=True, suffixes=('_isoP', '_isoNet'))
kge = kge[['kge_isoP', 'kge_isoNet']]
kge

Unnamed: 0_level_0,kge_isoP,kge_isoNet
Station,Unnamed: 1_level_1,Unnamed: 2_level_1
BAB,0.568816,0.547775
BON,0.754806,0.724155
CPA,0.789768,0.766301
ELA,0.777003,0.719598
GOB,0.844334,0.675265
HAB,0.576081,0.556322
OTT,0.799268,0.575737
SAT,0.610527,-0.197748
SNA,-0.203434,-0.204393


# 3. Root Mean Square Error
In this I will be calculating the Root Mean Square Error (RMSE) 

## isoNet RMSE
Just for the current isoNet results

In [10]:
# In station_stats, calculate the RMSE for each station
station_stats['RMSE'] = np.sqrt(((results['Predictions'] - results['Actual']) ** 2).groupby(results['Station']).mean())
station_stats

Unnamed: 0_level_0,Pred_Mean,Pred_Std,Actual_Mean,Actual_Std,Corr,RMSE
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ALR,-30.059634,4.343389,-29.480897,6.405524,0.65897,4.792473
BAB,-12.067058,4.113867,-9.901625,3.18038,0.734426,3.509181
BON,-17.999346,5.233123,-15.68775,6.22946,0.830302,4.135083
CAM,-24.703905,3.897933,-24.989722,6.3878,0.87686,3.474298
CPA,-17.947773,5.092825,-15.81675,5.421716,0.81894,3.79415
ELA,-18.842542,5.74387,-15.259167,5.625312,0.848231,4.735868
EUR,-29.020843,4.758014,-29.439054,6.699601,0.76116,4.319507
GOB,-18.367158,4.636298,-15.539875,3.850453,0.824792,3.833385
HAB,-12.503292,3.909451,-11.1175,4.066346,0.575944,3.886647
OTT,-13.281228,3.678216,-10.772545,4.058309,0.657954,4.054937


## isoP RMSE
Now for the isoP results

In [11]:
isoP_stats['RMSE'] = np.sqrt(((isoP['isoP'] - isoP['Actual']) ** 2).groupby(isoP['Station']).mean())
isoP_stats

Unnamed: 0_level_0,isoP_Mean,isoP_Std,Actual_Mean,Actual_Std,Corr,RMSE
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BAB,-9.492269,2.532441,-9.628816,3.001733,0.598408,2.485556
BON,-15.47724,5.197228,-15.565263,6.353531,0.835785,3.445169
CPA,-15.619627,4.416323,-15.722564,5.459338,0.912512,2.276202
ELA,-16.149326,5.225621,-14.96075,5.598884,0.802588,3.579302
GOB,-15.450212,3.688603,-15.465256,3.871378,0.851669,2.039725
HAB,-10.316499,2.967529,-11.248875,4.124456,0.693154,3.081853
OTT,-11.081726,3.678253,-10.701609,4.118469,0.833848,2.294202
SAT,-8.835244,2.683577,-9.769615,2.046188,0.78667,1.884586
SNA,-22.278962,4.271085,-22.721376,4.58436,-0.201335,6.761137


# 4. Combining Results

In [12]:
# Create a dataframe for the RMSE values and the KGE values
final_stats = pd.DataFrame(columns=['RMSE_isoNet', 'RMSE_isoP', 'KGE_isoNet', 'KGE_isoP'], index=station_stats.index)

# Fill in the final_stats dataframe with the RMSE and KGE values
final_stats['RMSE_isoNet'] = station_stats['RMSE']
final_stats['RMSE_isoP'] = isoP_stats['RMSE']
final_stats['KGE_isoNet'] = kge['kge_isoNet']
final_stats['KGE_isoP'] = kge['kge_isoP']

final_stats

Unnamed: 0_level_0,RMSE_isoNet,RMSE_isoP,KGE_isoNet,KGE_isoP
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ALR,4.792473,,,
BAB,3.509181,2.485556,0.547775,0.568816
BON,4.135083,3.445169,0.724155,0.754806
CAM,3.474298,,,
CPA,3.79415,2.276202,0.766301,0.789768
ELA,4.735868,3.579302,0.719598,0.777003
EUR,4.319507,,,
GOB,3.833385,2.039725,0.675265,0.844334
HAB,3.886647,3.081853,0.556322,0.576081
OTT,4.054937,2.294202,0.575737,0.799268


In [13]:
# Export the final_stats dataframe to a csv file
final_stats.to_csv('results_stats.csv')