Statistical Analysis of Results
===============================
This will be an examnination of results. Just statistical analysis nothing else.

# Table of Contents
1. [Preliminaries](#1.-preliminaries)
2. [Kling-Gupta Efficiency](#2.-kling-gupta-efficiency)
3. [Root Mean Square Error](#3.-root-mean-square-error)
4. [Combining Results](#4.-combining-results)

# 1. Preliminaries
This is the setup for the rest of the analysis.

In [11]:
# Library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [12]:
# Load in the isotope data that it was trained on, and extract the lat lon of the individual stations
isotope_data = pd.read_csv('Isoscape_Data.csv')
isotope_data.drop_duplicates(subset=['Station'], inplace=True)
isotope_data.reset_index()

station_coord = isotope_data[['Station', 'Lat', 'Long']]
station_coord.set_index('Station', inplace=True)
station_coord = station_coord.to_dict(orient='index')

In [13]:
# Load in results into a pandas dataframe
results = pd.read_csv('results_test.csv')

# Convert the day of year column and year column into a datetime object
results['date'] = pd.to_datetime(results['Year'].astype(str) + '-' + results['Day'].astype(str), format='%Y-%j')
results = results.set_index('date')
results = results.drop(columns=['Year', 'Day'])

# Create new column for the station name and fill it with the station name based off the lat lon and the station_coord dataframe
for stat in station_coord:
    results.loc[results['Lat'] == station_coord[stat]['Lat'], 'Station'] = stat

results.head()

Unnamed: 0_level_0,Lat,Long,Alt,Precipitation (kg/m^2/s),Temperature (K),Predictions,Actual,Station
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2003-12-02,47.98,55.82,190.0,0.0,271.142517,-12.983921,-16.59,BAB
2003-12-02,49.38,82.12,245.0,0.0,265.255096,-20.723955,-20.07,BON
2003-12-02,46.29,64.15,45.0,0.0,270.236603,-14.407454,-8.42,HAB
2003-12-02,45.32,75.67,114.0,0.0,269.147888,-13.856897,-13.85,OTT
2003-12-02,48.78,123.13,178.0,0.0,252.437027,-16.355522,-12.78,SAT


In [14]:
test = results.groupby('Station').mean()
test

Unnamed: 0_level_0,Lat,Long,Alt,Precipitation (kg/m^2/s),Temperature (K),Predictions,Actual
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BAB,47.98,55.82,190.0,5e-06,280.590697,-11.221481,-9.807308
BON,49.38,82.12,245.0,1.5e-05,273.938788,-17.060243,-15.680769
CPA,49.82,74.97,382.0,1.2e-05,275.636155,-16.610241,-15.81675
ELA,49.67,93.72,369.0,5e-06,268.03968,-17.083738,-14.96075
GOB,53.32,60.42,46.0,7e-06,275.038214,-16.503662,-15.465256
HAB,46.29,64.15,45.0,4e-06,281.935077,-12.32424,-11.179878
OTT,45.32,75.67,114.0,1.1e-05,282.789378,-12.213251,-10.765862
SAT,48.78,123.13,178.0,1.4e-05,271.861776,-11.620949,-9.844875
SNA,63.52,116.0,241.0,5e-06,264.315511,-21.940439,-22.307219


# 2. Kling-Gupta Efficiency
In this I will be breaking down the Kling-Gupta Efficiency (KGE) into its components and then analyzing the results. Finally also storing them in a dataframe for later examination.

In [15]:
# Create a new dataframe consisting of the mean and standard deviation of the isotope values for each station, and for the Prediction column and Actual column
station_stats = results["Predictions"].groupby(results["Station"]).agg(['mean', 'std'])
station_stats.rename(columns={'mean': 'Pred_Mean', 'std': 'Pred_Std'}, inplace=True)
station_stats['Actual_Mean'] = results["Actual"].groupby(results["Station"]).mean()
station_stats['Actual_Std'] = results["Actual"].groupby(results["Station"]).std()

# Create a new column consisting of the pearson correlation coefficient between the Predictions and the Actual values for each station
station_stats['Corr'] = results["Predictions"].groupby(results["Station"]).corr(results["Actual"])
station_stats

Unnamed: 0_level_0,Pred_Mean,Pred_Std,Actual_Mean,Actual_Std,Corr
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BAB,-11.221481,3.170715,-9.807308,3.164775,0.526916
BON,-17.060243,4.937385,-15.680769,6.310736,0.523721
CPA,-16.610241,4.562381,-15.81675,5.421716,0.510102
ELA,-17.083738,5.581105,-14.96075,5.598884,0.531353
GOB,-16.503662,4.576316,-15.465256,3.871378,0.531006
HAB,-12.32424,2.995911,-11.179878,4.096467,0.520638
OTT,-12.213251,3.263574,-10.765862,4.100087,0.524596
SAT,-11.620949,2.789616,-9.844875,2.075112,0.445449
SNA,-21.940439,3.76756,-22.307219,4.420089,-0.369012


In [16]:
# Create new dataframe just for the KGE components and values
kge = pd.DataFrame(columns=['alpha', 'beta', 'r', 'kge'], index=station_stats.index)

# Fill in the KGE dataframe with the alpha value (variablility ratio)
kge['alpha'] = station_stats['Pred_Std'] / station_stats['Actual_Std']

# Fill in the KGE dataframe with the beta value (bias ratio)
kge['beta'] = (station_stats['Pred_Mean'] - station_stats['Actual_Mean']) / station_stats['Actual_Mean']

# Fill in the KGE dataframe with the r value (correlation coefficient)
kge['r'] = station_stats['Corr']

# Fill in the KGE dataframe with the KGE value
kge['kge'] = 1 - np.sqrt((kge['alpha'] - 1)**2 + (kge['beta'])**2 + (1 - kge['r'])**2)

kge

Unnamed: 0_level_0,alpha,beta,r,kge
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BAB,1.001877,0.144196,0.526916,0.505425
BON,0.782379,0.087972,0.523721,0.46902
CPA,0.841501,0.050168,0.510102,0.482661
ELA,0.996825,0.141904,0.531353,0.51033
GOB,1.18209,0.067144,0.531006,0.492436
HAB,0.73134,0.102359,0.520638,0.441033
OTT,0.795977,0.134442,0.524596,0.465482
SAT,1.34432,0.180406,0.445449,0.322778
SNA,0.852372,-0.016442,-0.369012,-0.377047


# 3. Root Mean Square Error
In this I will be calculating the Root Mean Square Error (RMSE) 

In [17]:
# In station_stats, calculate the RMSE for each station
station_stats['RMSE'] = np.sqrt(((results['Predictions'] - results['Actual']) ** 2).groupby(results['Station']).mean())
station_stats

Unnamed: 0_level_0,Pred_Mean,Pred_Std,Actual_Mean,Actual_Std,Corr,RMSE
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BAB,-11.221481,3.170715,-9.807308,3.164775,0.526916,2.978021
BON,-17.060243,4.937385,-15.680769,6.310736,0.523721,3.741034
CPA,-16.610241,4.562381,-15.81675,5.421716,0.510102,3.071857
ELA,-17.083738,5.581105,-14.96075,5.598884,0.531353,3.440999
GOB,-16.503662,4.576316,-15.465256,3.871378,0.531006,2.917082
HAB,-12.32424,2.995911,-11.179878,4.096467,0.520638,3.252363
OTT,-12.213251,3.263574,-10.765862,4.100087,0.524596,3.143756
SAT,-11.620949,2.789616,-9.844875,2.075112,0.445449,2.592301
SNA,-21.940439,3.76756,-22.307219,4.420089,-0.369012,5.649878


# 4. Combining Results

In [20]:
# Create a new dataframe for the RMSE and KGE values for each station and save it to a csv file
results_stats = pd.concat([station_stats, kge], axis=1)
results_stats.drop(columns=['Pred_Mean', 'Pred_Std', 'Actual_Mean', 'Actual_Std', 'Corr'], inplace=True)

# Add in the lat and lon values for each station as the first two columns
results_stats['Lat'] = [station_coord[stat]['Lat'] for stat in results_stats.index]
results_stats['Long'] = [station_coord[stat]['Long'] for stat in results_stats.index]
results_stats = results_stats[['Lat', 'Long', 'RMSE', 'alpha', 'beta', 'r', 'kge']]

results_stats.to_csv('results_stats.csv')