Statistical Analysis of Results
===============================
This notebook examines the model results. It contains only the statistical analysis, nothing else.

# Table of Contents
1. [Preliminaries](#1.-preliminaries)
2. [Kling-Gupta Efficiency](#2.-kling-gupta-efficiency)
   * [isoNet](#isoNet)
   * [isoP](#isoP)
3. [Root Mean Square Error](#3.-root-mean-square-error)
   * [isoNet](#isoNet)
   * [isoP](#isoP)
4. [Combining Results](#4.-combining-results)

# 1. Preliminaries
This is the setup for the rest of the analysis.

In [None]:
# Library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the isotope data the model was trained on and keep one row per station.
# The index is renumbered after de-duplication (the original call discarded
# the result of reset_index(), so it had no effect).
isotope_data = (
    pd.read_csv('Isoscape_Data.csv')
    .drop_duplicates(subset=['Station'])
    .reset_index(drop=True)
)

# Map each station name to its coordinates:
#   {station: {'Lat': <lat>, 'Long': <long>}}
# Chaining on a fresh selection avoids mutating a slice of isotope_data in place.
station_coord = (
    isotope_data[['Station', 'Lat', 'Long']]
    .set_index('Station')
    .to_dict(orient='index')
)

In [None]:
# Read the model results into a DataFrame
results = pd.read_csv('results_test.csv')

# Build a datetime index from the year and day-of-year columns, then discard them
year_and_day = results['Year'].astype(str) + '-' + results['Day'].astype(str)
results['date'] = pd.to_datetime(year_and_day, format='%Y-%j')
results = results.set_index('date').drop(columns=['Year', 'Day'])

# Label every row with the station whose latitude it matches.
# NOTE(review): stations are identified by latitude alone, using exact equality;
# two stations sharing a latitude would both receive the later station's name.
for station_name, coords in station_coord.items():
    at_station = results['Lat'] == coords['Lat']
    results.loc[at_station, 'Station'] = station_name

# Shift every date back one day so it falls on the first of the month
# (presumably the source day-of-year lands on the second - confirm against the data)
results.index = results.index - pd.Timedelta('1D')

results.head()

# 2. Kling-Gupta Efficiency
In this section I break the Kling-Gupta Efficiency (KGE) down into its components and analyze the results. The values are also stored in a dataframe for later examination.

## isoNet

In [None]:
# Per-station summary of the predicted and actual isotope values:
# mean and standard deviation of each column, indexed by station name.
by_station = results.groupby('Station')
station_stats = pd.DataFrame({
    'Pred_Mean': by_station['Predictions'].mean(),
    'Pred_Std': by_station['Predictions'].std(),
    'Actual_Mean': by_station['Actual'].mean(),
    'Actual_Std': by_station['Actual'].std(),
})

# Pearson correlation between Predictions and Actual within each station.
# The correlation is computed from the two columns of each group row by row.
# Correlating a station's predictions against the full results["Actual"] Series
# would instead align on the date index; if dates repeat across stations, each
# prediction would then be paired with the actual values of every station on
# that date, not just its own.
pairwise_corr = by_station[['Predictions', 'Actual']].corr()
station_stats['Corr'] = pairwise_corr.xs('Predictions', level=1)['Actual']
station_stats

In [None]:
# Kling-Gupta Efficiency and its three components for each station.
#   alpha - variability ratio: std(predictions) / std(actual)
#   beta  - bias ratio:        mean(predictions) / mean(actual)
#   r     - Pearson correlation between predictions and actual
# All three components have an ideal value of 1.
kge = pd.DataFrame({
    'alpha': station_stats['Pred_Std'] / station_stats['Actual_Std'],
    # Stored as the ratio itself (not the relative bias, i.e. ratio - 1) so the
    # column matches its "bias ratio" label and is comparable with alpha and r.
    'beta': station_stats['Pred_Mean'] / station_stats['Actual_Mean'],
    'r': station_stats['Corr'],
})

# KGE = 1 - Euclidean distance of (r, alpha, beta) from the ideal point (1, 1, 1)
kge['kge'] = 1 - np.sqrt(
    (kge['r'] - 1) ** 2 + (kge['alpha'] - 1) ** 2 + (kge['beta'] - 1) ** 2
)

kge

## isoP
This section performs the same calculations, but this time only for the isoP results.

In [None]:
# Load the isoP output
isoP = pd.read_csv('isoP_Output.csv')

# Flip the sign of the longitudes
# (presumably they are stored as negative values and positive ones are wanted - confirm)
isoP['Lon'] = isoP['Lon'] * -1

# Label every row with the station whose latitude it matches.
# NOTE(review): matching uses latitude only, with exact equality.
for stat in station_coord:
    isoP.loc[isoP['Lat'] == station_coord[stat]['Lat'], 'Station'] = stat

# Combine Year and Month into a single datetime column (first day of each month)
isoP['date'] = pd.to_datetime(isoP['Year'].astype(str) + '-' + isoP['Month'].astype(str), format='%Y-%m')
isoP = isoP.drop(columns=['Year', 'Month'])

# Reorder the columns
isoP = isoP[['date', 'Station', 'Lat', 'Lon', 'isoP']]

# Keep only the dates covered by the results dataframe. The window is taken
# from the earliest and latest dates in results rather than its first and last
# rows, so it stays correct even when results is not sorted by date.
in_results_window = isoP['date'].between(results.index.min(), results.index.max())
isoP = isoP[in_results_window].set_index('date')


In [None]:
# Show every row of results whose date does not appear in the isoP index
date_missing_from_isoP = ~results.index.isin(isoP.index)
results[date_missing_from_isoP]

# 3. Root Mean Square Error
In this section I calculate the Root Mean Square Error (RMSE) between the predicted and actual values for each station.

In [None]:
# Add the per-station RMSE to station_stats:
# square the prediction error, average it within each station, take the root
squared_error = (results['Predictions'] - results['Actual']) ** 2
mean_squared_error = squared_error.groupby(results['Station']).mean()
station_stats['RMSE'] = np.sqrt(mean_squared_error)
station_stats

# 4. Combining Results

In [None]:
# Join the per-station statistics with the KGE table, then drop the
# intermediate columns so only the RMSE and the KGE values remain
summary_only = ['Pred_Mean', 'Pred_Std', 'Actual_Mean', 'Actual_Std', 'Corr']
results_stats = pd.concat([station_stats, kge], axis=1).drop(columns=summary_only)

# Look up each station's coordinates and place them in front of the metrics
for coord_name in ('Lat', 'Long'):
    results_stats[coord_name] = [station_coord[name][coord_name] for name in results_stats.index]
results_stats = results_stats[['Lat', 'Long', 'RMSE', 'alpha', 'beta', 'r', 'kge']]

# Save the combined table to a csv file
results_stats.to_csv('results_stats.csv')