<center>

# WRA Data Model User Workshop -
# Floating lidar demonstration
## by 
## Daniel Nuno, 29th June 2023

</center>

This notebook demonstrates some of the functionalities of the Task 43 WRA Data Model and how to use it with real floating lidar data. It was presented during the workshop on the 29th June 2023. The recording of the session is linked below.

https://www.youtube.com/watch?v=MoKDz1FptDA&t=1555s

The data used here is publicly available for download at: https://oswbuoysny.resourcepanorama.dnv.com/. Download both the 'E06 Hudson South 10 Minute' and 'E06 Hudson South Hourly' data and save them in the same location as this notebook on your local machine. The 10-min data file contains the lidar measurements and the hourly one contains the ADCP measurements.

In [None]:
import json
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Function to explore Task43 Metadata Model

In [None]:
def find_nested_keys(json_data, keys_list):
    """Yield every value reachable by following ``keys_list`` through nested JSON.

    Dictionaries are descended by key; lists are traversed transparently, so a
    single key path fans out over every element of any list met on the way.

    Parameters
    ----------
    json_data : dict, list or scalar
        Parsed JSON (sub)document to search. Scalars yield nothing.
    keys_list : list of str
        Key path to follow, outermost key first. An empty path yields nothing.

    Yields
    ------
    object
        The value stored under the last key, once per matching branch.
        Branches where a key is absent or holds ``None`` (JSON ``null``) are
        skipped.
    """
    if isinstance(json_data, dict):
        if not keys_list:
            return
        value = json_data.get(keys_list[0])
        # Compare against None instead of relying on truthiness, so that
        # legitimate falsy values (a height of 0, False, "") are not dropped.
        if value is None:
            return
        if len(keys_list) == 1:
            yield value
        else:
            yield from find_nested_keys(value, keys_list[1:])
    elif isinstance(json_data, list):
        # Lists do not consume a key: apply the same path to every element.
        for item in json_data:
            yield from find_nested_keys(item, keys_list)

## Load Metadata Model

In [None]:
# Load the metadata JSON file into the `data` dictionary used by the cells below
fname = '../demo_data/E06_wraMetaData.json' # this file is located in the 'demo_data' folder of the GitHub repository.
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding (e.g. cp1252 on Windows).
with open(fname, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Listing and data overview

## List types of measurements

In [None]:
# Gather the measurement type id of every measurement point, then show the distinct ones
measTypePath = ["measurement_location", "measurement_point", "measurement_type_id"]
measType = [typeId for typeId in find_nested_keys(data, measTypePath)]
print(np.unique(measType))

## List depths/heights with measurements

In [None]:
# Collect the height of every measurement point and reduce to the distinct values
heightPath = ["measurement_location", "measurement_point", "height_m"]
allHeights = list(find_nested_keys(data, heightPath))
heights = np.unique(allHeights)
print(heights)

## List Measurement Points

In [None]:
# Names of all measurement points: print how many there are, then the names themselves
measPointPath = ["measurement_location", "measurement_point", "name"]
measPoints = [*find_nested_keys(data, measPointPath)]
print(len(measPoints))
print(measPoints)

## List column names, grouped by Measurement Points

In [None]:
columnGroupPath = ["measurement_location", "measurement_point", "logger_measurement_config", "column_name"]
# Flat list: one entry per individual column name
varNames = [name for name in find_nested_keys(data, columnGroupPath + ["column_name"])]
# Grouped: the complete "column_name" entry of each logger measurement config
varNamesGroup = [group for group in find_nested_keys(data, columnGroupPath)]
print(varNamesGroup)

## List loggers

In [None]:
# Names of the logger configurations declared in the metadata
loggerNamePath = ["measurement_location", "logger_main_config", "logger_name"]
sensorList = [loggerName for loggerName in find_nested_keys(data, loggerNamePath)]
print(sensorList)

# Integrity tests for data

## Load data

In [None]:
def _readCampaignCsv(path, **readCsvKwargs):
    """Read one campaign CSV into an all-numeric frame indexed by parsed timestamps.

    Extra keyword arguments are forwarded to ``pd.read_csv``. Values that cannot
    be converted to numbers become NaN.
    """
    frame = pd.read_csv(path, engine='python', **readCsvKwargs)
    frame = frame.set_index('timestamp')
    frame = frame.apply(pd.to_numeric, errors='coerce')
    frame.index = pd.to_datetime(frame.index, format='%m-%d-%Y %H:%M')
    # Column headers may carry stray whitespace; normalise them
    frame.columns = frame.columns.str.strip()
    return frame


df10min = _readCampaignCsv('E06_Hudson_South_10_min_avg_20190904_20220327.csv')
# There is an invalid line in the 1h file, so we skip the 14322 row
df1h = _readCampaignCsv('E06_Hudson_South_hourly_avg_20190904_20220327.csv', skiprows=[14322])

## Variable names in Model but not in Data

In [None]:
# Column names declared in the metadata that appear in neither data file
missingFromData = []
for name in varNames:
    if name in df10min.columns or name in df1h.columns:
        continue
    missingFromData.append(name)
print(missingFromData)

## Variable names in Data but not in Model

In [None]:
# Column names present in the data files but not described by the metadata.
# Both the 10-minute and the hourly file are checked, since the metadata
# column names are looked up in both files elsewhere in this notebook.
print('10-min file: ', [x for x in df10min.columns if x not in varNames])
print('Hourly file: ', [x for x in df1h.columns if x not in varNames])

## List column names for wind speed

In [None]:
# Keep the column groups whose measurement type is wind speed.
# Groups and measurement types are matched purely by position in their lists.
windSpeedMeasPoints = [
    varNamesGroup[pos]
    for pos in range(len(varNamesGroup))
    if measType[pos] == 'wind_speed'
]

# From those groups keep only the columns holding the average statistic
windSpeedNames = [
    column['column_name']
    for group in windSpeedMeasPoints
    for column in group
    if column['statistic_type_id'] == 'avg'
]

# Split by whether the column name mentions a lidar (case-insensitive)
lidarWindSpeedNames = [name for name in windSpeedNames if 'LIDAR' in name.upper()]
metSpeedNames = [name for name in windSpeedNames if name not in lidarWindSpeedNames]

print('Lidar HWS column names: ', lidarWindSpeedNames)
print('Meteo HWS column names: ', metSpeedNames)

# Analyze performance of different lidars in campaign

## Function to plot correlation

In [None]:
def plotCorrs(df10min, col1=None, col2=None):
    """Scatter-plot two columns against each other with a regression line and R².

    Parameters
    ----------
    df10min : pandas.DataFrame
        Frame containing both columns.
    col1, col2 : str, optional
        Columns for the x and y axis. When omitted they default to the first
        entry of the notebook-level lists ``lidarWindSpeedNames`` and
        ``metSpeedNames`` respectively.

    Returns
    -------
    None
        The figure is shown as a side effect. If fewer than two rows have both
        values present, a message is printed and nothing is plotted.
    """
    if col1 is None:
        col1 = lidarWindSpeedNames[0]
    if col2 is None:
        col2 = metSpeedNames[0]

    # Drop rows where either of the two columns is NaN
    df_clean = df10min.dropna(subset=[col1, col2])
    if len(df_clean) < 2:
        # A regression line and a correlation coefficient need at least two concurrent samples
        print(f'Not enough concurrent data to correlate {col1} and {col2}')
        return

    # Create a blue scatter plot with a red regression line on an explicit Axes
    fig, ax = plt.subplots()
    sns.regplot(data=df_clean, x=col1, y=col2, color='b', line_kws={'color': 'r'}, ax=ax)
    ax.set_title('Scatter plot: Wind speed correlation between first two heights')
    ax.set_xlabel(col1)
    ax.set_ylabel(col2)

    # Calculate the correlation coefficient and square it to get R^2
    corr_coef = np.corrcoef(df_clean[col1], df_clean[col2])[0,1]
    r_squared = corr_coef**2
    # Position is given in axes-relative coordinates (0..1)
    ax.text(0.1, 0.9, f'R² = {r_squared:.3f}', transform=ax.transAxes)

    plt.show()

## Correlation for full campaign

In [None]:
# Correlate over the whole 10-minute dataset, with no date restriction applied
plotCorrs(df10min)

## Obtaining date ranges for each LIDAR and plotting correlations

In [None]:
# Flatten the logger configurations of every measurement location into one list.
# Name and dates are then read from the SAME config entry; indexing separately
# built name/date lists by position would misalign as soon as one entry has a
# missing or null field, because find_nested_keys skips such entries.
loggerConfigs = []
for configs in find_nested_keys(data, ["measurement_location", "logger_main_config"]):
    loggerConfigs.extend(configs if isinstance(configs, list) else [configs])

# First identify which loggers are LIDARs (positions within loggerConfigs)
lidarLoggers = [i for i, cfg in enumerate(loggerConfigs) if 'LIDAR' in str(cfg.get('logger_name') or '').upper()]
if len(lidarLoggers) < 2:
    raise IndexError(f'Expected at least two LIDAR loggers in the metadata, found {len(lidarLoggers)}')


def _loggerDateRange(cfg):
    """Return [date_from, date_to] of one logger config as timestamps.

    A missing or null bound is returned as None, which leaves that side of a
    date slice open-ended.
    """
    return [pd.to_datetime(cfg[key]) if cfg.get(key) else None for key in ('date_from', 'date_to')]


# get dates for each LIDAR
dateRange1 = _loggerDateRange(loggerConfigs[lidarLoggers[0]])
dateRange2 = _loggerDateRange(loggerConfigs[lidarLoggers[1]])

In [None]:
# One correlation plot per LIDAR, each restricted to that LIDAR's date range
for rangeStart, rangeEnd in (dateRange1, dateRange2):
    plotCorrs(df10min[rangeStart:rangeEnd])


## Statistics for each LIDAR

In [None]:
# Summary statistics of the first two lidar wind speed columns, per LIDAR date range
for header, (rangeStart, rangeEnd) in (
    ('Statistics for 1st LIDAR \n', dateRange1),
    ('\n\n Statistics for 2nd LIDAR \n', dateRange2),
):
    print(header)
    period = df10min[rangeStart:rangeEnd]
    print(period[[lidarWindSpeedNames[0], lidarWindSpeedNames[1]]].describe())


## Availability Plots

### Get temporal resolution

In [None]:
# Number of records per day, derived from the first logger's averaging period (in minutes)
averagingPeriods = list(find_nested_keys(data, ["measurement_location", "logger_main_config", "averaging_period_minutes"]))
minutesPerDay = 24 * 60
pointsInDay = int(minutesPerDay / averagingPeriods[0])
print(pointsInDay)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(20,10)

# Daily availability per lidar wind speed column: count NaNs in a rolling window
# of one day's worth of records, take the daily maximum of that count, and turn
# it into a fraction of valid records. Transposed so rows are columns of the
# data and the x axis is time.
# Stored under its own name: reusing `data` would overwrite the metadata
# dictionary and break every earlier cell on re-run.
availability = (1-df10min[lidarWindSpeedNames].isna().rolling(pointsInDay).sum().resample('1D').max()/pointsInDay).T

# Label every `tickStep`-th day; the same step is used for the heatmap ticks and for the label text
tickStep = 50
sns.heatmap(availability, cmap='viridis_r', ax=ax, xticklabels=tickStep)
ax.invert_yaxis()
ax.tick_params(axis='both', which='major', labelsize=12)
labels = ax.set_xticklabels([pd.to_datetime(str(date)).strftime('%Y-%m-%d') for date in availability.columns][0::tickStep], rotation=45, ha='right')

