# Labels vs. precipitation

## Imports

In [None]:
import h5py
import os
import pandas as pd
import numpy as np
import sys
import altair as alt
from datetime import datetime, timedelta

In [None]:
# Make the `library` directory that sits next to the current working
# directory importable, so that `searcher` can be found.
# os.path is used instead of splitting the path on '/' so the parent
# directory is also resolved correctly where the separator is not '/'.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'library'))
from searcher import Searcher

## Weather data

I'm renaming the columns because square brackets have a special meaning in Altair.

In [None]:
# Load only the timestamp and precipitation columns of the hourly weather
# file, then give them bracket-free names.
weather_df = pd.read_csv(
    '../data/weather-hourly.csv',
    parse_dates=['datetime[utc]'],
    usecols=['datetime[utc]', 'precipitation[mm]'],
)
weather_df = weather_df.rename(
    columns={
        'datetime[utc]': 'end_time',
        'precipitation[mm]': 'precipitation',
    }
)

In [None]:
# April 2017 rows whose timestamp falls on minute 51 of the hour.
# NOTE(review): minute 51 is presumably the regular hourly observation time;
# confirm against the weather file.
# .copy() detaches the subset so later column assignments leave weather_df
# untouched.
april_2017_weather = weather_df.loc[
    (weather_df['end_time'].dt.minute == 51)
    & (weather_df['end_time'].dt.month == 4)
    & (weather_df['end_time'].dt.year == 2017)
].copy()

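The start time is 59m 59s before the end time. This is because `Searcher.return_interval` treats both the start and the end time as inclusive, so a full 60-minute offset would put the boundary second in two consecutive hours.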

In [None]:
# Window start: 59 min 59 s (3599 s) before the window end.
april_2017_weather['start_time'] = (
    april_2017_weather['end_time'] - timedelta(minutes=59, seconds=59)
)

In [None]:
# Preview the first rows of the April 2017 subset, including `start_time`.
april_2017_weather.head()

In [None]:
# 2017 rows stamped at minute 51 of the hour.
weather_df_2017 = weather_df.loc[
    (weather_df['end_time'].dt.minute == 51)
    & (weather_df['end_time'].dt.year == 2017)
]
# Rows with a positive precipitation amount; copied so that columns can be
# added to it without touching weather_df.
precipitation_df_2017 = weather_df_2017.loc[weather_df_2017['precipitation'] > 0].copy()
# Each remaining row is counted as one hour.
hours_with_precipitation = len(precipitation_df_2017)
hours_in_year = 365 * 24
print(f'hours with precipitation = {hours_with_precipitation}')
print(f'total hours = {hours_in_year}')
print(f'percentage of hours with precipitation = {hours_with_precipitation / hours_in_year:.2%}')

In [None]:
# Histogram (at most 40 bins) of the positive hourly precipitation amounts.
alt.Chart(precipitation_df_2017).mark_bar().encode(
    x=alt.X('precipitation:Q', bin=alt.Bin(maxbins=40), title='Precipitation (mm)'),
    y=alt.Y('count()'),
)

## SONYC data

In [None]:
class Plotter:
    """Pair precipitation rows with the mean SPL looked up through a searcher.

    Construction adds an ``spl`` column to the DataFrame that is passed in
    (the frame is stored and modified in place, not copied). The chart
    methods then plot that frame with Altair.
    """

    def __init__(self, node, searcher, weather_df):
        """Store the inputs and compute the mean SPL for every row.

        Args:
            node: Value shown in the scatter chart title.
            searcher: Object providing ``return_interval(start, end)`` and an
                ``information`` array with an ``spl_vector`` field.
            weather_df: DataFrame with ``start_time``, ``end_time`` and
                ``precipitation`` columns; it gains an ``spl`` column.
        """
        self.node = node
        self.searcher = searcher
        self.df = weather_df
        self.df['spl'] = self.df.apply(self.get_mean_spl, axis=1)

    def get_mean_spl(self, row):
        """Return the mean ``spl_vector`` value over the row's time window."""
        window = self.searcher.return_interval(row['start_time'], row['end_time'])
        # The interval's `index` column selects the matching records.
        positions = window['index'].values
        spl_values = self.searcher.information[positions]['spl_vector']
        return spl_values.mean()

    def scatter(self, data):
        """Return an interactive precipitation-vs-SPL scatter chart of ``data``."""
        x_channel = alt.X(
            'precipitation:Q',
            title='Precipitation (mm)',
            scale=alt.Scale(zero=False),
        )
        y_channel = alt.Y(
            'spl:Q',
            title='Mean sound pressure level (dBA)',
            scale=alt.Scale(zero=False),
        )
        chart = alt.Chart(data).mark_point().encode(x=x_channel, y=y_channel)
        chart = chart.properties(title=f'{self.node} - hourly amounts')
        return chart.interactive()

    def scatter_all_points(self):
        """Scatter chart of every row."""
        return self.scatter(self.df)

    def scatter_points_with_precipitation(self):
        """Scatter chart of the rows whose precipitation is above zero."""
        has_precipitation = self.df['precipitation'] > 0
        return self.scatter(self.df[has_precipitation])

    def linechart(self):
        """Return precipitation and SPL line charts stacked vertically."""

        def time_series(y_field, y_title):
            # Both charts share the same x axis definition and width.
            return alt.Chart(self.df).mark_line().encode(
                x=alt.X('end_time:T', title='Date'),
                y=alt.Y(y_field, title=y_title),
            ).properties(width=800)

        return alt.vconcat(
            time_series('precipitation:Q', 'Precipitation (mm)'),
            time_series('spl:Q', 'Sound pressure level (dBA)'),
        )

In [None]:
# Identifiers of the SONYC sensor nodes referenced in this notebook.
node1 = 'sonycnode-b827eb86d458.sonyc'
node3 = 'sonycnode-b827ebb40450.sonyc'
node4 = 'sonycnode-b827eb73e772.sonyc'

In [None]:
# Searcher (imported from the `library` directory) for the first node.
searcher1 = Searcher(node1)

In [None]:
# Show which fields the searcher's `information` records expose.
searcher1.information.dtype

In [None]:
class Predictions:
    """Timestamp lookup over one node's recording index for a given year.

    Opens ``../sonyc/indices/{year}/{node}_recording_index.h5`` read-only,
    keeps its ``recording_index`` dataset in ``information`` and the
    dataset's ``timestamp`` field in ``timestamps`` (a one-column DataFrame
    named ``epoch``).
    """

    def __init__(self, node, year=2017):
        """Open the recording index of ``node`` for ``year``.

        Args:
            node: Node identifier used to build the index file name.
            year: Year sub-directory of the index; defaults to 2017.
        """
        self.node = node
        index_path = f'../sonyc/indices/{year}/{node}_recording_index.h5'
        # NOTE: the file is not closed here; `information` is a dataset
        # obtained from it.
        self.information = h5py.File(index_path, 'r')['recording_index']
        self.timestamps = pd.DataFrame(self.information['timestamp'], columns=['epoch'])

    def return_interval(self, start, stop=None):
        """Return the index rows whose timestamp lies in ``[start, stop)``.

        Args:
            start: Inclusive lower bound, in a form accepted by
                ``convert_to_epoch``.
            stop: Exclusive upper bound; defaults to 60 minutes after
                ``start``.

        Returns:
            DataFrame with an ``index`` column (row label in
            ``timestamps``), the ``epoch`` value and a timezone-aware
            ``utc`` datetime column.
        """
        if stop is None:
            stop = start + pd.Timedelta(minutes=60)

        lower_bound = convert_to_epoch(start) <= self.timestamps['epoch']
        upper_bound = self.timestamps['epoch'] < convert_to_epoch(stop)

        interval = self.timestamps[lower_bound & upper_bound].reset_index()
        # `unit='s'` already states how the numbers are interpreted, so no
        # format inference is requested.
        interval['utc'] = pd.to_datetime(interval['epoch'], unit='s', utc=True)

        # The original body stopped here and implicitly returned None.
        return interval
        

In [None]:
# Year and node whose class-predictions file is explored below.
year = 2017
node = node1
predictions_path = f'../sonyc/class_predictions/1.0.0/{year}/{node}_class_predictions.h5'

In [None]:
# Opened read-only and never closed in this notebook; `coarse` below is
# obtained from this handle.
h5file = h5py.File(predictions_path, 'r')

In [None]:
# List the top-level entries of the predictions file.
h5file.keys()

In [None]:
# Entry used for all label lookups in the rest of the notebook.
coarse = h5file['coarse']

In [None]:
coarse

In [None]:
# Size of the searcher's `information`, shown next to `coarse` above.
searcher1.information.shape

In [None]:
# Field names and types of a `coarse` record.
coarse.dtype

In [None]:
# First record, as a sample.
coarse[0]

In [None]:
precipitation_df_2017.head()

In [None]:
# Window start: a full 3600 s before `end_time`. get_interval excludes the
# lower bound and includes the upper one, so no one-second adjustment is made.
precipitation_df_2017['start_time'] = precipitation_df_2017['end_time'] - timedelta(seconds=60 * 60)

In [None]:
precipitation_df_2017.shape

In [None]:
# `timestamp` field of every `coarse` record as a one-column frame; its row
# labels are later used to index back into `coarse`.
coarse_timestamps = pd.DataFrame(coarse['timestamp'], columns=['epoch'])

In [None]:
def convert_to_epoch(stamp):
    """Return the whole seconds elapsed between 1970-01-01 00:00 and ``stamp``.

    ``stamp`` is subtracted from a timezone-naive origin, so it is expected
    to be timezone-naive as well. Fractions of a second are floored.
    """
    unix_origin = pd.Timestamp('1970-01-01')
    one_second = pd.Timedelta('1s')
    elapsed = stamp - unix_origin
    return elapsed // one_second

In [None]:
def get_interval(self, start, stop=None):
    """Select the ``coarse_timestamps`` rows that fall in ``(start, stop]``.

    Args:
        self: Unused. NOTE(review): this is a module-level function, so a
            positional caller has to supply a placeholder here; otherwise
            its first argument is bound to ``self`` and its second to
            ``start``.
        start: Exclusive lower bound, in a form accepted by
            ``convert_to_epoch``.
        stop: Inclusive upper bound; defaults to 60 minutes after ``start``.

    Returns:
        DataFrame with an ``index`` column (row label in
        ``coarse_timestamps``), the ``epoch`` value and a timezone-aware
        ``utc`` datetime column.
    """
    if stop is None:
        stop = start + pd.Timedelta(minutes=60)

    epochs = coarse_timestamps['epoch']
    after_start = epochs > convert_to_epoch(start)
    up_to_stop = epochs <= convert_to_epoch(stop)

    interval = coarse_timestamps[after_start & up_to_stop].reset_index()
    interval['utc'] = pd.to_datetime(
        interval['epoch'], unit='s', utc=True, infer_datetime_format=True
    )
    return interval

In [None]:
# First `coarse` record again, for reference when picking the label fields.
coarse[0]

In [None]:
# Every field after the first two is treated as a label.
# NOTE(review): `timestamp` is one of the skipped fields presumably; confirm
# against `coarse.dtype`.
labels = list(coarse.dtype.names[2:])

In [None]:
labels

In [None]:
def get_mean_label(row, label):
    """Return the mean ``label`` value of ``coarse`` over the row's window.

    Args:
        row: Row with ``start_time`` and ``end_time`` entries bounding the
            window.
        label: Name of the ``coarse`` field to average.

    Returns:
        The mean of ``coarse[label]`` over the records inside the window, or
        ``-1`` when no record falls inside it.
    """
    # get_interval declares a leading, unused `self` parameter. Calling it
    # with just (start_time, end_time) binds start_time to `self` and
    # end_time to `start`, which searches the hour *after* end_time. Pass an
    # explicit placeholder and name both bounds.
    interval = get_interval(None, start=row['start_time'], stop=row['end_time'])
    # Nothing in the window: return the sentinel without reading `coarse`.
    if interval.empty:
        return -1
    info = coarse[interval['index'].values]
    return info[label].mean()

In [None]:
# Add one column per label holding each row's mean value for that label.
for label in labels:
    precipitation_df_2017[label] = precipitation_df_2017.apply(
        lambda row: get_mean_label(row, label), axis=1
    )

In [None]:
precipitation_df_2017

In [None]:
# get_mean_label returns -1 when a row's window holds no records. That
# outcome does not depend on the label, so testing one label column is
# enough to drop those rows.
precipitation_df_2017_valid = precipitation_df_2017[precipitation_df_2017['1_engine'] != -1]

In [None]:
precipitation_df_2017_valid

In [None]:
 def scatter(label):
     """Return an interactive chart of precipitation vs. one label's mean.

     Args:
         label: Column of ``precipitation_df_2017_valid`` to plot on the y
             axis. Its first two characters are dropped for display
             (e.g. ``'1_engine'`` is shown as ``'engine'``).
     """
     display_name = label[2:]
     x_channel = alt.X(
         'precipitation:Q',
         title='Precipitation (mm)',
         scale=alt.Scale(zero=False),
     )
     y_channel = alt.Y(
         f'{label}:Q',
         title=f'Mean prediction for {display_name}',
         scale=alt.Scale(zero=False),
     )
     chart = alt.Chart(precipitation_df_2017_valid).mark_point().encode(
         x=x_channel,
         y=y_channel,
     )
     return chart.properties(title=f'{display_name}').interactive()

In [None]:
# One chart per label, labels[0] through labels[7]; this requires `labels`
# to hold at least eight entries.
# NOTE(review): presumably one call per cell so that each chart is shown as
# its own cell output.
scatter(labels[0])

In [None]:
scatter(labels[1])

In [None]:
scatter(labels[2])

In [None]:
scatter(labels[3])

In [None]:
scatter(labels[4])

In [None]:
scatter(labels[5])

In [None]:
scatter(labels[6])

In [None]:
scatter(labels[7])