In [None]:
import h5py
import os
import pandas as pd
import numpy as np
import scipy.stats
import sys
import altair as alt
import math
from datetime import datetime, timedelta

In [None]:
# Make the `library` directory that sits next to the current working
# directory (i.e. <parent of cwd>/library) importable, then import Searcher
# from the `searcher` module -- presumably located in that directory.
# NOTE(review): splitting the cwd on '/' assumes POSIX-style paths.
sys.path.append('/'.join(os.getcwd().split('/')[:-1]) + '/library')
from searcher import Searcher

In [None]:
# Field names read from each record of a recording index (see
# get_predictions); the same names are used for the feature columns that
# setup_predictions appends to the DataFrame.
# Order matters: output_predictions skips the first entry ('spl_vector')
# when writing the CSV via spl_columns[1:].
spl_columns = [
    'spl_vector',
    'spl_mean',
    'spl_std',
    'spl_l2diff',
    'spl_l2diff_hourly_pct',
    'spl_entropy'
]

In [None]:
def preprocess(df, index_dir='../sonyc/indices/2017'):
    """Derive the node id and recording-index path for every row.

    Two columns are added to ``df``:

    * ``node`` -- the third ``/``-separated component of ``path``
      (``path.split('/')[2]``).
    * ``index_path`` -- ``<index_dir>/<node>_recording_index.h5``.

    Rows that share the same ``(node_timestamp, node)`` pair are then
    collapsed to their first occurrence.

    Note that ``df`` is modified in place (columns added, duplicate rows
    dropped) and the very same object is returned; pass a copy if the
    original frame must stay untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a string ``path`` column and a ``node_timestamp`` column.
    index_dir : str, optional
        Directory that holds the per-node ``*_recording_index.h5`` files.
        Defaults to the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the extra columns and without duplicates.
    """
    df['node'] = df['path'].str.split('/').str[2]

    df['index_path'] = df['node'].map(
        lambda node: f'{index_dir}/{node}_recording_index.h5')

    df.drop_duplicates(subset=['node_timestamp', 'node'], inplace=True)

    return df

In [None]:
def get_predictions(row, cache):
    """Return the ``spl_columns`` fields of the index record matching ``row``.

    The record is looked up in the HDF5 file named by ``row['index_path']``
    (dataset ``recording_index``) by exact equality between its ``timestamp``
    field and ``row['node_timestamp']``; the first match wins.

    ``cache`` is a mutable dict shared across calls so that a file is only
    (re)opened when the path changes. Keys maintained here:

    * ``'path'`` -- path of the currently open index file.
    * ``'file'`` -- the open ``h5py.File`` handle for that path.
    * ``'data'`` -- the ``recording_index`` dataset of that file.
    * ``'timestamps'`` -- DataFrame with a single ``epoch`` column holding
      the dataset's ``timestamp`` field.

    When the path changes, the previously opened file (if this function
    opened one) is closed. The most recently opened file stays open after
    the call; whoever owns ``cache`` can close ``cache['file']`` when done.

    Parameters
    ----------
    row : mapping
        Needs ``'index_path'`` and ``'node_timestamp'`` entries.
    cache : dict
        Must at least contain a ``'path'`` key (``None`` before first use).

    Returns
    -------
    list
        One value per entry of ``spl_columns``, in that order.

    Raises
    ------
    IndexError
        If no record in the index has the requested timestamp.
    """
    index_path = row['index_path']

    if index_path != cache['path']:

        print(f"old path = {cache['path']}")
        print(f"new path = {index_path}")

        # Open the new file first: if this fails, the cache still describes
        # the previous (still open) file instead of a half-updated state.
        index_file = h5py.File(index_path, 'r')

        # Release the handle opened for the previous path so that open files
        # do not pile up while walking through many index files.
        previous_file = cache.get('file')
        if previous_file is not None:
            previous_file.close()

        cache['file'] = index_file
        cache['path'] = index_path
        cache['data'] = index_file['recording_index']
        cache['timestamps'] = pd.DataFrame(cache['data']['timestamp'], columns=['epoch'])

    timestamp = row['node_timestamp']

    matches = cache['timestamps'][cache['timestamps']['epoch'] == timestamp].index
    if len(matches) == 0:
        # Same exception type as the bare `.index[0]` lookup raised before,
        # but saying which lookup failed.
        raise IndexError(
            f"timestamp {timestamp!r} not found in recording index {cache['path']!r}")

    return list(cache['data'][matches[0]][spl_columns])

In [None]:
def setup_predictions(df):
    """Append the ``spl_columns`` index features to every row of ``df``.

    Each row is resolved through ``get_predictions``. Rows are processed in
    ``(index_path, node_timestamp)`` order so that each index file is opened
    once instead of being reopened as the path flips back and forth; the
    result is then put back into the original index order.

    ``df`` itself is not modified: sorting happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``index_path`` and ``node_timestamp`` columns.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` followed by one column per ``spl_columns``
        entry, ordered by ``df``'s index.
    """
    cache = {
        'path': None,
        'data': None,
        'timestamps': None
    }

    # sort by path so that we're not jumping around to different files;
    # sort_values returns a new frame, leaving the caller's `df` as it was
    ordered = df.sort_values(by=['index_path', 'node_timestamp'])

    try:
        prediction_df = ordered.apply(get_predictions,
                                      axis='columns',
                                      result_type='expand',
                                      args=(cache,))
    finally:
        # The values returned by get_predictions are already materialised as
        # lists, so the last HDF5 file still referenced by the cache can be
        # closed here (also when a lookup failed part-way through).
        dataset = cache['data']
        if dataset is not None:
            dataset.file.close()

    prediction_df.columns = spl_columns

    df_all = pd.concat([ordered, prediction_df], axis=1)

    # undo the sorting by path to get the original order
    df_all.sort_index(inplace=True)

    return df_all

In [None]:
def output_predictions(df, suffix):
    """Persist ``df`` as ``../data/spl-<suffix>.csv`` and ``.pkl``.

    The CSV holds only ``node_timestamp``, ``precipitation[mm]``, ``node``
    and every ``spl_columns`` entry except the first one, without the index.
    The pickle holds the complete frame.
    """
    csv_path = f'../data/spl-{suffix}.csv'
    pickle_path = f'../data/spl-{suffix}.pkl'

    # spl_columns[0] is left out of the CSV; it is still part of the pickle.
    csv_columns = ['node_timestamp', 'precipitation[mm]', 'node', *spl_columns[1:]]

    df.to_csv(csv_path, columns=csv_columns, index=False)
    df.to_pickle(pickle_path)

In [None]:
# Sampling configuration.
DIFF = 30                           # keep rows whose |diff| is at most this value
SEED = 2660280232880537243 % 2**32  # presumably folded into the 32-bit range accepted by random_state -- confirm
N = 19000                           # rows drawn from each of the two classes

# Rainy recordings: filter on |diff|, then draw N rows reproducibly.
rainy = pd.read_csv('../data/audio-paths-rained.csv')
rainy_within_diff = rainy['diff'].abs() <= DIFF
rainy_reduced = rainy[rainy_within_diff].sample(N, random_state=SEED)

# Non-rainy recordings: same filter and sample size.
nonrainy = pd.read_csv('../data/audio-paths-nonrained.csv')
nonrainy_within_diff = nonrainy['diff'].abs() <= DIFF
nonrainy_reduced = nonrainy[nonrainy_within_diff].sample(N, random_state=SEED)

# Stack both samples, shuffle all rows and renumber the index from 0.
combined = pd.concat((rainy_reduced, nonrainy_reduced))
data = combined.sample(frac=1, random_state=SEED).reset_index(drop=True)

In [None]:
# Display the combined, shuffled sample of rainy and non-rainy rows.
data

In [None]:
# Add the `node` / `index_path` columns and drop duplicate recordings;
# a copy is passed so that `data` itself is left untouched.
preprocessed_data = preprocess(data.copy())

In [None]:
# Display the preprocessed rows (now including `node` and `index_path`).
preprocessed_data

In [None]:
# Attach the spl_columns features from the recording indices to every row;
# a copy is passed so that `preprocessed_data` itself is left untouched.
df = setup_predictions(preprocessed_data.copy())

In [None]:
# Display the rows together with their appended spl_columns features.
df

In [None]:
# Positional 70/30 split: the first 70% of the rows (rounded down) become the
# training set, the remaining rows the test set.
num_train = math.floor(len(df) * .7)
train_df, test_df = df.iloc[:num_train], df.iloc[num_train:]

In [None]:
# Write each split to ../data/spl-<suffix>.csv and ../data/spl-<suffix>.pkl.
output_predictions(train_df, 'train')
output_predictions(test_df, 'test')