In [None]:
import h5py
import os
import pandas as pd
import numpy as np
import sys
import altair as alt
import math
from datetime import datetime, timedelta

In [None]:
# Make `<parent of the current working directory>/library` importable so the
# project-local `searcher` module can be found. The path is built by splitting
# on '/', so this assumes a POSIX-style working directory.
# NOTE(review): `Searcher` is not referenced in the cells visible here — confirm
# whether this import is still needed.
sys.path.append('/'.join(os.getcwd().split('/')[:-1]) + '/library')
from searcher import Searcher

In [None]:
# Field names read from each prediction record in `get_predictions`; they also
# become the prediction column names added by `setup_predictions`. The leading
# "<digit>_" prefix is stripped from the CSV header by `output_predictions`.
labels = [
    '1_engine',
    '2_machinery-impact',
    '3_non-machinery-impact',
    '4_powered-saw',
    '5_alert-signal',
    '6_music',
    '7_human-voice',
    '8_dog',
]

In [None]:
def preprocess(df, version='1.0.0', year=2017):
    """Derive the sensor node and prediction-file path for every audio row.

    Unlike the earlier in-place implementation, the input frame is left
    untouched and a new frame is returned, so re-running the cell (or
    inspecting the raw frame afterwards) gives consistent results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``path`` and a column ``node_timestamp``.
        The node name is taken as the third '/'-separated component of
        ``path`` (index 2 after splitting).
    version : str, optional
        Version directory of the class-prediction files (default ``'1.0.0'``).
    year : int or str, optional
        Year directory of the class-prediction files (default ``2017``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the columns ``node`` and ``prediction_path``
        added, keeping only the first row for each
        ``(node_timestamp, node)`` pair. The original index labels are kept.
    """
    node = df['path'].str.split('/').str[2]

    prediction_path = node.map(
        lambda name: f'../sonyc/class_predictions/{version}/{year}/{name}_class_predictions.h5'
    )

    return (df.assign(node=node, prediction_path=prediction_path)
              .drop_duplicates(subset=['node_timestamp', 'node']))

In [None]:
def get_predictions(row, cache, labels):
    """Return the prediction values for one row, reusing the open file when possible.

    Parameters
    ----------
    row : mapping
        Must provide ``'prediction_path'`` (HDF5 file to read) and
        ``'node_timestamp'`` (value to look up in the file's ``timestamp``
        field).
    cache : dict
        Mutable state shared between calls. Reads/writes the keys ``'path'``,
        ``'data'`` and ``'timestamps'``, and additionally stores the open file
        handle under ``'file'`` so it can be closed when the path changes.
        The handle of the most recently opened file is left open for the
        caller to close.
    labels : list of str
        Field names to extract from the matching record.

    Returns
    -------
    list
        The values of ``labels`` from the first record whose ``timestamp``
        equals ``row['node_timestamp']`` (exact equality).

    Raises
    ------
    IndexError
        If no record in the file has the requested timestamp.
    """
    path = row['prediction_path']

    if path != cache['path']:
        # Release the previously opened file; otherwise every file switch
        # leaks an open HDF5 handle.
        previous = cache.get('file')
        if previous is not None:
            previous.close()
        cache['file'] = None

        # Invalidate the cache until the new file is fully loaded, so a failed
        # open cannot leave the new path paired with the old file's data.
        cache['path'] = None

        handle = h5py.File(path, 'r')
        cache['file'] = handle
        cache['data'] = handle['coarse']
        cache['timestamps'] = pd.DataFrame(cache['data']['timestamp'], columns=['epoch'])
        cache['path'] = path
        print(f"new path = {cache['path']}")

    timestamp = row['node_timestamp']

    epochs = cache['timestamps']['epoch']
    matches = epochs.index[epochs == timestamp]
    if len(matches) == 0:
        raise IndexError(
            f"timestamp {timestamp!r} not found in {cache['path']}"
        )
    index = matches[0]

    return list(cache['data'][index][labels])

In [None]:
def setup_predictions(df):
    """Attach one prediction column per entry in ``labels`` to every row of ``df``.

    Rows are visited in ``(prediction_path, node_timestamp)`` order so that
    ``get_predictions`` opens each prediction file once instead of jumping
    between files. The sort is done on a copy: the caller's frame keeps its
    row order, and the result is returned sorted by index label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``prediction_path`` and ``node_timestamp``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` followed by one column per entry in the
        module-level ``labels`` list, ordered by index label.
    """
    cache = {
        'path': None,
        'data': None,
        'timestamps': None
    }

    # Sort by path so that we're not jumping around to different files.
    # Not in place, so the caller's frame is not reordered as a side effect.
    ordered = df.sort_values(by=['prediction_path', 'node_timestamp'])

    try:
        prediction_df = ordered.apply(get_predictions,
                                      axis='columns',
                                      result_type='expand',
                                      args=(cache, labels))
    finally:
        # Close whichever prediction file is still open. The handle is either
        # stored in the cache directly or reachable through the cached
        # dataset's `file` attribute.
        handle = cache.get('file')
        if handle is None:
            handle = getattr(cache['data'], 'file', None)
        if handle is not None:
            handle.close()

    prediction_df.columns = labels

    df_all = pd.concat([ordered, prediction_df], axis=1)

    # Undo the sorting by path: order the result by index label again.
    return df_all.sort_index()

In [None]:
def output_predictions(df, suffix):
    """Write the identifying columns and the prediction columns of ``df`` to CSV.

    The file is written to ``../data/predictions-{suffix}.csv`` without the
    index. Prediction columns are selected by the module-level ``labels`` list
    and appear in the header without their ordering prefix (everything up to
    and including the first underscore), e.g. ``'1_engine'`` -> ``'engine'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``node_timestamp``, ``precipitation[mm]``, ``node`` and
        one column per entry in ``labels``.
    suffix : str
        Inserted into the output file name (e.g. ``'train'`` or ``'test'``).
    """
    id_columns = ['node_timestamp', 'precipitation[mm]', 'node']

    # Split on the first underscore instead of dropping a fixed two
    # characters, so prefixes with more than one digit are removed correctly.
    clean_labels = [label.split('_', 1)[-1] for label in labels]

    df.to_csv(f'../data/predictions-{suffix}.csv',
              columns=id_columns + labels,
              index=False,
              header=id_columns + clean_labels)

In [None]:
# Sampling configuration.
DIFF = 30                            # keep rows whose |diff| is at most this value
SEED = 2660280232880537243 % 2**32   # random_state shared by every sampling step
N = 19000                            # rows drawn from each of the two inputs


def _sample_within_diff(frame):
    """Keep rows with ``|diff| <= DIFF`` and draw ``N`` of them using ``SEED``."""
    within_limit = frame['diff'].abs() <= DIFF
    return frame[within_limit].sample(N, random_state=SEED)


rainy = pd.read_csv('../data/audio-paths-rained.csv')
rainy_reduced = _sample_within_diff(rainy)

nonrainy = pd.read_csv('../data/audio-paths-nonrained.csv')
nonrainy_reduced = _sample_within_diff(nonrainy)

# Stack both samples, shuffle the rows, and renumber the index from zero.
data = pd.concat((rainy_reduced, nonrainy_reduced))
data = data.sample(frac=1, random_state=SEED)
data = data.reset_index(drop=True)

In [None]:
# Display the combined, shuffled sample built in the previous cell.
data

In [None]:
# Add the `node` and `prediction_path` columns and drop rows that repeat a
# (node_timestamp, node) pair.
df_pre = preprocess(data)

In [None]:
# Look up each row in its prediction file and attach one column per entry in
# `labels`.
df = setup_predictions(df_pre)

In [None]:
# Display the frame with the prediction columns attached.
df

In [None]:
# Positional 70/30 split: the first 70% of rows (rounded down) form the
# training set, the remaining rows form the test set.
num_train = math.floor(0.7 * len(df))
train, test = df.iloc[:num_train], df.iloc[num_train:]

In [None]:
# Write the two splits to ../data/predictions-train.csv and
# ../data/predictions-test.csv.
output_predictions(train, 'train')
output_predictions(test, 'test')