In [None]:
import h5py
import os
import pandas as pd
import numpy as np
import sys
import altair as alt
import math
from datetime import datetime, timedelta

In [None]:
# Make the sibling "library" directory (next to the current working
# directory's parent) importable. Built with os.path so the parent lookup
# does not depend on '/' being the path separator.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'library'))
from searcher import Searcher

In [None]:
# Coarse-granularity class names, in the order their columns are emitted.
coarse_labels = [
    '1_engine',
    '2_machinery-impact',
    '3_non-machinery-impact',
    '4_powered-saw',
    '5_alert-signal',
    '6_music',
    '7_human-voice',
    '8_dog',
]

# Fine-granularity class names, laid out one row per leading digit.
# NOTE(review): the leading digit looks like the id of the matching coarse
# class above -- confirm against the dataset taxonomy.
fine_labels = [
    '1-1_small-sounding-engine', '1-2_medium-sounding-engine', '1-3_large-sounding-engine',
    '2-1_rock-drill', '2-2_jackhammer', '2-3_hoe-ram', '2-4_pile-driver',
    '3-1_non-machinery-impact',
    '4-1_chainsaw', '4-2_small-medium-rotating-saw', '4-3_large-rotating-saw',
    '5-1_car-horn', '5-2_car-alarm', '5-3_siren', '5-4_reverse-beeper',
    '6-1_stationary-music', '6-2_mobile-music', '6-3_ice-cream-truck',
    '7-1_person-or-small-group-talking', '7-2_person-or-small-group-shouting',
    '7-3_large-crowd', '7-4_amplified-speech',
    '8-1_dog-barking-whining',
]

# Granularity name -> ordered list of class names for that granularity.
labels = dict(coarse=coarse_labels, fine=fine_labels)

In [None]:
def preprocess(df, prediction_dir='../sonyc/class_predictions/1.0.0/2017'):
    """Derive node and prediction-file columns and drop duplicate recordings.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column 'path' and a column 'node_timestamp'.
        The third '/'-separated component of 'path' is taken as the node
        name (presumably the sensor id -- verify against the path layout).
    prediction_dir : str, optional
        Directory holding one '<node>_class_predictions.h5' file per node.
        Defaults to the location previously hard-coded here.

    Returns
    -------
    pandas.DataFrame
        `df` plus the columns 'node' and 'prediction_path', keeping only the
        first row for each ('node_timestamp', 'node') pair. The original
        index labels are preserved.
    """
    def prediction_file(node):
        return f'{prediction_dir}/{node}_class_predictions.h5'

    return (
        df.assign(node=lambda frame: frame['path'].str.split('/').str[2])
          # second assign so that 'node' already exists when it is mapped
          .assign(prediction_path=lambda frame: frame['node'].map(prediction_file))
          .drop_duplicates(subset=['node_timestamp', 'node'])
    )

In [None]:
def get_predictions(row, cache, granularity):
    """Return the class scores stored for one row's file and timestamp.

    Parameters
    ----------
    row : mapping
        Must provide 'prediction_path' (HDF5 file to read) and
        'node_timestamp' (value to look up in the file's 'timestamp' field).
    cache : dict
        Mutable state shared across calls, with keys 'path', 'data' and
        'timestamps'. It is refreshed whenever `row` points at a different
        file than the previous call. The open file handle is additionally
        kept under 'file' so it can be closed on the next switch; the most
        recently opened file stays open until the caller closes
        ``cache['file']`` or the cache is discarded.
    granularity : str
        Name of the dataset inside the HDF5 file and key into the
        module-level `labels` mapping.

    Returns
    -------
    list
        One value per entry of ``labels[granularity]``, in that order, taken
        from the first record whose timestamp equals `row['node_timestamp']`.

    Raises
    ------
    IndexError
        If the timestamp is not present in the file.
    """
    path = row['prediction_path']
    if path != cache['path']:
        handle = h5py.File(path, 'r')
        try:
            data = handle[granularity]
            timestamps = pd.DataFrame(data['timestamp'], columns=['epoch'])
        except Exception:
            # Do not leak the handle or leave the cache half-updated.
            handle.close()
            raise

        # Only one prediction file is needed at a time: release the one
        # opened for the previous path before replacing it.
        previous = cache.get('file')
        if previous is not None:
            previous.close()

        cache['file'] = handle
        cache['path'] = path
        cache['data'] = data
        cache['timestamps'] = timestamps
        print(f"new path = {cache['path']}")

    timestamp = row['node_timestamp']

    epochs = cache['timestamps']['epoch']
    matches = cache['timestamps'].index[(epochs == timestamp).to_numpy()]
    if len(matches) == 0:
        raise IndexError(
            f"timestamp {timestamp!r} not found in {cache['path']!r} "
            f"(dataset {granularity!r})"
        )

    return list(cache['data'][matches[0]][labels[granularity]])

In [None]:
def setup_predictions(df, granularity):
    """Attach one prediction column per class label to every row of `df`.

    Each row is passed to `get_predictions`, which reads the scores from the
    file named in the row's 'prediction_path'. The input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'prediction_path' and 'node_timestamp' columns.
    granularity : str
        Key into the module-level `labels` mapping; also forwarded to
        `get_predictions`.

    Returns
    -------
    pandas.DataFrame
        The columns of `df` followed by one column per entry of
        ``labels[granularity]``, with rows ordered by index.
    """
    class_names = labels[granularity]

    # Nothing to look up: hand back the frame with empty score columns
    # instead of failing on the column assignment further down.
    if df.empty:
        return df.reindex(columns=[*df.columns, *class_names])

    cache = {
        'path': None,
        'data': None,
        'timestamps': None
    }

    # Visit rows file by file so each prediction file is loaded once. A
    # sorted copy is used so the caller's frame keeps its own row order.
    ordered = df.sort_values(by=['prediction_path', 'node_timestamp'])

    try:
        prediction_df = ordered.apply(get_predictions,
                                      axis='columns',
                                      result_type='expand',
                                      args=(cache, granularity))
    finally:
        # cache['data'] is whatever get_predictions stored last; when it
        # exposes a `.file` (as h5py datasets do), close that file so the
        # handle does not outlive this call.
        open_file = getattr(cache['data'], 'file', None)
        if open_file is not None:
            open_file.close()

    prediction_df.columns = class_names

    # Both frames share the same index, so this lines the scores up per row.
    df_all = pd.concat([ordered, prediction_df], axis=1)

    # Undo the by-file ordering.
    return df_all.sort_index()

In [None]:
def output_predictions(df, suffix, granularity):
    """Write the metadata and class-score columns of `df` to a CSV file.

    The file is '../data/predictions-<granularity>-<suffix>.csv'. Score
    columns are selected by their full label name, but the header row shows
    only the part of each label after its first underscore. The index is not
    written.
    """
    meta_columns = ['node_timestamp', 'precipitation[mm]', 'node']
    score_columns = labels[granularity]

    short_names = []
    for label in score_columns:
        short_names.append(label.split('_')[1])

    destination = f'../data/predictions-{granularity}-{suffix}.csv'
    df.to_csv(destination,
              columns=meta_columns + score_columns,
              header=meta_columns + short_names,
              index=False)

In [None]:
# Sampling configuration.
DIFF = 30                            # keep rows whose |diff| is at most this
SEED = 2660280232880537243 % 2**32   # reduced below 2**32 before use as random_state
N = 19000                            # rows drawn from each source file

# Rows from the "rained" file, filtered on |diff| and down-sampled to N.
rainy = pd.read_csv('../data/audio-paths-rained.csv')
rainy_reduced = (
    rainy.loc[lambda frame: frame['diff'].abs() <= DIFF]
         .sample(n=N, random_state=SEED)
)

# Same treatment for the "nonrained" file, so both groups contribute N rows.
nonrainy = pd.read_csv('../data/audio-paths-nonrained.csv')
nonrainy_reduced = (
    nonrainy.loc[lambda frame: frame['diff'].abs() <= DIFF]
            .sample(n=N, random_state=SEED)
)

# Stack the two samples, shuffle all rows, and renumber the index from 0.
data = (
    pd.concat([rainy_reduced, nonrainy_reduced])
      .sample(frac=1, random_state=SEED)
      .reset_index(drop=True)
)

In [None]:
# Display the combined sample built from the rained and nonrained files.
data

In [None]:
# Work on a copy of the sample: preprocess it, then attach the coarse scores.
df_coarse = (
    data.copy()
        .pipe(preprocess)
        .pipe(setup_predictions, 'coarse')
)

In [None]:
# Display the sample with the coarse prediction columns attached.
df_coarse

In [None]:
# Positional 70/30 split: the first 70% of rows (rounded down) form the
# training set and the remaining rows form the test set.
num_train = math.floor(len(df_coarse) * .7)
train_coarse, test_coarse = df_coarse.iloc[:num_train], df_coarse.iloc[num_train:]

In [None]:
# Write the coarse train partition, then the coarse test partition, to CSV.
for split_name, split_frame in (('train', train_coarse), ('test', test_coarse)):
    output_predictions(split_frame, split_name, 'coarse')

In [None]:
# Work on a copy of the sample: preprocess it, then attach the fine scores.
df_fine = (
    data.copy()
        .pipe(preprocess)
        .pipe(setup_predictions, 'fine')
)

In [None]:
# Display the sample with the fine prediction columns attached.
df_fine

In [None]:
# Positional 70/30 split: the first 70% of rows (rounded down) form the
# training set and the remaining rows form the test set.
num_train = math.floor(len(df_fine) * .7)
train_fine, test_fine = df_fine.iloc[:num_train], df_fine.iloc[num_train:]

In [None]:
# Write the fine train partition, then the fine test partition, to CSV.
for split_name, split_frame in (('train', train_fine), ('test', test_fine)):
    output_predictions(split_frame, split_name, 'fine')