# Investigate the contents of new test data database

A new test set has been made available. Investigate the contents of this test data to make sure it doesn't contain the same data as the training set. If it does, these data will need remnoving from the training set. 

In [42]:
from pathlib import Path
BASE_PATH = Path.cwd().parent.parent
data_path = BASE_PATH / 'data'

import pandas as pd
import sys
sys.path.append(str(BASE_PATH))
from src import utils


In [43]:
test_csv = pd.read_csv(data_path / 'raw' / 'SingleSpecies_all.csv', low_memory=False)
test_csv.columns

Index(['organization', 'project_id', 'location', 'location_id',
       'location_buffer_m', 'longitude', 'latitude', 'equipment_make',
       'equipment_model', 'recording_id', 'recording_date_time', 'task_id',
       'aru_task_status', 'task_duration', 'task_method', 'species_code',
       'species_common_name', 'species_scientific_name', 'individual_order',
       'tag_id', 'individual_count', 'vocalization', 'detection_time',
       'tag_duration', 'rms_peak_dbfs', 'tag_is_verified', 'tag_rating',
       'observer', 'observer_id', 'species_individual_comments',
       'task_comments', 'wildtrax_url', 'recording_url', 'target'],
      dtype='object')

In [44]:
test_csv.vocalization.value_counts()

vocalization
Song         25222
Non-vocal     5326
Call           226
Name: count, dtype: int64

In [45]:
osfls = test_csv.loc[test_csv.species_code == 'OSFL']
osfls.vocalization.value_counts()

vocalization
Song    301
Call    110
Name: count, dtype: int64

In [46]:
osfl_songs = osfls.loc[osfls.vocalization == 'Song']

In [47]:
utils.keep_cols

['organization',
 'project_id',
 'location_id',
 'recording_id',
 'recording_date_time',
 'species_code',
 'species_common_name',
 'detection_time',
 'task_duration',
 'tag_duration',
 'tag_id',
 'recording_url',
 'task_method',
 'latitude',
 'longitude',
 'individual_order']

In [51]:
keep_cols = utils.keep_cols
remove_cols = ['file_type', 'project', 'clip_url', 'media_url']
for i in remove_cols:
    keep_cols.remove(i)

ValueError: list.remove(x): x not in list

In [52]:
osfls[keep_cols].head()

Unnamed: 0,organization,project_id,location_id,recording_id,recording_date_time,species_code,species_common_name,detection_time,task_duration,tag_duration,tag_id,recording_url,task_method,latitude,longitude,individual_order
9789,BU,787,98510,255472,2016-06-16 04:12:00,OSFL,Olive-sided Flycatcher,25.46,60s,1.25,2269294.0,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,,57.443536,-111.496876,1.0
9798,BU,787,98531,255412,2016-07-01 04:57:55,OSFL,Olive-sided Flycatcher,30.03,60s,1.04,2265662.0,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,,57.523776,-111.41889,1.0
9824,BU,787,98505,255913,2017-06-28 04:00:00,OSFL,Olive-sided Flycatcher,3.41,60s,1.56,2294310.0,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,,57.522751,-111.40527,1.0
9825,BU,787,98505,255913,2017-06-28 04:00:00,OSFL,Olive-sided Flycatcher,19.29,60s,1.4,2294316.0,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,,57.522751,-111.40527,1.0
9826,BU,787,98505,255913,2017-06-28 04:00:00,OSFL,Olive-sided Flycatcher,25.9,60s,1.33,2294318.0,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,,57.522751,-111.40527,1.0


In [53]:
osfl_ids = osfl_songs.recording_id.unique()
len(osfl_ids)

39

In [54]:
df_raw = pd.read_csv(data_path / 'raw' / 'TrainingData_BU&Public_CWS_with_rec_links.csv')
df_raw.columns

  df_raw = pd.read_csv(data_path / 'raw' / 'TrainingData_BU&Public_CWS_with_rec_links.csv')


Index(['organization', 'project', 'project_id', 'location', 'location_id',
       'recording_date_time', 'recording_id', 'task_method', 'task_id',
       'aru_task_status', 'species_code', 'species_common_name',
       'species_scientific_name', 'species_class', 'detection_time',
       'task_duration', 'tag_duration', 'min_tag_freq', 'max_tag_freq',
       'tag_id', 'individual_order', 'vocalization', 'abundance', 'tag_rating',
       'tag_is_verified', 'clip_channel_used', 'observer', 'observer_id',
       'verifier_id', 'left_full_freq_tag_rms_peak_dbfs',
       'left_full_freq_tag_rms_trough_dbfs', 'left_full_freq_tag_pk_count',
       'left_full_freq_tag_dc_offset', 'left_full_freq_tag_min_level',
       'left_full_freq_tag_max_level', 'left_full_freq_tag_peak_level_dbfs',
       'left_freq_filter_tag_rms_peak_dbfs',
       'left_freq_filter_tag_rms_trough_dbfs', 'left_freq_filter_tag_pk_count',
       'left_freq_filter_tag_dc_offset', 'left_freq_filter_tag_min_level',
       'lef

In [60]:
df_raw_ids = df_raw.recording_id.unique()

In [56]:
# find the intersection of the two sets
common_ids = set(osfl_ids).intersection(set(df_raw_ids))

In [61]:
len(common_ids), len(set(osfl_ids))

(39, 39)

In [62]:
osfls.columns

Index(['organization', 'project_id', 'location', 'location_id',
       'location_buffer_m', 'longitude', 'latitude', 'equipment_make',
       'equipment_model', 'recording_id', 'recording_date_time', 'task_id',
       'aru_task_status', 'task_duration', 'task_method', 'species_code',
       'species_common_name', 'species_scientific_name', 'individual_order',
       'tag_id', 'individual_count', 'vocalization', 'detection_time',
       'tag_duration', 'rms_peak_dbfs', 'tag_is_verified', 'tag_rating',
       'observer', 'observer_id', 'species_individual_comments',
       'task_comments', 'wildtrax_url', 'recording_url', 'target'],
      dtype='object')

# Re-group the test dataframe by recording id and aggregate the detection times

In [63]:
osfl_songs.groupby('recording_id').agg({'detection_time': lambda x: list(x), 'tag_duration': lambda x: list(x) })

Unnamed: 0_level_0,detection_time,tag_duration
recording_id,Unnamed: 1_level_1,Unnamed: 2_level_1
255412,[30.03],[1.04]
255419,"[58.31, 41.78]","[1.29, 1.14]"
255420,[17.21],[0.83]
255421,"[54.48, 40.61, 11.75, 31.54]","[1.34, 1.61, 0.98, 1.28]"
255431,"[55.16, 17.14]","[0.86, 0.78]"
255432,"[0.21, 30.76, 12.28]","[1.69, 0.95, 1.71]"
255433,"[24.33, 4.36]","[1.08, 1.5]"
255449,"[30.21, 17.39, 58.85, 6.19, 40.53]","[1.1, 1.09, 1.14, 1.46, 1.19]"
255454,"[58.58, 11.33, 18.9, 26.44, 25.25, 30.2, 34.08...","[0.91, 1.15, 1.09, 1.05, 1.09, 1.09, 1.04, 1.2..."
255456,[2.81],[0.88]


Conclusion: 

The test set contains recordings from the training data, so these will need to be removed if this data is used for training - otherwise the model will be tested on examples it has already been trained on and we won't see how the model generalises to new data.

It looks as though this is a fully annotated set of recordings - but I'll need to check this. 