# 2. Build a labelled dataframe from a cleaned csv file
- Download recordings to train the model
- Make a dataframe indexed by 3-second windows along each recording
- Generate target-present and target-absent tags for each window by looking at human-labelled clips

In [1]:
from pathlib import Path
import sys
import pandas as pd

# Project root, taken as two levels above the current working directory.
# NOTE(review): this assumes the kernel's working directory is the notebook's
# own folder — confirm if the notebook is moved or launched from elsewhere.
BASE_PATH = Path.cwd().parent.parent
data_path = BASE_PATH / "data"

# Make the local modules in src/data importable (e.g. `build`). The membership
# check keeps the cell idempotent: re-running it no longer appends a duplicate
# entry to sys.path each time.
_src_data_dir = str(BASE_PATH / "src" / "data")
if _src_data_dir not in sys.path:
    sys.path.append(_src_data_dir)

In [3]:
import build

Load the processed data. This is a cleaned version of the WildTrax csv data with additional columns for `recording_url`, `latitude` and `longitude`.

In [4]:
# Read the cleaned metadata pickle from data/interim and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by this project.
cleaned_metadata_path = data_path / 'interim' / 'cleaned_metadata.pkl'
processed_df = pd.read_pickle(cleaned_metadata_path)
processed_df.head()

Unnamed: 0,organization,project,project_id,location,location_id,recording_date_time,recording_id,task_method,task_id,aru_task_status,...,spectrogram_url,clip_url,sensorId,tasks,status,recording_url,latitude,longitude,location_buffer_m,file_type
1623,BU,Alberta Archetypes,1501,P-E0-1-10,308678,2022-06-05 06:51:00,416962,no_restrictions,596169,Transcribed,...,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,ARU,357,Active,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,52.64404,-115.14051,,flac
1752,BU,Amplitude Quality Testing 2020,293,AM-403-SE2,36043,2017-06-15 04:46:00,92051,no_restrictions,87956,Transcribed,...,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,ARU,174,Published - Private,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,54.607774,-110.681271,,flac
1758,BU,Amplitude Quality Testing 2020,293,AM-403-SE2,36043,2017-06-15 04:46:00,92051,no_restrictions,87898,Transcribed,...,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,ARU,174,Published - Private,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,54.607774,-110.681271,,flac
1761,BU,Amplitude Quality Testing 2020,293,AM-403-SE2,36043,2017-06-15 04:46:00,92051,no_restrictions,87840,Transcribed,...,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,ARU,174,Published - Private,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,54.607774,-110.681271,,flac
1764,BU,Amplitude Quality Testing 2020,293,AM-403-SE2,36043,2017-06-15 04:46:00,92051,no_restrictions,87927,Transcribed,...,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,ARU,174,Published - Private,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,54.607774,-110.681271,,flac


If you have an existing test set, you'll want to make sure it doesn't end up in the training data - otherwise the model may be tested on audio it has already seen the labels for.

In [5]:
# Load the pre-existing test set so it can be handed to the dataset builder
# and kept out of the training data. low_memory=False makes pandas parse the
# file in a single pass instead of in chunks.
existing_test_set_path = data_path / 'raw' / "SingleSpecies_all.csv"
existing_test_set = pd.read_csv(existing_test_set_path, low_memory=False)

In [8]:
# Build the labelled window dataframes for the target species.
# download_n=0 requests no new downloads (the log reports "downloading 0 clips").
# NOTE(review): seed presumably fixes the random split so it is reproducible —
# confirm against build.new_labelled_df.
train_and_valid_df, test_df = build.new_labelled_df(
    processed_df,
    target_species="OSFL",
    download_n=0,
    existing_test_set=existing_test_set,
    seed=42,
)

3512 not downloaded
downloading 0 clips
skipped 0 previously downloaded files
dropped 1 locations from training set

--------------------------------------------------
train set
recordings per task method = 
 task_method
1SPT               257
1SPM               115
no_restrictions     11
Name: count, dtype: int64
total recordings = 383

Tags generated from each tagging method:
                 target_present  target_absent
task_method                                   
1SPM                       17.0           98.0
1SPT                       17.0          240.0
no_restrictions             1.0           10.0
total present clips =  35
total absent clips =  348
total available human labelled tags = 383

--------------------------------------------------
valid set
recordings per task method = 
 task_method
1SPT    213
1SPM     19
Name: count, dtype: int64
total recordings = 232

Tags generated from each tagging method:
             target_present  target_absent
task_method                

# Save the test split somewhere out of the way
Don't look at it until after model training and hyperparameter tuning is complete. This is the data the model will be evaluated on after training. 

# Save the training and validation set in a different folder
This is the data the model will be trained on and validated against during training.

In [23]:
# Output folders for the two splits; the test set is kept in its own folder so
# it stays separate from the data used during training.
train_and_valid_set_dir = data_path / 'interim' / 'train_and_valid_set'
test_set_dir = data_path / 'interim' / 'test_set'

# parents=True also creates data/interim when it is missing (a plain mkdir
# would raise FileNotFoundError), and exist_ok=True makes the cell safe to
# re-run without a separate exists() check. If a regular file already occupies
# one of these paths, mkdir raises FileExistsError here rather than letting the
# later save fail.
for output_dir in (train_and_valid_set_dir, test_set_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

In [25]:
# Write each split into the folder created for it in the previous cell.
# Reusing the directory variables (instead of rebuilding the paths from string
# literals) guarantees the files land in the directories that were created.
train_and_valid_df.to_pickle(train_and_valid_set_dir / 'train_and_valid_set.pkl')
test_df.to_pickle(test_set_dir / 'test_set.pkl')

In [26]:
# Preview the first five windows of the saved training/validation frame.
train_and_valid_df.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,recording_url,task_method,project,detection_time,tag_duration,latitude,longitude,file_type,media_url,individual_order,location_id,filename,target_present,target_absent,is_valid
file,start_time,end_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
../../data/raw/recordings/OSFL/recording-100257.flac,0.0,3.0,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,1SPM,Boreal Wetland Community Monitoring,"[3.49, 22.01, 62.41, 63.6, 121.08, 125.2]","[0.81, 0.78, 0.85, 0.9, 0.81, 0.76]",57.327953,-111.339399,flac,https://portal.wildtrax.ca/home/aru-tasks/reco...,2.0,403,recording-100257.flac,0.0,1.0,False
../../data/raw/recordings/OSFL/recording-100257.flac,1.5,4.5,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,1SPM,Boreal Wetland Community Monitoring,"[3.49, 22.01, 62.41, 63.6, 121.08, 125.2]","[0.81, 0.78, 0.85, 0.9, 0.81, 0.76]",57.327953,-111.339399,flac,https://portal.wildtrax.ca/home/aru-tasks/reco...,2.0,403,recording-100257.flac,1.0,0.0,False
../../data/raw/recordings/OSFL/recording-100257.flac,3.0,6.0,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,1SPM,Boreal Wetland Community Monitoring,"[3.49, 22.01, 62.41, 63.6, 121.08, 125.2]","[0.81, 0.78, 0.85, 0.9, 0.81, 0.76]",57.327953,-111.339399,flac,https://portal.wildtrax.ca/home/aru-tasks/reco...,2.0,403,recording-100257.flac,1.0,0.0,False
../../data/raw/recordings/OSFL/recording-100257.flac,21.0,24.0,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,1SPM,Boreal Wetland Community Monitoring,"[3.49, 22.01, 62.41, 63.6, 121.08, 125.2]","[0.81, 0.78, 0.85, 0.9, 0.81, 0.76]",57.327953,-111.339399,flac,https://portal.wildtrax.ca/home/aru-tasks/reco...,2.0,403,recording-100257.flac,1.0,0.0,False
../../data/raw/recordings/OSFL/recording-100257.flac,61.5,64.5,https://wildtrax-aru.s3.us-west-2.amazonaws.co...,1SPM,Boreal Wetland Community Monitoring,"[3.49, 22.01, 62.41, 63.6, 121.08, 125.2]","[0.81, 0.78, 0.85, 0.9, 0.81, 0.76]",57.327953,-111.339399,flac,https://portal.wildtrax.ca/home/aru-tasks/reco...,2.0,403,recording-100257.flac,1.0,0.0,False
