In [1]:
import pandas as pd
import random
import csv

In [62]:
def stratify_csv(input_csv_path, output_csv_path='Stratified_Peru_2019_AudioMoth_Data.csv') -> bool:
  """Write an hour-stratified random sample of AudioMoth clips to a CSV file.

  Clips whose ``Duration`` is below 60 are discarded. For every AudioMoth
  device (``AudioMothCode``) that still has at least one clip for each of the
  24 hours of the day, one clip is picked at random per hour (using the
  ``random`` module, so ``random.seed`` makes the result reproducible) and
  written to the output file. Devices that do not cover all 24 hours are
  skipped entirely.

  The hour of a clip is parsed from characters 12-13 of its ``Comment``
  value (presumably text of the form ``"Recorded at HH:MM:SS ..."`` --
  verify against the real data).

  Args:
    input_csv_path: Path of the CSV file to read. It must contain at least
      the ``AudioMothCode``, ``Duration`` and ``Comment`` columns.
    output_csv_path: Path of the CSV file to write. Defaults to the file
      name this function has always used.

  Returns:
    True if the output file was written, False if anything went wrong
    (unreadable input, missing column, malformed ``Comment``, write error).
    On failure the output file is not created or modified.
  """
  # Column names written as the first row of the output CSV file. The data
  # rows themselves keep the column order of the input file.
  header = ['AudioMothCode','AudioMothID','SourceFile','Directory','FileName',
          'FileSize','Encoding','NumChannels','SampleRate','AvgBytesPerSec',
          'BitsPerSample','StartDateTime','Duration','Error','Comment','Artist',
          'FileCreateDate','FileType','FileTypeExtension','MIMEType']
  HOURS_IN_DAY = 24
  MIN_DURATION = 60
  all_hours = set(range(HOURS_IN_DAY))

  try:
    df = pd.read_csv(input_csv_path)

    # Filter out all the clips that are less than a minute long
    long_recordings_df = df[df['Duration'] >= MIN_DURATION]

    # Collect every sampled row first and only touch the output file once the
    # whole input has been processed, so a failure half-way through cannot
    # leave a truncated or partial output file behind.
    sampled_rows = []

    # sort=False keeps the devices in order of first appearance in the input
    for _, device_df in long_recordings_df.groupby('AudioMothCode', sort=False):
      # Hour of day of each clip, kept beside the DataFrame (aligned on its
      # index) instead of as an extra column that would have to be dropped
      # again before writing.
      hour_values = [int(comment[12:14]) for comment in device_df['Comment']]

      # Only stratify devices that have at least one clip for every hour
      if not all_hours.issubset(hour_values):
        continue

      hours = pd.Series(hour_values, index=device_df.index)
      for hour in range(HOURS_IN_DAY):
        candidates = device_df[hours == hour]
        # Pick a random clip recorded during this hour
        sampled_rows.append(candidates.iloc[random.randrange(len(candidates))])

    # newline='' is what the csv module requires to avoid blank lines on
    # Windows; the context manager closes the file even if writing fails.
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as output_file:
      writer = csv.writer(output_file)
      writer.writerow(header)
      writer.writerows(sampled_rows)
  except Exception:
    # Any failure is reported through the return value; KeyboardInterrupt and
    # SystemExit are deliberately not intercepted.
    return False

  # If we made it here, then no errors occurred
  return True