In [None]:
import time
import os

import pandas as pd
import numpy as np
import librosa

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from pydub import AudioSegment
from pydub.utils import mediainfo

from monitoring import *
from audio import *

In [19]:
taxonomy = pd.read_csv('data/taxonomy.csv')
df = pd.read_csv('data/train.csv')

In [20]:
taxonomy

Unnamed: 0,primary_label,inat_taxon_id,scientific_name,common_name,class_name
0,1139490,1139490,Ragoniella pulchella,Ragoniella pulchella,Insecta
1,1192948,1192948,Oxyprora surinamensis,Oxyprora surinamensis,Insecta
2,1194042,1194042,Copiphora colombiae,Copiphora colombiae,Insecta
3,126247,126247,Leptodactylus insularum,Spotted Foam-nest Frog,Amphibia
4,1346504,1346504,Neoconocephalus brachypterus,Neoconocephalus brachypterus,Insecta
...,...,...,...,...,...
201,yehcar1,1432779,Milvago chimachima,Yellow-headed Caracara,Aves
202,yelori1,9352,Icterus nigrogularis,Yellow Oriole,Aves
203,yeofly1,16567,Tolmomyias sulphurescens,Yellow-olive Flycatcher,Aves
204,yercac1,10359,Cacicus cela,Yellow-rumped Cacique,Aves


In [21]:
print(df.columns)
df

Index(['primary_label', 'secondary_labels', 'type', 'filename', 'collection',
       'rating', 'url', 'latitude', 'longitude', 'scientific_name',
       'common_name', 'author', 'license'],
      dtype='object')


Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,common_name,author,license
0,1139490,[''],[''],1139490/CSA36385.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0
1,1139490,[''],[''],1139490/CSA36389.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0
2,1192948,[''],[''],1192948/CSA36358.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0
3,1192948,[''],[''],1192948/CSA36366.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.2800,-73.8582,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0
4,1192948,[''],[''],1192948/CSA36373.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28559,ywcpar,[''],[''],ywcpar/iNat77392.ogg,iNat,0.0,https://static.inaturalist.org/sounds/77392.m4a,7.6921,-80.3379,Amazona ochrocephala,Yellow-crowned Parrot,Yennifer Alfaro,cc-by-nc 4.0
28560,ywcpar,[''],[''],ywcpar/iNat78624.ogg,iNat,0.0,https://static.inaturalist.org/sounds/78624.wav,8.9918,-79.4877,Amazona ochrocephala,Yellow-crowned Parrot,Evan Centanni,cc-by-nc-sa 4.0
28561,ywcpar,[''],[''],ywcpar/iNat789234.ogg,iNat,0.0,https://static.inaturalist.org/sounds/789234.wav,9.2316,-70.2041,Amazona ochrocephala,Yellow-crowned Parrot,Henrry,cc-by 4.0
28562,ywcpar,[''],[''],ywcpar/iNat819873.ogg,iNat,0.0,https://static.inaturalist.org/sounds/819873.mp3,10.5838,-66.8545,Amazona ochrocephala,Yellow-crowned Parrot,Alejandro Luy,cc-by-nc 4.0


From here i am going to build a basic function that opens a given file from a filename and extracts information about it from the metadata of the file.

In [22]:
def get_audio_metadata(filename: str) -> dict:
    """
    Extract audio metadata from a file.

    Args:
        filename (str): Path to the audio file as listed in the dataframe.

    Returns:
        dict: A dictionary containing the following metadata:
            - sample_rate (str): The sample rate of the audio file.
            - bit_rate (str): The bit rate of the audio file.
            - sample_fmt (str): The sample format of the audio file.
            - duration (int): The duration of the audio file in milliseconds.
            - num_channels (int): The number of channels in the audio file.
            - file_codec (str): The codec used in the audio file.
            - file_codec_long_name (str): The long name of the codec.
            - array_size (int): The size of the audio sample array for a single channel.
    """
    filename = 'data/train_audio/' + filename
    info = mediainfo(filename)
    sound = AudioSegment.from_file(filename)
    array = np.array(sound.get_array_of_samples())
    meta_data = {
        'sample_rate': info['sample_rate'],
        'bit_rate': info['bit_rate'],
        'sample_fmt': info['sample_fmt'],
        'duration': len(sound),
        'num_channels': sound.channels,
        'file_codec': info['codec_name'],
        'file_codec_long_name': info['codec_long_name'],
        'array_size': len(array)
    }
    return meta_data


In [23]:

get_audio_metadata(df.iloc[0]['filename'])

{'sample_rate': '32000',
 'bit_rate': '62385',
 'sample_fmt': 'fltp',
 'duration': 98853,
 'num_channels': 1,
 'file_codec': 'vorbis',
 'file_codec_long_name': 'Vorbis',
 'array_size': 3163308}

In [24]:
display = None
for i in range(10):
    time.sleep(0.1)
    display = print_progress_bar(i+1, 10, "test", display_handler=display)

In [25]:
columns = df.columns.tolist()
columns.append('sample_rate')
columns.append('bit_rate')
columns.append('sample_fmt')
columns.append('duration')
columns.append('num_channels')
columns.append('file_codec')
columns.append('file_codec_long_name')
columns.append('array_size')
new_df = pd.DataFrame(columns=columns)

length = len(df)
start = time.time()
display = None


if not os.path.exists('exports/train_metadata.csv'):
    for index, row in df.iterrows():
        display = print_progress_bar(index, length, calculate_time_remaining(start, index, length), display)
        filename = row['filename']
        meta_data = get_audio_metadata(filename)
        new_row = {
            'filename': filename,
            'primary_label': row['primary_label'],
            'secondary_labels': row['secondary_labels'],
            'type': row['type'],
            'collection': row['collection'],
            'rating': row['rating'],
            'url': row['url'],
            'latitude': row['latitude'],
            'longitude': row['longitude'],
            'scientific_name': row['scientific_name'],
            'common_name': row['common_name'],
            'author': row['author'],
            'license': row['license'],
            'sample_rate': meta_data['sample_rate'],
            'bit_rate': meta_data['bit_rate'],
            'sample_fmt': meta_data['sample_fmt'],
            'duration': meta_data['duration'],
            'num_channels': meta_data['num_channels'],
            'file_codec': meta_data['file_codec'],
            'file_codec_long_name': meta_data['file_codec_long_name'],
            'array_size': meta_data['array_size']
        }
        new_df = pd.concat([new_df, pd.DataFrame([new_row])], ignore_index=True)
    new_df.to_csv('exports/train_metadata.csv', index=False)
else:
    new_df = pd.read_csv('exports/train_metadata.csv')

In [26]:
new_df

Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,common_name,author,license,sample_rate,bit_rate,sample_fmt,duration,num_channels,file_codec,file_codec_long_name,array_size
0,1139490,[''],[''],1139490/CSA36385.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0,32000,62385,fltp,98853,1,vorbis,Vorbis,3163308
1,1139490,[''],[''],1139490/CSA36389.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0,32000,62574,fltp,96538,1,vorbis,Vorbis,3089207
2,1192948,[''],[''],1192948/CSA36358.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0,32000,61940,fltp,116600,1,vorbis,Vorbis,3731194
3,1192948,[''],[''],1192948/CSA36366.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.2800,-73.8582,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0,32000,62444,fltp,105446,1,vorbis,Vorbis,3374282
4,1192948,[''],[''],1192948/CSA36373.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0,32000,62875,fltp,103631,1,vorbis,Vorbis,3316207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28559,ywcpar,[''],[''],ywcpar/iNat77392.ogg,iNat,0.0,https://static.inaturalist.org/sounds/77392.m4a,7.6921,-80.3379,Amazona ochrocephala,Yellow-crowned Parrot,Yennifer Alfaro,cc-by-nc 4.0,32000,69806,fltp,3392,1,vorbis,Vorbis,108544
28560,ywcpar,[''],[''],ywcpar/iNat78624.ogg,iNat,0.0,https://static.inaturalist.org/sounds/78624.wav,8.9918,-79.4877,Amazona ochrocephala,Yellow-crowned Parrot,Evan Centanni,cc-by-nc-sa 4.0,32000,64741,fltp,7660,1,vorbis,Vorbis,245110
28561,ywcpar,[''],[''],ywcpar/iNat789234.ogg,iNat,0.0,https://static.inaturalist.org/sounds/789234.wav,9.2316,-70.2041,Amazona ochrocephala,Yellow-crowned Parrot,Henrry,cc-by 4.0,32000,65910,fltp,7667,1,vorbis,Vorbis,245333
28562,ywcpar,[''],[''],ywcpar/iNat819873.ogg,iNat,0.0,https://static.inaturalist.org/sounds/819873.mp3,10.5838,-66.8545,Amazona ochrocephala,Yellow-crowned Parrot,Alejandro Luy,cc-by-nc 4.0,32000,58547,fltp,27037,1,vorbis,Vorbis,865176


In [27]:
new_df['sample_rate'].value_counts()

sample_rate
32000    28564
Name: count, dtype: int64

In [28]:
new_df['bit_rate'].value_counts()

bit_rate
62880    12
63689    12
63738    11
61896    11
62588    11
         ..
56662     1
58749     1
55016     1
69805     1
59390     1
Name: count, Length: 12464, dtype: int64

In [29]:
new_df['sample_fmt'].value_counts()

sample_fmt
fltp    28564
Name: count, dtype: int64

In [30]:
new_df['file_codec'].value_counts()

file_codec
vorbis    28564
Name: count, dtype: int64

In [31]:
new_df['file_codec_long_name'].value_counts()

file_codec_long_name
Vorbis    28564
Name: count, dtype: int64

In [32]:
freq_bins = get_bark_spaced_frequencies(100, np.__name__) # Get the Bark spaced frequencies
new_columns = {f'{int(freq)}Hz': 0 for freq in freq_bins} # Create a dictionary with the new column names
new_columns_df = pd.DataFrame(new_columns, index=new_df.index) # Create a dataframe with the new columns
linear_df = pd.concat([new_df, new_columns_df], axis=1) # Concatenate the new columns to the dataframe
linear_df.head() # Display the first few rows of the dataframe

Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,common_name,author,license,sample_rate,bit_rate,sample_fmt,duration,num_channels,file_codec,file_codec_long_name,array_size,20Hz,44Hz,69Hz,94Hz,119Hz,145Hz,170Hz,195Hz,221Hz,246Hz,272Hz,298Hz,324Hz,351Hz,377Hz,404Hz,432Hz,459Hz,487Hz,516Hz,544Hz,574Hz,603Hz,633Hz,664Hz,695Hz,727Hz,760Hz,793Hz,827Hz,861Hz,897Hz,933Hz,971Hz,1009Hz,1048Hz,1089Hz,1130Hz,1173Hz,1217Hz,1263Hz,1310Hz,1359Hz,1410Hz,1463Hz,1517Hz,1574Hz,1633Hz,1694Hz,1758Hz,1825Hz,1895Hz,1968Hz,2045Hz,2125Hz,2209Hz,2297Hz,2390Hz,2487Hz,2590Hz,2697Hz,2811Hz,2930Hz,3055Hz,3187Hz,3326Hz,3471Hz,3623Hz,3783Hz,3950Hz,4124Hz,4306Hz,4496Hz,4693Hz,4897Hz,5110Hz,5330Hz,5559Hz,5797Hz,6044Hz,6302Hz,6571Hz,6852Hz,7148Hz,7460Hz,7791Hz,8144Hz,8522Hz,8930Hz,9374Hz,9862Hz,10403Hz,11010Hz,11700Hz,12495Hz,13428Hz,14548Hz,15925Hz,17676Hz,19999Hz
0,1139490,[''],[''],1139490/CSA36385.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0,32000,62385,fltp,98853,1,vorbis,Vorbis,3163308,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1139490,[''],[''],1139490/CSA36389.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0,32000,62574,fltp,96538,1,vorbis,Vorbis,3089207,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1192948,[''],[''],1192948/CSA36358.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0,32000,61940,fltp,116600,1,vorbis,Vorbis,3731194,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1192948,[''],[''],1192948/CSA36366.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.28,-73.8582,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0,32000,62444,fltp,105446,1,vorbis,Vorbis,3374282,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1192948,[''],[''],1192948/CSA36373.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0,32000,62875,fltp,103631,1,vorbis,Vorbis,3316207,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


We will eventually be splitting each file into a series of smaller files. In order to ensure we are not overfitting to the noise floor of each file, we will be splitting the dataset into a training and testing set before splitting each file into multiples so that the model has never seen the unique noise floors of the files.

In [33]:
train_linear_df, test_linear_df = train_test_split(linear_df, test_size=0.2) # Split the dataframe into training and testing sets
# reindex the dataframes
train_linear_df = train_linear_df.reset_index(drop=True) # Reset the index of the training dataframe
test_linear_df = test_linear_df.reset_index(drop=True) # Reset the index of the testing dataframe
pd.set_option('display.max_columns', None)
train_linear_df.head() # Display the first few rows of the training dataframe


Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,common_name,author,license,sample_rate,bit_rate,sample_fmt,duration,num_channels,file_codec,file_codec_long_name,array_size,20Hz,44Hz,69Hz,94Hz,119Hz,145Hz,170Hz,195Hz,221Hz,246Hz,272Hz,298Hz,324Hz,351Hz,377Hz,404Hz,432Hz,459Hz,487Hz,516Hz,544Hz,574Hz,603Hz,633Hz,664Hz,695Hz,727Hz,760Hz,793Hz,827Hz,861Hz,897Hz,933Hz,971Hz,1009Hz,1048Hz,1089Hz,1130Hz,1173Hz,1217Hz,1263Hz,1310Hz,1359Hz,1410Hz,1463Hz,1517Hz,1574Hz,1633Hz,1694Hz,1758Hz,1825Hz,1895Hz,1968Hz,2045Hz,2125Hz,2209Hz,2297Hz,2390Hz,2487Hz,2590Hz,2697Hz,2811Hz,2930Hz,3055Hz,3187Hz,3326Hz,3471Hz,3623Hz,3783Hz,3950Hz,4124Hz,4306Hz,4496Hz,4693Hz,4897Hz,5110Hz,5330Hz,5559Hz,5797Hz,6044Hz,6302Hz,6571Hz,6852Hz,7148Hz,7460Hz,7791Hz,8144Hz,8522Hz,8930Hz,9374Hz,9862Hz,10403Hz,11010Hz,11700Hz,12495Hz,13428Hz,14548Hz,15925Hz,17676Hz,19999Hz
0,greegr,[''],"['call', 'flight call']",greegr/XC764651.ogg,XC,4.0,https://xeno-canto.org/764651,43.5652,4.5811,Ardea alba,Great Egret,Manceau Lionel,cc-by-nc-sa 4.0,32000,68718,fltp,16056,1,vorbis,Vorbis,513792,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,saffin,[''],['call'],saffin/XC245147.ogg,XC,4.0,https://xeno-canto.org/245147,6.18,-73.42,Sicalis flaveola,Saffron Finch,Oscar Humberto Marín Gómez,cc-by-nc-sa 3.0,32000,63022,fltp,179520,1,vorbis,Vorbis,5744640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,compot1,[''],[''],compot1/iNat1213977.ogg,iNat,0.0,https://static.inaturalist.org/sounds/1213977.wav,-2.9666,-60.7402,Nyctibius griseus,Common Potoo,Unknown,cc-by-nc 4.0,32000,66027,fltp,10000,1,vorbis,Vorbis,320000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,whbman1,[''],['song'],whbman1/XC283827.ogg,XC,5.0,https://xeno-canto.org/283827,-22.4508,-42.7735,Manacus manacus,White-bearded Manakin,Jerome Fischer,cc-by-nc-sa 4.0,32000,59422,fltp,98795,1,vorbis,Vorbis,3161443,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,cregua1,[''],['call'],cregua1/XC56857.ogg,XC,3.0,https://xeno-canto.org/56857,11.012,-74.8801,Penelope purpurascens,Crested Guan,Bernabe Lopez-Lanus,cc-by-nc-sa 3.0,32000,63785,fltp,18051,1,vorbis,Vorbis,577620,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
start = time.time()
total_duration = train_linear_df['duration'].sum() # Calculate the total duration of the training dataframe
processed_duration = 0 # Initialize the processed duration
display = None # Initialize the display variable
new_df = pd.DataFrame(columns=train_linear_df.columns) # Create a new dataframe with the same columns as the training dataframe
if not os.path.exists('exports/train_linear_df.csv'):
    for index, row in train_linear_df.iterrows():
        display = print_progress_bar(processed_duration/3600000, total_duration/3600000, calculate_time_remaining(start, processed_duration, total_duration), display) # Display the progress bar in Hours
        filename = f'data/train_audio/{row['filename']}'
        sound = load_sound_file(filename)
        sounds = split_around_silence(sound)
        short_sounds = []
        for sound in sounds:
            sounds = window_sound(sound, length=5, overlap=2.5)
            for sound in sounds:
                short_sounds.append(sound)
        for sound in short_sounds:
            signal = sound.get_array_of_samples()
            signal = np.array(signal)
            signal = normalize_signal(signal)
            frequencies, freq_signal = fourier_transform(signal, sound.frame_rate)
            frequencies, freq_signal = group_frequencies(frequencies, freq_signal)
            freq_signal = magnitude_to_db(freq_signal)
            freq_signal = normalize_decibels(freq_signal)
            new_row = {
                'filename': row['filename'],
                'primary_label': row['primary_label'],
                'secondary_labels': row['secondary_labels'],
                'type': row['type'],
                'collection': row['collection'],    
                'rating': row['rating'],
                'url': row['url'],
                'latitude': row['latitude'],
                'longitude': row['longitude'],
                'scientific_name': row['scientific_name'],
                'common_name': row['common_name'],
                'author': row['author'],
                'license': row['license']
            }
            for i, freq in enumerate(freq_bins):
                new_row[f'{int(freq)}Hz'] = freq_signal[i]
            new_df = pd.concat([new_df, pd.DataFrame([new_row])], ignore_index=True)
        processed_duration += row['duration']
    new_df.to_csv('exports/train_linear_df.csv', index=False)
    print(f"Processed {processed_duration/3600000} hours of audio in {time.time() - start} seconds")
    with open('exports/train_processing_log.txt', 'w') as file:
        file.write(f"Processed {processed_duration/3600000} hours of audio in {time.time() - start} seconds\n")
else:
    with open('exports/train_processing_log.txt', 'r') as file:
        print(file.read())

  new_df = pd.concat([new_df, pd.DataFrame([new_row])], ignore_index=True)


In [None]:
train_linear_df = pd.read_csv('exports/train_linear_df.csv') # Read the training dataframe from the CSV file
train_linear_df

In [None]:
start = time.time()
total_duration = test_linear_df['duration'].sum() # Calculate the total duration of the training dataframe
processed_duration = 0 # Initialize the processed duration
display = None # Initialize the display variable
new_df = pd.DataFrame(columns=test_linear_df.columns) # Create a new dataframe with the same columns as the training dataframe
if not os.path.exists('exports/test_linear_df.csv'):
    for index, row in test_linear_df.iterrows():
        display = print_progress_bar(processed_duration/3600000, total_duration/3600000, calculate_time_remaining(start, processed_duration, total_duration), display) # Display the progress bar in Hours
        filename = f'data/train_audio/{row['filename']}'
        sound = load_sound_file(filename)
        sounds = split_around_silence(sound)
        short_sounds = []
        for sound in sounds:
            sounds = window_sound(sound, length=5, overlap=2.5)
            for sound in sounds:
                short_sounds.append(sound)
        print(f'Processing {len(short_sounds)} sounds')
        for sound in short_sounds:
            signal = sound.get_array_of_samples()
            signal = np.array(signal)
            signal = normalize_signal(signal)
            frequencies, freq_signal = fourier_transform(signal, sound.frame_rate)
            frequencies, freq_signal = group_frequencies(frequencies, freq_signal)
            freq_signal = magnitude_to_db(freq_signal)
            freq_signal = normalize_decibels(freq_signal)
            new_row = {
                'filename': row['filename'],
                'primary_label': row['primary_label'],
                'secondary_labels': row['secondary_labels'],
                'type': row['type'],
                'collection': row['collection'],    
                'rating': row['rating'],
                'url': row['url'],
                'latitude': row['latitude'],
                'longitude': row['longitude'],
                'scientific_name': row['scientific_name'],
                'common_name': row['common_name'],
                'author': row['author'],
                'license': row['license']
            }
            for i, freq in enumerate(freq_bins):
                new_row[f'{int(freq)}Hz'] = freq_signal[i]
            new_df = pd.concat([new_df, pd.DataFrame([new_row])], ignore_index=True)
        processed_duration += row['duration']
    new_df.to_csv('exports/test_linear_df.csv', index=False)
    print(f"Processed {processed_duration/3600000} hours of audio in {time.time() - start} seconds")
    with open('exports/test_processing_log.txt', 'w') as file:
        file.write(f"Processed {processed_duration/3600000} hours of audio in {time.time() - start} seconds\n")
else:
    with open('exports/test_processing_log.txt', 'r') as file:
        print(file.read())

In [None]:
test_linear_df = pd.read_csv('exports/train_linear_df.csv') # Read the training dataframe from the CSV file
test_linear_df

In [None]:
# Drop all frequency columns above 16kHz
train_linear_df = train_linear_df.drop(columns=[col for col in train_linear_df.columns if col.endswith('Hz') and int(col[:-2]) > 16000])
test_linear_df = test_linear_df.drop(columns=[col for col in test_linear_df.columns if col.endswith('Hz') and int(col[:-2]) > 16000])

# Drop Filename, secondary_labels, type, collection, rating, url, scientific_name, common_name, author, license
train_linear_df = train_linear_df.drop(columns=['filename', 'secondary_labels', 'type', 'collection', 'rating', 'url', 'scientific_name', 'common_name', 'author', 'license'])
test_linear_df = test_linear_df.drop(columns=['filename', 'secondary_labels', 'type', 'collection', 'rating', 'url', 'scientific_name', 'common_name', 'author', 'license'])

In [None]:
# Normalize Longitude and Latitude
train_linear_df['longitude'] = (train_linear_df['longitude'] - train_linear_df['longitude'].min()) / (train_linear_df['longitude'].max() - train_linear_df['longitude'].min())
train_linear_df['latitude'] = (train_linear_df['latitude'] - train_linear_df['latitude'].min()) / (train_linear_df['latitude'].max() - train_linear_df['latitude'].min())
test_linear_df['longitude'] = (test_linear_df['longitude'] - train_linear_df['longitude'].min()) / (train_linear_df['longitude'].max() - train_linear_df['longitude'].min())
test_linear_df['latitude'] = (test_linear_df['latitude'] - train_linear_df['latitude'].min()) / (train_linear_df['latitude'].max() - train_linear_df['latitude'].min())

In [None]:
X_train = train_linear_df.drop(columns=['primary_label']).values
y_train = train_linear_df['primary_label'].values
X_test = test_linear_df.drop(columns=['primary_label']).values
y_test = test_linear_df['primary_label'].values
int8_max = np.iinfo(np.int8).max
X_train = X_train * int8_max
X_test = X_test * int8_max
X_train = X_train.astype(np.int8)
X_test = X_test.astype(np.int8)

In [None]:
# encode the labels
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_train = y_train.astype(np.int8)


Build a basic model with these parameters

In [None]:
class MLP(nn.Module):
    def __init__(self, input_size: int, output_size: int, hidden_sizes: List[int]):
        """
        Initializes the Multi-Layer Perceptron.

        Args:
            input_size (int): The number of input features.
            hidden_size1 (int): The number of neurons in the first hidden layer.
            hidden_size2 (int): The number of neurons in the second hidden layer.
            output_size (int): The number of output units.
        """
        super(MLP, self).__init__()
        self.layers = nn.ModuleList() # Use ModuleList to hold layers

        # Input layer to first hidden layer (if hidden layers exist)
        if not hidden_sizes:
            # If no hidden layers, connect input directly to output
            self.layers.append(nn.Linear(input_size, output_size))
        else:
            # Input to first hidden layer
            self.layers.append(nn.Linear(input_size, hidden_sizes[0]))
            # Add subsequent hidden layers
            for i in range(len(hidden_sizes) - 1):
                self.layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
            # Last hidden layer to output layer
            self.layers.append(nn.Linear(hidden_sizes[-1], output_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Defines the forward pass of the MLP.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The output tensor.
        """
        # Iterate through all layers except the last one
        for layer in self.layers[:-1]:
            x = F.relu(layer(x)) # Apply layer and ReLU activation

        # Apply the last layer (output layer) without ReLU
        x = self.layers[-1](x)
        return x


In [None]:
X_train = torch.tensor(X_train, dtype=torch.int8)
X_test = torch.tensor(X_test, dtype=torch.int8)

y_train = torch.tensor(y_train, dtype=torch.int8)

batch_size = 64

train_dataset = TensorDataset(X_train, y_train)
# For test data, we only need features for prediction
test_dataset = TensorDataset(X_test) # No labels needed here

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size * 2, shuffle=False)

In [None]:
input_dim = X_train.shape[1] # Number of features
hidden_dims = [128, 64]
output_dim = len(label_encoder.classes_) # Number of classes
learning_rate = 0.001

In [None]:
model = MLP(input_size=input_dim, hidden_sizes=hidden_dims, output_size=output_dim)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
num_epochs = 10 # Set number of training epochs
device = torch.device('cpu')
model.to(device) # Move model to device (CPU or GPU)

print("Training the model...")
model.train() # Set model to training mode
start = time.time()
display = None

for epoch in range(num_epochs):
    for i, (features, labels) in enumerate(train_loader):
        features = features.to(device)
        labels = labels.to(device)

        outputs = model(features)
        loss = criterion(outputs, labels)

        optimizer.zero_grad() # Zero the gradients
        loss.backward()
        optimizer.step()

        epoch_loss = loss.item()
    
    avg_epoch_loss = epoch_loss / len(train_loader)
    display = print_progress_bar(epoch, num_epochs, calculate_time_remaining(start, epoch, num_epochs), display, message=f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_epoch_loss:.4f}")

print(f"Training completed in {time.time() - start:.2f} seconds")


In [None]:
model.eval() # Set model to evaluation mode
all_preds = []

with torch.no_grad():
    for i, (inputs,) in enumerate(test_loader):
        inputs = inputs.to(device)
        outputs = model(inputs)

        _, preds = torch.max(outputs.data, 1)

        all_preds.extend(preds.cpu().numpy())
    

predictions = np.array(all_preds)

predictions = label_encoder.inverse_transform(predictions)
predictions = pd.DataFrame(predictions, columns=['primary_label'])

print(classification_report(y_test, predictions, target_names=label_encoder.classes_))
