# Initial data analysis

In [1]:
# import necessary libs
import os
%matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

Using matplotlib backend: Qt5Agg


In [2]:
# Root directory of the local dataset copy (Windows-style absolute path).
DATASET_BASE_FILE_PATH = r"D:\Datasets\birdclef-2023"
# The sub-directory constants keep a leading backslash because later cells
# append them to the base path with plain string concatenation.
TRAIN_SET_FILE_DIR = r"\train_audio"
TEST_SET_FILE_DIR = r"\test_soundscapes"

In [7]:
# Show what the dataset root directory contains.
dataset_root_entries = os.listdir(DATASET_BASE_FILE_PATH)
print(dataset_root_entries)

['eBird_Taxonomy_v2021.csv', 'sample_submission.csv', 'test_soundscapes', 'train_audio', 'train_metadata.csv']


In [10]:
# List the sub-directories of the training set (one directory per class) and
# report how many there are. The directory is read once and reused for both
# print statements.
train_class_dirs = os.listdir(DATASET_BASE_FILE_PATH + TRAIN_SET_FILE_DIR)
print(train_class_dirs)
# These are class directories of the *training* set, not test samples.
print(f"Number of training classes: {len(train_class_dirs)}")

['abethr1', 'abhori1', 'abythr1', 'afbfly1', 'afdfly1', 'afecuc1', 'affeag1', 'afgfly1', 'afghor1', 'afmdov1', 'afpfly1', 'afpkin1', 'afpwag1', 'afrgos1', 'afrgrp1', 'afrjac1', 'afrthr1', 'amesun2', 'augbuz1', 'bagwea1', 'barswa', 'bawhor2', 'bawman1', 'bcbeat1', 'beasun2', 'bkctch1', 'bkfruw1', 'blacra1', 'blacuc1', 'blakit1', 'blaplo1', 'blbpuf2', 'blcapa2', 'blfbus1', 'blhgon1', 'blhher1', 'blksaw1', 'blnmou1', 'blnwea1', 'bltapa1', 'bltbar1', 'bltori1', 'blwlap1', 'brcale1', 'brcsta1', 'brctch1', 'brcwea1', 'brican1', 'brobab1', 'broman1', 'brosun1', 'brrwhe3', 'brtcha1', 'brubru1', 'brwwar1', 'bswdov1', 'btweye2', 'bubwar2', 'butapa1', 'cabgre1', 'carcha1', 'carwoo1', 'categr', 'ccbeat1', 'chespa1', 'chewea1', 'chibat1', 'chtapa3', 'chucis1', 'cibwar1', 'cohmar1', 'colsun2', 'combul2', 'combuz1', 'comsan', 'crefra2', 'crheag1', 'crohor1', 'darbar1', 'darter3', 'didcuc1', 'dotbar1', 'dutdov1', 'easmog1', 'eaywag1', 'edcsun3', 'egygoo', 'equaka1', 'eswdov1', 'eubeat1', 'fatrav1', 'f

In [6]:
# Show the entries of the test soundscapes directory.
test_set_dir_path = DATASET_BASE_FILE_PATH + TEST_SET_FILE_DIR
test_set_entries = os.listdir(test_set_dir_path)
print(test_set_entries)

['soundscape_29201.ogg']

## Analysis of .csv files in base dir
- sample_submission.csv
- eBird_Taxonomy_v2021.csv
- train_metadata.csv

In [12]:
# Let's start with the taxonomy csv file.
# os.path.join is used instead of a hard-coded "\\" so the path separator is
# not tied to Windows.
taxonomy = pd.read_csv(os.path.join(DATASET_BASE_FILE_PATH, "eBird_Taxonomy_v2021.csv"))

# Column names and their dtypes
print(list(taxonomy.columns))
print(list(taxonomy.dtypes))

print(f"Shape of dataframe (rows, columns): {taxonomy.shape}")

# Fraction (0..1, not percent) of NaNs in each column
print(taxonomy.isnull().mean())

# Number of unique values per column
print(taxonomy.nunique())

['TAXON_ORDER', 'CATEGORY', 'SPECIES_CODE', 'PRIMARY_COM_NAME', 'SCI_NAME', 'ORDER1', 'FAMILY', 'SPECIES_GROUP', 'REPORT_AS']
[dtype('int64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O')]
Shape of dataframe (rows, columns): (16753, 9)
TAXON_ORDER         0.000000
CATEGORY            0.000000
SPECIES_CODE        0.000000
PRIMARY_COM_NAME    0.000000
SCI_NAME            0.000000
ORDER1              0.000119
FAMILY              0.000776
SPECIES_GROUP       0.987107
REPORT_AS           0.768638
dtype: float64
TAXON_ORDER         16753
CATEGORY                8
SPECIES_CODE        16753
PRIMARY_COM_NAME    16753
SCI_NAME            16753
ORDER1                 41
FAMILY                249
SPECIES_GROUP         216
REPORT_AS            1400
dtype: int64


In [9]:
# Peek at the first rows plus a few random rows. The fixed random_state keeps
# the random sample identical across kernel restarts, so the saved output
# stays reproducible.
print(taxonomy.head(3))
print(taxonomy.sample(3, random_state=42))

   TAXON_ORDER CATEGORY SPECIES_CODE       PRIMARY_COM_NAME  \
0            1  species      ostric2         Common Ostrich   
1            6  species      ostric3         Somali Ostrich   
2            7    slash       y00934  Common/Somali Ostrich   

                         SCI_NAME            ORDER1  \
0                Struthio camelus  Struthioniformes   
1          Struthio molybdophanes  Struthioniformes   
2  Struthio camelus/molybdophanes  Struthioniformes   

                      FAMILY SPECIES_GROUP REPORT_AS  
0  Struthionidae (Ostriches)     Ostriches       NaN  
1  Struthionidae (Ostriches)           NaN       NaN  
2  Struthionidae (Ostriches)           NaN       NaN  
       TAXON_ORDER CATEGORY SPECIES_CODE                     PRIMARY_COM_NAME  \
12036        24431     issf      marwhi1           Lesser Whitethroat (Gansu)   
8744         16668     issf      bncfly5  Brown-crested Flycatcher (Cooper's)   
7684         14502  species       swfgle         Slaty-winged F

In [15]:
# Closer look into the CATEGORY column: print its distinct values.
# Sorting gives a stable order; list(set(...)) can change order between runs
# because string hashing is randomized per process.
print(sorted(taxonomy['CATEGORY'].dropna().unique()))

['form', 'issf', 'spuh', 'species', 'hybrid', 'slash', 'intergrade', 'domestic']


### Results of taxonomy csv
- 9 columns, ca. 17k rows
- Few non-NaN entries in SPECIES_GROUP and REPORT_AS
- SPECIES_CODE could be used to get more information from https://ebird.org/species/SPECIES_CODE
- Everything else is a black box for me currently

#### Open questions
- What exactly is the TAXON_ORDER?
- What are the categories in the CATEGORY column besides species?
- How to use this taxonomy information?
    - To combine / separate data of species living close to each other?

In [16]:
# Analysis of the train metadata csv.
# os.path.join is used instead of a hard-coded "\\" so the path separator is
# not tied to Windows.
train_metadata = pd.read_csv(os.path.join(DATASET_BASE_FILE_PATH, "train_metadata.csv"))

# Column names and their dtypes
print(list(train_metadata.columns))
print(list(train_metadata.dtypes))

print(f"Shape of dataframe (rows, columns): {train_metadata.shape}")

# Fraction (0..1, not percent) of NaNs in each column
print(train_metadata.isnull().mean())

# Number of unique values per column
print(train_metadata.nunique())

['primary_label', 'secondary_labels', 'type', 'latitude', 'longitude', 'scientific_name', 'common_name', 'author', 'license', 'rating', 'url', 'filename']
[dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('float64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('O'), dtype('O')]
Shape of dataframe (rows, columns): (16941, 12)
primary_label       0.000000
secondary_labels    0.000000
type                0.000000
latitude            0.013399
longitude           0.013399
scientific_name     0.000000
common_name         0.000000
author              0.000000
license             0.000000
rating              0.000000
url                 0.000000
filename            0.000000
dtype: float64
primary_label         264
secondary_labels      751
type                  796
latitude             6252
longitude            6301
scientific_name       264
common_name           264
author               1082
license                 4
rating                 11
url        

In [17]:
# Peek at the first rows plus a few random rows. The fixed random_state keeps
# the random sample identical across kernel restarts, so the saved output
# stays reproducible.
print(train_metadata.head(3))
print(train_metadata.sample(3, random_state=42))

  primary_label secondary_labels      type  latitude  longitude  \
0       abethr1               []  ['song']    4.3906    38.2788   
1       abethr1               []  ['call']   -2.9524    38.2921   
2       abethr1               []  ['song']   -2.9524    38.2921   

      scientific_name               common_name         author  \
0  Turdus tephronotus  African Bare-eyed Thrush  Rolf A. de By   
1  Turdus tephronotus  African Bare-eyed Thrush  James Bradley   
2  Turdus tephronotus  African Bare-eyed Thrush  James Bradley   

                                             license  rating  \
0  Creative Commons Attribution-NonCommercial-Sha...     4.0   
1  Creative Commons Attribution-NonCommercial-Sha...     3.5   
2  Creative Commons Attribution-NonCommercial-Sha...     3.5   

                                 url              filename  
0  https://www.xeno-canto.org/128013  abethr1/XC128013.ogg  
1  https://www.xeno-canto.org/363501  abethr1/XC363501.ogg  
2  https://www.xeno-canto.

### Intermediate results
- 16941 rows, 12 columns
- Few NaN entries across all columns

#### Open questions
- How to use the extra information?
    - As far as I understood it we do NOT have such metadata in the inference case
- Scientific name the same as in the taxonomy file? --> We could join on this column

## Analysis of train set files

In [25]:
# Count the files in every class directory of the training set.
# list_of_dirs_in_train_dir and number_of_files_for_single_sample are
# index-aligned: entry i of the counts belongs to directory i.
train_dir_path = DATASET_BASE_FILE_PATH + TRAIN_SET_FILE_DIR
list_of_dirs_in_train_dir = os.listdir(train_dir_path)
number_of_files_for_single_sample = []

for single_dir in list_of_dirs_in_train_dir:
    number_of_files = len(os.listdir(os.path.join(train_dir_path, single_dir)))
    number_of_files_for_single_sample.append(number_of_files)

print(number_of_files_for_single_sample)

# Summary statistics. The labels now match the values (the original printed
# min() under "Max." and max() under "Min."). Adjacent f-strings are used
# instead of backslash continuations so no indentation leaks into the output.
mean_number_of_files = sum(number_of_files_for_single_sample) / len(number_of_files_for_single_sample)
print(
    f"Min. samples: {min(number_of_files_for_single_sample)}, "
    f"Max. samples: {max(number_of_files_for_single_sample)}, "
    f"Mean samples: {mean_number_of_files}"
)

[15, 126, 28, 18, 31, 90, 48, 8, 72, 37, 104, 1, 81, 57, 25, 30, 45, 43, 12, 24, 500, 47, 7, 81, 34, 109, 28, 60, 76, 262, 50, 166, 22, 38, 23, 16, 13, 26, 17, 11, 7, 20, 15, 10, 3, 62, 2, 29, 22, 38, 30, 8, 1, 81, 40, 27, 67, 9, 30, 34, 153, 43, 166, 13, 6, 8, 79, 27, 29, 113, 425, 181, 293, 477, 500, 1, 36, 49, 32, 7, 79, 3, 8, 15, 500, 21, 152, 7, 63, 437, 25, 5, 15, 137, 34, 136, 45, 239, 19, 81, 3, 5, 2, 94, 12, 138, 252, 103, 26, 28, 20, 10, 51, 72, 53, 24, 46, 9, 129, 30, 18, 59, 8, 436, 16, 5, 7, 34, 56, 9, 109, 68, 14, 40, 29, 378, 72, 18, 15, 1, 3, 19, 16, 6, 15, 6, 40, 8, 32, 36, 30, 24, 16, 20, 21, 20, 20, 22, 7, 14, 4, 91, 121, 52, 10, 14, 44, 172, 17, 281, 47, 28, 21, 56, 122, 70, 12, 42, 59, 9, 2, 5, 25, 227, 28, 116, 22, 10, 33, 6, 6, 78, 30, 25, 3, 45, 68, 23, 51, 33, 21, 199, 22, 21, 42, 41, 37, 32, 13, 12, 94, 59, 48, 119, 33, 4, 90, 37, 6, 161, 97, 500, 98, 105, 19, 88, 18, 34, 27, 132, 78, 48, 10, 73, 7, 23, 19, 34, 9, 1, 17, 1, 14, 8, 5, 500, 67, 486, 28, 106, 34,

In [24]:
# Plot the number of files per training class to get a feeling for the
# balance of the training set. An explicit figure/axes pair is used so the
# plot does not depend on whichever figure happens to be current.
fig, ax = plt.subplots()
ax.bar(list_of_dirs_in_train_dir, number_of_files_for_single_sample)
ax.set_xlabel("Training class")
ax.set_ylabel("Number of files")
ax.set_title("Files per training class")
# Rotate the class names so neighbouring tick labels overlap less.
ax.tick_params(axis="x", labelrotation=45)
plt.show()

In [38]:
# Plot the per-class file counts (sorted descending) together with the
# cumulative fraction of all files in one figure.

# Sort (count, class) pairs descending by count; ties are broken by class
# name, also descending. Then unzip back into two index-aligned lists.
number_of_files_for_single_sample_sorted, list_of_dirs_in_train_dir_sorted = map(
    list,
    zip(*sorted(zip(number_of_files_for_single_sample, list_of_dirs_in_train_dir), reverse=True)),
)

# Normalise the running total by the overall file count so the curve ends at 1.0.
total_file_sum = sum(number_of_files_for_single_sample_sorted)
cum_sum_samples = np.cumsum(number_of_files_for_single_sample_sorted) / total_file_sum

# Left y-axis (red): cumulative fraction of files.
fig, ax1 = plt.subplots()
ax1.set_xlabel('Training classes (sorted by number of files)')
ax1.set_ylabel('Cumulative fraction of files', color='red')
ax1.plot(list_of_dirs_in_train_dir_sorted, cum_sum_samples, color='red')
ax1.tick_params(axis='y', labelcolor='red')

# Right y-axis (blue): absolute number of files per class.
ax2 = ax1.twinx()
ax2.set_ylabel('Number of files', color='blue')
ax2.bar(list_of_dirs_in_train_dir_sorted, number_of_files_for_single_sample_sorted, color='blue')
ax2.tick_params(axis='y', labelcolor='blue')

# The twin axes are drawn after ax1, so the bars would cover the red curve.
# Raise ax1 above ax2 and make its background transparent so both stay visible.
ax1.set_zorder(ax2.get_zorder() + 1)
ax1.patch.set_visible(False)

plt.show()

### Intermediate results

- Training dataset is skewed
    - Long tail of training classes with fewer than 10 examples. There are even a few training classes with only <b>one</b> sample
    - Maximum samples of training classes is 500
- <b>Need to account for skewness</b>
    - Important for training/validation split --> stratification needed if data is used as is
    - Downsampling might not be a good idea as we throw away up to 499 samples of some classes
    - How do we upsample?