# Initial data analysis

In [None]:
# import necessary libs
import os
%matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
from ydata_profiling import ProfileReport

In [None]:
# Root directory of the local copy of the dataset (Windows-style path).
DATASET_BASE_FILE_PATH = "D:\\Datasets\\birdclef-2023"
# Sub-directory names. Each starts with a backslash so it can be appended
# to DATASET_BASE_FILE_PATH by plain string concatenation.
TRAIN_SET_FILE_DIR = "\\train_audio"
TEST_SET_FILE_DIR = "\\test_soundscapes"

In [None]:
# Show the top-level contents of the dataset directory
dataset_root_entries = os.listdir(DATASET_BASE_FILE_PATH)
print(dataset_root_entries)

In [None]:
# List the entries of the training directory and report how many there are.
# The directory is read once and the result is reused for both outputs.
train_dir_entries = os.listdir(DATASET_BASE_FILE_PATH + TRAIN_SET_FILE_DIR)
print(train_dir_entries)
# The count refers to the *train* directory (the label previously said
# "test"). Each entry is a sub-directory holding the files of one class.
print(f"Number of training classes: {len(train_dir_entries)}")

In [None]:
# Show the contents of the test soundscapes directory
test_dir_path = DATASET_BASE_FILE_PATH + TEST_SET_FILE_DIR
print(os.listdir(test_dir_path))

## Analysis of .csv files in base dir
- sample_submission.csv
- eBird_Taxonomy_v2021.csv
- train_metadata.csv

In [None]:
# Start with the taxonomy table located in the dataset root
taxonomy_csv_path = DATASET_BASE_FILE_PATH + "\\eBird_Taxonomy_v2021.csv"
taxonomy = pd.read_csv(taxonomy_csv_path)

# Column names, followed by the dtype of each column
print(list(taxonomy.columns))
print(list(taxonomy.dtypes))

print(f"Shape of dataframe (rows, columns): {taxonomy.shape}")

# Fraction of missing values in each column (missing count / row count)
missing_taxonomy_counts = taxonomy.isnull().sum(axis=0)
print(missing_taxonomy_counts / taxonomy.shape[0])

# Number of distinct values in each column
print(taxonomy.nunique())

In [None]:
# Peek at the first three rows, then at three randomly drawn rows
taxonomy_first_rows = taxonomy.head(n=3)
print(taxonomy_first_rows)
taxonomy_random_rows = taxonomy.sample(n=3)
print(taxonomy_random_rows)

In [None]:
# Which distinct values occur in the CATEGORY column?
distinct_categories = set(taxonomy['CATEGORY'].values)
print(list(distinct_categories))

In [None]:
# Build the automated profiling report for the taxonomy table and render
# it inline in the notebook
profile_taxonomy = ProfileReport(
    taxonomy,
    title="Pandas Profiling Report - Taxonomy",
)
profile_taxonomy.to_notebook_iframe()

### Results of taxonomy csv
- 9 columns, ca. 17k rows
- Few non-NaN entries in SPECIES_GROUP and REPORT_AS
- SPECIES_CODE could be used to get more information from https://ebird.org/species/SPECIES_CODE
- Everything else is a black box for me currently

#### Open questions
- What exactly is the TAXON_ORDER?
- What are the categories in the CATEGORY column besides species?
- How to use this taxonomy information?
    - To combine / separate data of species living close to each other?

In [None]:
# Next: the metadata table describing the training recordings
train_metadata_csv_path = DATASET_BASE_FILE_PATH + "\\train_metadata.csv"
train_metadata = pd.read_csv(train_metadata_csv_path)

# Column names, followed by the dtype of each column
print(list(train_metadata.columns))
print(list(train_metadata.dtypes))

print(f"Shape of dataframe (rows, columns): {train_metadata.shape}")

# Fraction of missing values in each column (missing count / row count)
missing_train_metadata_counts = train_metadata.isnull().sum(axis=0)
print(missing_train_metadata_counts / train_metadata.shape[0])

# Number of distinct values in each column
print(train_metadata.nunique())

In [None]:
# Peek at the first three rows, then at three randomly drawn rows
train_metadata_first_rows = train_metadata.head(n=3)
print(train_metadata_first_rows)
train_metadata_random_rows = train_metadata.sample(n=3)
print(train_metadata_random_rows)

In [None]:
# Build the automated profiling report for the train metadata table and
# render it inline in the notebook
profile_train_metadata = ProfileReport(
    train_metadata,
    title="Pandas Profiling Report - Train metadata",
)
profile_train_metadata.to_notebook_iframe()

### Intermediate results
- 16941 rows, 12 columns
- Few NaN entries across all columns

#### Open questions
- How to use the extra information?
    - As far as I understood it we do NOT have such metadata in the inference case
- Scientific name the same as in the taxonomy file? --> We could join on this column

## Analysis of train set files

In [None]:
# Count the files in every sub-directory of the training directory.
train_dir_path = DATASET_BASE_FILE_PATH + TRAIN_SET_FILE_DIR
list_of_dirs_in_train_dir = os.listdir(train_dir_path)

# One count per entry, in the same order as list_of_dirs_in_train_dir.
number_of_files_for_single_sample = [
    len(os.listdir(os.path.join(train_dir_path, single_dir)))
    for single_dir in list_of_dirs_in_train_dir
]

print(number_of_files_for_single_sample)
if number_of_files_for_single_sample:
    # Each label is paired with the statistic it names (max/min were
    # previously swapped). Adjacent f-string literals are used instead of a
    # backslash continuation so that source indentation does not end up in
    # the printed text.
    mean_files = (
        sum(number_of_files_for_single_sample)
        / len(number_of_files_for_single_sample)
    )
    print(
        f"Max. samples: {max(number_of_files_for_single_sample)}, "
        f"Min. samples: {min(number_of_files_for_single_sample)}, "
        f"Mean samples: {mean_files}"
    )
else:
    # min()/max() and the mean are undefined for an empty list.
    print("No sub-directories found in the training directory.")

In [None]:
# Bar chart of the file count per training directory, to get a feeling
# for how balanced the training set is
plt.bar(
    x=list_of_dirs_in_train_dir,
    height=number_of_files_for_single_sample,
)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Sort the directories by file count (descending) and draw the counts
# together with their cumulative share of all files in one figure
count_name_pairs = sorted(
    zip(number_of_files_for_single_sample, list_of_dirs_in_train_dir),
    reverse=True,
)
# Unzip the sorted (count, name) pairs back into two parallel lists
number_of_files_for_single_sample_sorted, list_of_dirs_in_train_dir_sorted = (
    list(column) for column in zip(*count_name_pairs)
)

total_file_sum = sum(number_of_files_for_single_sample_sorted)
# Running total divided by the grand total gives the cumulative fraction
cum_sum_samples = np.cumsum(number_of_files_for_single_sample_sorted) / total_file_sum

line_color = 'red'
bar_color = 'blue'

# Left y-axis: cumulative fraction drawn as a line
fig, ax1 = plt.subplots()
ax1.set_xlabel('Train samples')
ax1.set_ylabel('Cumulative sum', color=line_color)
ax1.plot(list_of_dirs_in_train_dir_sorted, cum_sum_samples, color=line_color)
ax1.tick_params(axis='y', labelcolor=line_color)

# Right y-axis (shares the x-axis): absolute file counts drawn as bars
ax2 = ax1.twinx()
ax2.set_ylabel('Samples', color=bar_color)
ax2.bar(
    list_of_dirs_in_train_dir_sorted,
    number_of_files_for_single_sample_sorted,
    color=bar_color,
)
ax2.tick_params(axis='y', labelcolor=bar_color)

plt.show()

### Intermediate results

- Training dataset is skewed
    - Long tail of training classes with fewer than 10 examples; a few training classes even have only <b>one</b> sample
    - Maximum samples of training classes is 500
- <b>Need to account for skewness</b>
    - Important for training/validation split --> stratification needed if data is used as is
    - Downsampling might not be a good idea as we throw away up to 499 samples of some classes
    - How do we upsample?