This Jupyter Notebook is designed to extract unique species names from an existing CSV file containing audio annotations. The annotations include information such as the path to the audio file, recording details, duration, time, frequency ranges, species names, and bounding box coordinates. 

The notebook performs the following tasks:

1. **Data Loading**: It loads the original CSV file containing audio annotations, which includes species names.

2. **Unique Species Extraction**: It extracts unique species names from the loaded CSV file.

3. **Data Transformation**: The unique species names are then stored in a new DataFrame.

4. **Data Saving**: Finally, the DataFrame containing the unique species names is saved to a new CSV file for further analysis or use in other projects.

By executing this notebook, users can efficiently extract and organize unique species names from the audio annotations dataset, facilitating subsequent analysis and data management tasks.

In [6]:
import pandas as pd

In [7]:
# Definir el ROOT_PATH
ROOT_PATH = "../" #"../../../desarrollo/"

# Path al CSV original de las anotaciones de audio
csv_file = ROOT_PATH + "Data/Annotations/audio_annotations.csv"

# Leer el CSV original
df = pd.read_csv(csv_file)

In [3]:
# Get the unique species names
unique_species = df['specie'].unique()

# Create a DataFrame with the unique species names
species_df = pd.DataFrame(unique_species, columns=['Species'])

# Order alphabetically
species_df = species_df.sort_values(by='Species')

# Path to the CSV to save the unique species names
output_csv = "../Data/Annotations/unique_species.csv"

# Save the unique species names to a new CSV
species_df.to_csv(output_csv, index=False)

In [4]:
!cp ../Data/Annotations/unique_species_mapped.csv ../../../desarrollo/Data/Annotations/unique_species_mapped.csv 

'cp' is not recognized as an internal or external command,
operable program or batch file.


In [8]:
# Function to perform species mapping
def map_species(input_file, output_file, species_mapping_file):
    # Get the species mapping DataFrame
    species_mapping_df = pd.read_csv(species_mapping_file)

    if species_mapping_df is None:
        print("Error getting species mapping.")
        return

    # Read the input file
    input_df = pd.read_csv(input_file)

    # Map the species using the mapping DataFrame
    input_df['specie'] = input_df['specie'].map(dict(zip(species_mapping_df['Species'], species_mapping_df['Specie_Name'])))

    # Number of rows of df
    print("Number of annotations: ", len(input_df))

    # Save the output file
    input_df.to_csv(output_file, index=False)

# Usage of the map_species function
input_file = ROOT_PATH + "Data/Annotations/audio_annotations.csv"
output_file = ROOT_PATH + "Data/Annotations/audio_annotations.csv"
species_mapping_file = ROOT_PATH + "Data/Annotations/unique_species_mapped.csv"

map_species(input_file, output_file, species_mapping_file)

Number of annotations:  5778


In [28]:
input_file = ROOT_PATH + "Data/Annotations/audio_annotations.csv"
output_file = ROOT_PATH + "Data/Annotations/audio_annotations.csv"

# Read the input file
df = pd.read_csv(input_file)

# Delete Unknown rows
df = df[df['specie'] != "Unknown"] # Unknown = Bird

# If specie = Quiroptera or Abiotic noise or Insect or No audio, change to No audio and if there are several rows with the same path and all the species of the path are No audio, delete all but one. If there are several rows with the same path and at least one specie is not No audio, delete the No audio rows
# Lista de especies a cambiar a "No audio"
no_audio_species = ["Quiroptera", "Abiotic noise", "Abiotic Noise", "Insect", "No audio"]

# Cambiar las especies a "No audio"
df['specie'] = df['specie'].apply(lambda x: 'No audio' if x in no_audio_species else x)

# Agrupar por 'path' y filtrar
def filter_rows(group):
    if (group['specie'] == 'No audio').all():
        # Si todas las filas en el grupo son "No audio", mantener solo una fila
        return group.iloc[:1]
    else:
        # Si hay al menos una fila que no es "No audio", eliminar las filas "No audio"
        return group[group['specie'] != 'No audio']

# Aplicar el filtrado por grupos de 'path'
df = df.groupby('path', group_keys=False).apply(filter_rows)

# input_df = input_df[input_df['specie'] != "Quiroptera"]
# input_df = input_df[input_df['specie'] != "Abiotic noise"]
# input_df = input_df[input_df['specie'] != "Insect"]

print("Number of annotations: ", len(df))

Number of annotations:  3749


  df = df.groupby('path', group_keys=False).apply(filter_rows)


In [29]:
# Get the species counts
species_counts = df['specie'].value_counts()

# Get the species with less than X samples
X = 10
less_than_10 = species_counts[species_counts < X]

# Map the species with less than 10 samples to "bird"
df['specie'] = df['specie'].apply(lambda x: "Bird" if x in less_than_10 else x)

# unknown is bird
# input_df['specie'] = input_df['specie'].apply(lambda x: "bird" if x == "unknown" else x)

# Save the output file
df.to_csv(output_file, index=False)

# Also save the CSV with name dataset.csv
output_file = ROOT_PATH + "Data/Dataset/CSVs/dataset.csv"
df.to_csv(output_file, index=False)

In [30]:
# Number of rows of df
print("Number of annotations: ", len(df))

Number of annotations:  3749


In [31]:
output_file = ROOT_PATH + "Data/Dataset/CSVs/dataset.csv"

In [35]:
# Take outputfile and print the unique species ordered alphabetically and preceded by an enumeration starting by 0 and :
df = pd.read_csv(output_file)
df['specie'] = df['specie'].astype(str)
# Sort unique species alphabetically
unique_species_sorted = sorted(df['specie'].unique())

# eliminate abiotic noise, unknown and nan
unique_species_sorted = [x for x in unique_species_sorted if x not in ['No audio']]

# Put Bird the first
unique_species_sorted.remove('Bird')
unique_species_sorted.insert(0, 'Bird')

for i, specie in enumerate(unique_species_sorted):
    print(f"{i}: {specie}")

0: Bird
1: Alaudidae
2: Anthus pratensis
3: Athene noctua
4: Calandrella brachydactyla
5: Carduelis carduelis
6: Cettia cetti
7: Chloris chloris
8: Ciconia ciconia
9: Cisticola juncidis
10: Curruca
11: Curruca melanocephala
12: Curruca undata
13: Cyanopica cooki
14: Emberiza calandra
15: Erithacus rubecula
16: Falco tinnunculus
17: Fringilla
18: Galerida Cristata
19: Galerida theklae
20: Hippolais polyglotta
21: Lanius
22: Linaria Cannabina
23: Luscinia megarhynchos
24: Melanocorypha calandra
25: Merops apiaster
26: Milvus migrans
27: Motacilla flava
28: Parus major
29: Passer
30: Pica pica
31: Saxicola rubicola
32: Serinus serinus
33: Streptopelia decaocto
34: Sturnus
35: Sturnus unicolor
36: Sylvia
37: Turdus merula
38: Upupa epops


In [34]:
input_file = ROOT_PATH + "Data/Dataset/CSVs/dataset.csv"

# If specie = No audio -> change bbox to ""
df = pd.read_csv(input_file)
df['bbox'] = df.apply(lambda x: "\"" if x['specie'] == "No audio" else x['bbox'], axis=1)

# Save the output file
df.to_csv(input_file, index=False)