# Data Preproccesing for CNN Model

## Data Downloader from iNaturlist

In [None]:
import pandas as pd
import requests
import os
from urllib.parse import urlparse
from tqdm import tqdm  # Import tqdm for the progress bar

# Read the CSV file
df = pd.read_csv('../Datasets/Inat/observations-all-recordings.csv/observations-598206.csv')

# Create a directory to save the sound files
os.makedirs('frog_sounds', exist_ok=True)

# Helper function to generate a unique filename
def get_unique_filename(directory, base_name, extension):
    full_path = os.path.join(directory, base_name + extension)
    counter = 1
    while os.path.exists(full_path):
        full_path = os.path.join(directory, f"{base_name}_{counter}{extension}")
        counter += 1
    return full_path

# Create a progress bar for the total number of downloads
with tqdm(total=len(df), desc="Downloading frog sounds") as pbar:
    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        sound_url = row['sound_url']
        common_name = row['common_name']
        
        if pd.notna(sound_url) and pd.notna(common_name):
            try:
                # Extract the file extension from the URL
                parsed_url = urlparse(sound_url)
                filename = os.path.basename(parsed_url.path)
                extension = os.path.splitext(filename)[1]
                
                # Clean the common name to make it a valid filename
                clean_name = "".join(c if c.isalnum() or c in (' ', '-') else '_' for c in common_name).strip()
                clean_name = clean_name.replace(' ', '_')  # Optional: replace spaces with underscores
                
                # Generate a unique filename
                unique_filename = get_unique_filename('frog_sounds', clean_name, extension)
                
                # Download the file
                response = requests.get(sound_url, stream=True)
                if response.status_code == 200:
                    with open(unique_filename, 'wb') as f:
                        for chunk in response.iter_content(1024):
                            f.write(chunk)
                    # print(f"Downloaded: {unique_filename}")
                else:
                    print(f"Failed to download: {sound_url}")
            except Exception as e:
                print(f"Error processing {sound_url}: {e}")

            # Update the progress bar after each download
            pbar.update(1)

print("Download complete.")

## Convert all files into .WAV


In [7]:
import os
import subprocess
from pathlib import Path
from tqdm import tqdm

def convert_to_spectrogram_ready(input_dir, output_dir, sample_rate=22050, mono=True):
    
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    audio_files = []
    extensions = ('.wav', '.mp3', '.mp4', '.m4a', '.aac', '.flac', '.ogg', '.mpga')
    
    for root, _, files in os.walk(input_dir):
        for file in files:
            # if file.lower().endswith(extensions):
                audio_files.append(Path(root) / file)
    
    print(f"Found {len(audio_files)} audio files to process")
    
    for input_file in tqdm(audio_files, desc="Converting for spectrograms"):
        output_file = Path(output_dir) / f"{input_file.stem}.wav"
        
        # Check if file exists and modify name if needed
        counter = 1
        while output_file.exists():
            output_file = Path(output_dir) / f"{input_file.stem}_{counter}.wav"
            counter += 1
        
        cmd = [
            'ffmpeg',
            '-i', str(input_file),
            '-ac', '1' if mono else '2',  # mono recommended for spectrograms
            '-ar', str(sample_rate),  # 22050 is standard for many ML models
            '-sample_fmt', 's16',  # 16-bit PCM
            '-acodec', 'pcm_s16le',  # Standard WAV format
            '-y',  # Still overwrites if same name, but we prevent that with the loop
            str(output_file)
        ]
        
        try:
            subprocess.run(cmd, check=True, 
                         stdout=subprocess.DEVNULL, 
                         stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            print(f"\nError processing {input_file}: {e.stderr.decode('utf-8')}")
    
    print(f"\nConversion complete. Files ready for spectrogram generation in {output_dir}")

if __name__ == "__main__":
    convert_to_spectrogram_ready(
        input_dir="../Datasets/Raw-audio/none-frog-audio/",
        output_dir="../Datasets/Raw-audio/none-frog-audio/"
    )

Found 4 audio files to process


Converting for spectrograms: 100%|██████████| 4/4 [00:03<00:00,  1.29it/s]


Conversion complete. Files ready for spectrogram generation in ../Datasets/Raw-audio/none-frog-audio/





## Rename files for only ID 4 most common frogs

In [None]:
import os
from pathlib import Path
from collections import defaultdict

def rename_frog_files(directory):
    """
    Renames frog audio files according to specific rules:
    - Keeps original names for Mountain_Rain_Frog, Southern_Dainty_Frog, Clicking_Stream_Frog
    - Renames others to other_frog_X.wav with incrementing X
    """
    # Files to keep original names
    KEEP_NAMES = {
        "mountain_rain_frog",
        "southern_dainty_frog",
        "clicking_stream_frog"
    }
    
    # Initialize counter for other_frog files
    other_counter = 1
    # Dictionary to track original name to new name mappings (for avoiding duplicates)
    name_mapping = defaultdict(str)
    
    # Get all .wav files in directory
    for filepath in Path(directory).glob("*.wav"):
        original_name = filepath.stem.lower()  # Get filename without extension
        
        # Check if we should keep original name
        should_keep = any(
            keep_name in original_name 
            for keep_name in KEEP_NAMES
        )
        
        if should_keep:
            continue  # Skip renaming
        
        # Generate new name
        new_name = f"other_frog_{other_counter}.wav"
        new_path = filepath.with_name(new_name)
        
        # Handle potential name conflicts
        while new_path.exists():
            other_counter += 1
            new_name = f"other_frog_{other_counter}.wav"
            new_path = filepath.with_name(new_name)
        
        # Rename the file
        filepath.rename(new_path)
        print(f"Renamed: {filepath.name} -> {new_name}")
        other_counter += 1

if __name__ == "__main__":
    directory_path = "./frog_sounds_wav/"  # Change to your directory
    rename_frog_files(directory_path)
    print("File renaming complete!")

## Get file Info

In [9]:
import os
from pathlib import Path
import wave
import contextlib
from collections import defaultdict

def analyze_short_frog_files(directory, max_duration=4):
    """
    Finds all WAV files shorter than specified duration and counts by frog type.
    
    Args:
        directory (str): Path to directory containing WAV files
        max_duration (float): Maximum duration in seconds to consider as "short"
        
    Returns:
        tuple: (list of short files with durations, frog type counts, total count)
    """
    short_files = []
    frog_counts = defaultdict(int)
    special_frogs = {
        'mountain_rain_frog',
        'southern_dainty_frog',
        'clicking_stream_frog'
    }

    for filepath in Path(directory).glob('*.wav'):
        try:
            with contextlib.closing(wave.open(str(filepath), 'r')) as wav_file:
                frames = wav_file.getnframes()
                rate = wav_file.getframerate()
                duration = frames / float(rate)
                
                if duration < max_duration:
                    filename = filepath.stem.lower()
                    
                    # Determine frog type
                    frog_type = "other"
                    for special in special_frogs:
                        if special in filename:
                            frog_type = special.replace('_', ' ').title()
                            break
                    
                    short_files.append((filepath.name, duration, frog_type))
                    frog_counts[frog_type] += 1
                    
        except (wave.Error, EOFError) as e:
            print(f"Error processing {filepath.name}: {str(e)}")
            continue
    
    return short_files, frog_counts, len(short_files)

if __name__ == "__main__":
    folder_path = "./frog_sounds_wav/"  # Replace with your folder path
    try:
        short_files, frog_counts, total_count = analyze_short_frog_files(folder_path)
        
        if not short_files:
            print("No files shorter than 5 seconds found.")
        else:
            print("Files shorter than 5 seconds:")
            print("-" * 65)
            print(f"{'Filename':<40} {'Duration':<10} {'Frog Type':<15}")
            print("-" * 65)
            for filename, duration, frog_type in short_files:
                print(f"{filename:<40} {duration:.2f}s{'':<5} {frog_type:<15}")
            
            print("-" * 65)
            print("\nFrog Type Counts:")
            for frog_type, count in sorted(frog_counts.items(), key=lambda x: x[1], reverse=True):
                print(f"{frog_type:<20}: {count}")
            
            print("-" * 65)
            print(f"Total files shorter than 5 seconds: {total_count}")
            
    except Exception as e:
        print(f"Error: {str(e)}")

Files shorter than 5 seconds:
-----------------------------------------------------------------
Filename                                 Duration   Frog Type      
-----------------------------------------------------------------
other_frog_1818.wav                      3.01s      other          
other_frog_3281.wav                      2.40s      other          
other_frog_2736.wav                      2.53s      other          
Clicking_Stream_Frog_191.wav             3.76s      Clicking Stream Frog
other_frog_269.wav                       3.16s      other          
other_frog_111.wav                       2.03s      other          
other_frog_661.wav                       3.37s      other          
other_frog_962.wav                       3.99s      other          
other_frog_1705.wav                      1.08s      other          
other_frog_523.wav                       1.19s      other          
other_frog_3282.wav                      1.21s      other          
other_frog_2594.w

In [11]:
import os
import csv
from collections import defaultdict
from pathlib import Path

def analyze_frog_recordings(directory_path, min_threshold=100000):
    """
    Analyze frog recordings to find species with few recordings.
    
    Args:
        directory_path: Path to directory containing audio files
        min_threshold: Minimum number of recordings to not be considered "few"
    
    Returns:
        dict: {frog_species: count} for species with fewer than min_threshold recordings
    """
    # Initialize dictionary to store counts
    frog_counts = defaultdict(int)
    
    # Supported audio file extensions
    audio_extensions = {'.wav'}
    
    # Count recordings per frog species
    for file in Path(directory_path).iterdir():
        if file.suffix.lower() in audio_extensions:
            # Extract frog species name (handling formats like "common_name_number" or "common_name_number_primary")
            frog_species = file.stem.split('_', 1)[0]
            frog_counts[frog_species] += 1
            
            # # Case 1: common_name_number
            # if len(parts) >= 2 and parts[-1].isdigit():
            #     frog_species = '_'.join(parts[:-1])
            #     frog_counts[frog_species] += 1
            # # Case 2: common_name_number_primary/secondary
            # elif len(parts) == 3 and parts[1].isdigit() and parts[2] in ['primary', 'secondary']:
            #     frog_species = parts[0]
            #     frog_counts[frog_species] += 1

            # print(frog_species)
    
    # Filter for species with few recordings
    few_recordings = {
        species: count 
        for species, count in frog_counts.items() 
        # if count < min_threshold
    }
    
    return few_recordings

def print_frog_report(few_recordings, output_csv=None):
    """Print a formatted report of frogs with few recordings and optionally save to CSV."""
    if not few_recordings:
        print("All frog species have sufficient recordings (10 or more each).")
        if output_csv:
            with open(output_csv, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(["Species", "Recording Count"])
                writer.writerow(["All species have sufficient recordings", ""])
        return
    
    # Print console report
    print("Frog species with fewer than 10 recordings:")
    print("-" * 45)
    for species, count in sorted(few_recordings.items(), key=lambda item: (item[1], item[0])):
        print(f"{species.ljust(30)}: {count} recording{'s' if count != 1 else ''}")
    print("-" * 45)
    print(f"Total under-represented species: {len(few_recordings)}")
    
    # Save to CSV if requested
    if output_csv:
        with open(output_csv, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["Species", "Recording Count"])
            for species, count in sorted(few_recordings.items(), key=lambda item: (item[1], item[0])):
                writer.writerow([species, count])


# Example usage
if __name__ == "__main__":
    directory_path = "../Datasets/clipped-4s/"  # Change this to your directory
    # output_csv = "frog_recording_report.csv"  # Output CSV filename
    few_recordings = analyze_frog_recordings(directory_path, min_threshold=10)
    print_frog_report(few_recordings)

Frog species with fewer than 10 recordings:
---------------------------------------------
none                          : 876 recordings
Clicking                      : 939 recordings
Southern                      : 1583 recordings
Mountain                      : 2687 recordings
other                         : 6878 recordings
---------------------------------------------
Total under-represented species: 5


## Clip files to 4 seconds

In [None]:
import os
import numpy as np
from pathlib import Path
from scipy.io import wavfile

def process_audio_files(input_dir, output_dir="../Datasets/clipped_4sec/"):
    """
    Process mono audio files:
    - For files >4 seconds: create 4-second clip centered at max amplitude
    - For files <=4 seconds: copy as-is to output directory
    - All files saved to output directory
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    
    output_path.mkdir(exist_ok=True)
    
    processed_count = 0
    clipped_count = 0
    copied_count = 0
    
    for filepath in input_path.glob('*.wav'):
        try:
            # Read WAV file (mono)
            sample_rate, data = wavfile.read(filepath)
            
            # Ensure mono audio
            if len(data.shape) > 1:
                print(f"Skipping {filepath.name} - stereo audio not supported")
                continue
            
            duration = len(data) / sample_rate
            output_file = output_path / filepath.name
            
            if duration <= 4.0:
                # For short files, just copy to output directory
                wavfile.write(output_file, sample_rate, data)
                copied_count += 1
                print(f"Copied: {filepath.name} ({duration:.2f}s)")
            else:
                # Find point of max amplitude
                abs_data = np.abs(data)
                
                # Find all peaks sorted by amplitude
                
                sorted_indices = np.argsort(abs_data)
                max_idx = sorted_indices[-1]
                
                # Calculate 4-second exclusion zone around max peak
                
                half_window = int(2 * sample_rate)
                exclusion_start = max(0, max_idx - half_window)
                exclusion_end = min(len(data), max_idx + half_window)
                
                # Create mask for valid regions (outside exclusion zone)
                
                valid_mask = np.ones(len(data), dtype=bool)
                valid_mask[exclusion_start:exclusion_end] = False
                
                # Find secondary peak outside exclusion zone
                
                if np.any(valid_mask):
                    
                    # Create modified data where excluded region is set to minimum value
                    
                    modified_data = abs_data.copy()
                    modified_data[~valid_mask] = 0
                    second_max_idx = np.argmax(modified_data)
    
                    # Only accept if secondary peak is at least 10% as loud as primary
                    
                    if abs_data[second_max_idx] < 0.1 * abs_data[max_idx]:
                        second_max_idx = None
                
                else:
                    second_max_idx = None

                # Create 4-second clip around primary peak
                start = max(0, max_idx - half_window)
                end = min(len(data), max_idx + half_window)

                # Adjust if window exceeds file length
                
                if end - start < 4 * sample_rate:
                    if start == 0:
                        end = min(len(data), 4 * sample_rate)
                    else:
                        start = max(0, len(data) - 4 * sample_rate)

                clipped_data = data[start:end]

                # Optional: Create secondary clip if valid secondary peak exists
                
                if second_max_idx is not None:
                    sec_start = max(0, second_max_idx - half_window)
                    sec_end = min(len(data), second_max_idx + half_window)
    
                    # Adjust secondary clip window if needed
                    
                    if sec_end - sec_start < 4 * sample_rate:
                        if sec_start == 0:
                            sec_end = min(len(data), 4 * sample_rate)
                        else:
                            sec_start = max(0, len(data) - 4 * sample_rate)
    
                    secondary_clip = data[sec_start:sec_end]
                else:
                    secondary_clip = None
                
                # Write primary clip
                primary_clip = data[max(0, max_idx - 2*sample_rate):min(len(data), max_idx + 2*sample_rate)]
                if len(primary_clip) < 4 * sample_rate:
                    pad_before = (4 * sample_rate - len(primary_clip)) // 2
                    pad_after = (4 * sample_rate - len(primary_clip)) - pad_before
                    primary_clip = np.pad(primary_clip, (pad_before, pad_after), mode='constant')
                wavfile.write(f"{output_file.stem}_primary.wav", sample_rate, primary_clip)

                # Write secondary clip if valid
                if second_max_idx is not None:
                    secondary_clip = data[max(0, second_max_idx - 2*sample_rate):min(len(data), second_max_idx + 2*sample_rate)]
                    if len(secondary_clip) < 4 * sample_rate:
                        pad_before = (4 * sample_rate - len(secondary_clip)) // 2
                        pad_after = (4 * sample_rate - len(secondary_clip)) - pad_before
                        secondary_clip = np.pad(secondary_clip, (pad_before, pad_after), mode='constant')
                    wavfile.write(f"{output_file.stem}_secondary.wav", sample_rate, secondary_clip)
                    processed_count += 1
                    clipped_count += 1
                
            processed_count += 1
            clipped_count += 1
            
        except Exception as e:
            print(f"Error processing {filepath.name}: {str(e)}")
            continue
    
    print("\nProcessing complete!")
    print(f"Processed {processed_count} files")
    print(f"- {clipped_count} files clipped to 4 seconds")
    print(f"- {copied_count} files copied as-is (<=4 seconds)")
    print(f"Output directory: {output_path.resolve()}")

if __name__ == "__main__":
    input_directory = "../Datasets/Raw-audio/frog_sounds_wav/" 
    process_audio_files(input_directory)

Copied: other_frog_1818.wav (3.01s)
Copied: other_frog_3281.wav (2.40s)
Copied: other_frog_2736.wav (2.53s)
Copied: Clicking_Stream_Frog_191.wav (3.76s)
Copied: other_frog_269.wav (3.16s)
Copied: other_frog_111.wav (2.03s)
Copied: other_frog_661.wav (3.37s)
Copied: other_frog_962.wav (3.99s)
Copied: other_frog_1705.wav (1.08s)
Copied: other_frog_523.wav (1.19s)
Copied: other_frog_3282.wav (1.21s)
Copied: other_frog_2594.wav (0.40s)
Copied: other_frog_2158.wav (2.09s)
Copied: other_frog_2325.wav (3.40s)
Copied: other_frog_44.wav (2.56s)
Copied: other_frog_1145.wav (3.75s)
Copied: other_frog_2582.wav (3.43s)
Copied: other_frog_3202.wav (3.78s)
Copied: other_frog_3169.wav (3.41s)
Copied: other_frog_2925.wav (3.05s)
Copied: other_frog_1180.wav (3.36s)
Copied: other_frog_780.wav (3.72s)
Copied: other_frog_2543.wav (2.19s)
Copied: Clicking_Stream_Frog_59.wav (3.60s)
Copied: other_frog_2133.wav (1.01s)
Copied: other_frog_2928.wav (3.76s)
Copied: other_frog_703.wav (3.65s)
Copied: other_frog_2

## Rename files to first word_number

In [10]:
import os
import re
from collections import defaultdict

def rename_wav_files(directory):
    # Get all .wav files in the directory
    wav_files = [f for f in os.listdir(directory) if f.lower().endswith('.wav')]
    
    # Sort the files to ensure consistent ordering
    wav_files.sort()
    
    # Dictionary to keep track of counters for each prefix
    counters = defaultdict(int)
    
    for filename in wav_files:
        # Split the filename at the first underscore
        parts = re.split('_', filename, 1)
        
        # Get the first part (before first underscore) or whole filename if no underscore
        prefix = parts[0]
        
        # Increment the counter for this prefix
        counters[prefix] += 1
        
        # Create new filename
        new_name = f"{prefix}_{counters[prefix]}.wav"
        
        # Full paths for old and new names
        old_path = os.path.join(directory, filename)
        new_path = os.path.join(directory, new_name)
        
        # Rename the file
        try:
            os.rename(old_path, new_path)
            print(f"Renamed: {filename} -> {new_name}")
        except OSError as e:
            print(f"Error renaming {filename}: {e}")

if __name__ == "__main__":
    # Get the directory containing the .wav files
    # dir_path = input("Enter the directory path containing .wav files: ").strip()
    dir_path = "../Datasets/clipped-4s/"
    
    # Verify the directory exists
    if os.path.isdir(dir_path):
        rename_wav_files(dir_path)
        print("Renaming complete!")
    else:
        print("Error: The specified directory does not exist.")

Renamed: Clicking_Stream_Frog_100_primary.wav -> Clicking_1.wav
Renamed: Clicking_Stream_Frog_100_secondary.wav -> Clicking_2.wav
Renamed: Clicking_Stream_Frog_101_primary.wav -> Clicking_3.wav
Renamed: Clicking_Stream_Frog_101_secondary.wav -> Clicking_4.wav
Renamed: Clicking_Stream_Frog_102_primary.wav -> Clicking_5.wav
Renamed: Clicking_Stream_Frog_102_secondary.wav -> Clicking_6.wav
Renamed: Clicking_Stream_Frog_103_primary.wav -> Clicking_7.wav
Renamed: Clicking_Stream_Frog_103_secondary.wav -> Clicking_8.wav
Renamed: Clicking_Stream_Frog_104_primary.wav -> Clicking_9.wav
Renamed: Clicking_Stream_Frog_104_secondary.wav -> Clicking_10.wav
Renamed: Clicking_Stream_Frog_105_primary.wav -> Clicking_11.wav
Renamed: Clicking_Stream_Frog_105_secondary.wav -> Clicking_12.wav
Renamed: Clicking_Stream_Frog_106_primary.wav -> Clicking_13.wav
Renamed: Clicking_Stream_Frog_106_secondary.wav -> Clicking_14.wav
Renamed: Clicking_Stream_Frog_107_primary.wav -> Clicking_15.wav
Renamed: Clicking_St

## Convert all of the audio files into Spectrograms and save as .h5

In [14]:
import os
import numpy as np
import librosa
from joblib import Parallel, delayed
import multiprocessing
import h5py
from tqdm import tqdm

# Configuration
AUDIO_DIR = "../Datasets/clipped-4s/"
OUTPUT_DIR = "../Datasets/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

SAMPLE_RATE = 22050
N_FFT = 2048
HOP_LENGTH = 256
N_MELS = 128

def process_file(file):
    try:
        if file.lower().endswith('.wav'):
            audio_path = os.path.join(AUDIO_DIR, file)
            y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, res_type='kaiser_fast', mono=True)
            S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS)
            S_dB = librosa.power_to_db(S, ref=np.max)
            key = os.path.splitext(file)[0]
            return key, S_dB
    except Exception as e:
        print(f"\nSkipped {file}: {str(e)}")
        return None

# Get list of files first for accurate tqdm progress
files = [f for f in os.listdir(AUDIO_DIR) if f.lower().endswith('.wav')]
print(f"Processing {len(files)} audio files...")


batch_size = 500
for i in tqdm(range(0, len(files), batch_size)):
    batch = files[i:i+batch_size]
    results = Parallel(n_jobs=2)(delayed(process_file)(f) for f in batch)
    with h5py.File(os.path.join(OUTPUT_DIR, 'FD_0.4.h5'), 'a') as hf:  # 'a' for append
        for result in results:
            if result is not None:
                key, S_dB = result
                hf.create_dataset(key, data=S_dB)


print(f"\nDone! Spectrograms saved to {OUTPUT_DIR}")

Processing 12963 audio files...


100%|██████████| 26/26 [01:32<00:00,  3.54s/it]


Done! Spectrograms saved to ../Datasets/



