# Data Preproccesing for CNN Model

## Data Downloader from iNaturlist

In [None]:
import pandas as pd
import requests
import os
from urllib.parse import urlparse
from tqdm import tqdm  # Import tqdm for the progress bar

# Read the CSV file
df = pd.read_csv('../Datasets/Inat/observations-all-recordings.csv/observations-598206.csv')

# Create a directory to save the sound files
os.makedirs('frog_sounds', exist_ok=True)

# Helper function to generate a unique filename
def get_unique_filename(directory, base_name, extension):
    full_path = os.path.join(directory, base_name + extension)
    counter = 1
    while os.path.exists(full_path):
        full_path = os.path.join(directory, f"{base_name}_{counter}{extension}")
        counter += 1
    return full_path

# Create a progress bar for the total number of downloads
with tqdm(total=len(df), desc="Downloading frog sounds") as pbar:
    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        sound_url = row['sound_url']
        common_name = row['common_name']
        
        if pd.notna(sound_url) and pd.notna(common_name):
            try:
                # Extract the file extension from the URL
                parsed_url = urlparse(sound_url)
                filename = os.path.basename(parsed_url.path)
                extension = os.path.splitext(filename)[1]
                
                # Clean the common name to make it a valid filename
                clean_name = "".join(c if c.isalnum() or c in (' ', '-') else '_' for c in common_name).strip()
                clean_name = clean_name.replace(' ', '_')  # Optional: replace spaces with underscores
                
                # Generate a unique filename
                unique_filename = get_unique_filename('frog_sounds', clean_name, extension)
                
                # Download the file
                response = requests.get(sound_url, stream=True)
                if response.status_code == 200:
                    with open(unique_filename, 'wb') as f:
                        for chunk in response.iter_content(1024):
                            f.write(chunk)
                    # print(f"Downloaded: {unique_filename}")
                else:
                    print(f"Failed to download: {sound_url}")
            except Exception as e:
                print(f"Error processing {sound_url}: {e}")

            # Update the progress bar after each download
            pbar.update(1)

print("Download complete.")

## Convert all files into .WAV


In [1]:
import os
import subprocess
from pathlib import Path
from tqdm import tqdm

def convert_to_spectrogram_ready(input_dir, output_dir, sample_rate=22050, mono=True):
    
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    audio_files = []
    extensions = ('.wav', '.mp3', '.mp4', '.m4a', '.aac', '.flac', '.ogg', '.mpga')
    
    for root, _, files in os.walk(input_dir):
        for file in files:
            # if file.lower().endswith(extensions):
                audio_files.append(Path(root) / file)
    
    print(f"Found {len(audio_files)} audio files to process")
    
    for input_file in tqdm(audio_files, desc="Converting for spectrograms"):
        output_file = Path(output_dir) / f"{input_file.stem}.wav"
        
        cmd = [
            'ffmpeg',
            '-i', str(input_file),
            '-ac', '1' if mono else '2',  # mono recommended for spectrograms
            '-ar', str(sample_rate),  # 22050 is standard for many ML models
            '-sample_fmt', 's16',  # 16-bit PCM
            '-acodec', 'pcm_s16le',  # Standard WAV format
            '-y',
            str(output_file)
        ]
        
        try:
            subprocess.run(cmd, check=True, 
                         stdout=subprocess.DEVNULL, 
                         stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            print(f"\nError processing {input_file}: {e.stderr.decode('utf-8')}")
    
    print(f"\nConversion complete. Files ready for spectrogram generation in {output_dir}")

if __name__ == "__main__":
    convert_to_spectrogram_ready(
        input_dir="frog_sounds/",
        output_dir="./frog_sounds_wav/"
    )

Found 7632 audio files to process


Converting for spectrograms: 100%|██████████| 7632/7632 [08:05<00:00, 15.71it/s]


Conversion complete. Files ready for spectrogram generation in ./frog_sounds_wav/





## Rename files for only ID 4 most common frogs

In [None]:
import os
from pathlib import Path
from collections import defaultdict

def rename_frog_files(directory):
    """
    Renames frog audio files according to specific rules:
    - Keeps original names for Mountain_Rain_Frog, Southern_Dainty_Frog, Clicking_Stream_Frog
    - Renames others to other_frog_X.wav with incrementing X
    """
    # Files to keep original names
    KEEP_NAMES = {
        "mountain_rain_frog",
        "southern_dainty_frog",
        "clicking_stream_frog"
    }
    
    # Initialize counter for other_frog files
    other_counter = 1
    # Dictionary to track original name to new name mappings (for avoiding duplicates)
    name_mapping = defaultdict(str)
    
    # Get all .wav files in directory
    for filepath in Path(directory).glob("*.wav"):
        original_name = filepath.stem.lower()  # Get filename without extension
        
        # Check if we should keep original name
        should_keep = any(
            keep_name in original_name 
            for keep_name in KEEP_NAMES
        )
        
        if should_keep:
            continue  # Skip renaming
        
        # Generate new name
        new_name = f"other_frog_{other_counter}.wav"
        new_path = filepath.with_name(new_name)
        
        # Handle potential name conflicts
        while new_path.exists():
            other_counter += 1
            new_name = f"other_frog_{other_counter}.wav"
            new_path = filepath.with_name(new_name)
        
        # Rename the file
        filepath.rename(new_path)
        print(f"Renamed: {filepath.name} -> {new_name}")
        other_counter += 1

if __name__ == "__main__":
    directory_path = "./frog_sounds_wav/"  # Change to your directory
    rename_frog_files(directory_path)
    print("File renaming complete!")

## Get file Info

In [9]:
import os
from pathlib import Path
import wave
import contextlib
from collections import defaultdict

def analyze_short_frog_files(directory, max_duration=4):
    """
    Finds all WAV files shorter than specified duration and counts by frog type.
    
    Args:
        directory (str): Path to directory containing WAV files
        max_duration (float): Maximum duration in seconds to consider as "short"
        
    Returns:
        tuple: (list of short files with durations, frog type counts, total count)
    """
    short_files = []
    frog_counts = defaultdict(int)
    special_frogs = {
        'mountain_rain_frog',
        'southern_dainty_frog',
        'clicking_stream_frog'
    }

    for filepath in Path(directory).glob('*.wav'):
        try:
            with contextlib.closing(wave.open(str(filepath), 'r')) as wav_file:
                frames = wav_file.getnframes()
                rate = wav_file.getframerate()
                duration = frames / float(rate)
                
                if duration < max_duration:
                    filename = filepath.stem.lower()
                    
                    # Determine frog type
                    frog_type = "other"
                    for special in special_frogs:
                        if special in filename:
                            frog_type = special.replace('_', ' ').title()
                            break
                    
                    short_files.append((filepath.name, duration, frog_type))
                    frog_counts[frog_type] += 1
                    
        except (wave.Error, EOFError) as e:
            print(f"Error processing {filepath.name}: {str(e)}")
            continue
    
    return short_files, frog_counts, len(short_files)

if __name__ == "__main__":
    folder_path = "./frog_sounds_wav/"  # Replace with your folder path
    try:
        short_files, frog_counts, total_count = analyze_short_frog_files(folder_path)
        
        if not short_files:
            print("No files shorter than 5 seconds found.")
        else:
            print("Files shorter than 5 seconds:")
            print("-" * 65)
            print(f"{'Filename':<40} {'Duration':<10} {'Frog Type':<15}")
            print("-" * 65)
            for filename, duration, frog_type in short_files:
                print(f"{filename:<40} {duration:.2f}s{'':<5} {frog_type:<15}")
            
            print("-" * 65)
            print("\nFrog Type Counts:")
            for frog_type, count in sorted(frog_counts.items(), key=lambda x: x[1], reverse=True):
                print(f"{frog_type:<20}: {count}")
            
            print("-" * 65)
            print(f"Total files shorter than 5 seconds: {total_count}")
            
    except Exception as e:
        print(f"Error: {str(e)}")

Files shorter than 5 seconds:
-----------------------------------------------------------------
Filename                                 Duration   Frog Type      
-----------------------------------------------------------------
other_frog_1818.wav                      3.01s      other          
other_frog_3281.wav                      2.40s      other          
other_frog_2736.wav                      2.53s      other          
Clicking_Stream_Frog_191.wav             3.76s      Clicking Stream Frog
other_frog_269.wav                       3.16s      other          
other_frog_111.wav                       2.03s      other          
other_frog_661.wav                       3.37s      other          
other_frog_962.wav                       3.99s      other          
other_frog_1705.wav                      1.08s      other          
other_frog_523.wav                       1.19s      other          
other_frog_3282.wav                      1.21s      other          
other_frog_2594.w

## Clip files to 4 seconds

In [10]:
import os
import numpy as np
from pathlib import Path
from scipy.io import wavfile

def process_audio_files(input_dir, output_dir="clipped_4sec"):
    """
    Process mono audio files:
    - For files >4 seconds: create 4-second clip centered at max amplitude
    - For files <=4 seconds: copy as-is to output directory
    - All files saved to output directory
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    
    # Create output directory if it doesn't exist
    output_path.mkdir(exist_ok=True)
    
    processed_count = 0
    clipped_count = 0
    copied_count = 0
    
    for filepath in input_path.glob('*.wav'):
        try:
            # Read WAV file (mono)
            sample_rate, data = wavfile.read(filepath)
            
            # Ensure mono audio
            if len(data.shape) > 1:
                print(f"Skipping {filepath.name} - stereo audio not supported")
                continue
            
            duration = len(data) / sample_rate
            output_file = output_path / filepath.name
            
            if duration <= 4.0:
                # For short files, just copy to output directory
                wavfile.write(output_file, sample_rate, data)
                copied_count += 1
                print(f"Copied: {filepath.name} ({duration:.2f}s)")
            else:
                # Find point of max amplitude
                max_idx = np.argmax(np.abs(data))
                
                # Calculate 4-second window (2 seconds before and after max amplitude)
                half_window = int(2 * sample_rate)
                start = max(0, max_idx - half_window)
                end = start + 4 * sample_rate
                
                # Adjust if window exceeds file length
                if end > len(data):
                    end = len(data)
                    start = max(0, end - 4 * sample_rate)
                
                # Extract 4-second clip
                clipped_data = data[start:end]
                
                # Ensure exactly 4 seconds (pad with zeros if needed)
                if len(clipped_data) < 4 * sample_rate:
                    clipped_data = np.pad(clipped_data, 
                                        (0, 4 * sample_rate - len(clipped_data)), 
                                        mode='constant')
                
                # Save clipped file
                wavfile.write(output_file, sample_rate, clipped_data)
                clipped_count += 1
                print(f"Clipped: {filepath.name} ({duration:.2f}s -> 4.00s)")
                
            processed_count += 1
            
        except Exception as e:
            print(f"Error processing {filepath.name}: {str(e)}")
            continue
    
    print("\nProcessing complete!")
    print(f"Processed {processed_count} files")
    print(f"- {clipped_count} files clipped to 4 seconds")
    print(f"- {copied_count} files copied as-is (<=4 seconds)")
    print(f"Output directory: {output_path.resolve()}")

if __name__ == "__main__":
    input_directory = "./frog_sounds_wav/"  # Replace with your input directory
    process_audio_files(input_directory)

Clipped: Southern_Dainty_Frog_660.wav (60.00s -> 4.00s)
Clipped: other_frog_708.wav (60.00s -> 4.00s)
Copied: other_frog_1818.wav (3.01s)
Clipped: Mountain_Rain_Frog_291.wav (60.00s -> 4.00s)
Clipped: other_frog_817.wav (60.00s -> 4.00s)
Clipped: other_frog_2425.wav (60.00s -> 4.00s)
Clipped: other_frog_3113.wav (60.00s -> 4.00s)
Clipped: other_frog_1119.wav (60.00s -> 4.00s)
Copied: other_frog_3281.wav (2.40s)
Clipped: other_frog_441.wav (28.12s -> 4.00s)
Clipped: other_frog_2310.wav (60.00s -> 4.00s)
Clipped: other_frog_3214.wav (60.00s -> 4.00s)
Clipped: Clicking_Stream_Frog_154.wav (6.83s -> 4.00s)
Clipped: other_frog_325.wav (60.00s -> 4.00s)
Clipped: Southern_Dainty_Frog_576.wav (60.00s -> 4.00s)
Clipped: other_frog_3285.wav (60.00s -> 4.00s)
Clipped: Southern_Dainty_Frog_333.wav (60.00s -> 4.00s)
Clipped: other_frog_1779.wav (60.00s -> 4.00s)
Clipped: Mountain_Rain_Frog_231.wav (60.00s -> 4.00s)
Clipped: other_frog_208.wav (60.00s -> 4.00s)
Clipped: Southern_Dainty_Frog_139.wav 

## Convert all of the audio files into Spectrograms and save as .h5

In [1]:
import os
import numpy as np
import librosa
from joblib import Parallel, delayed
import multiprocessing
import h5py
from tqdm import tqdm

# Configuration
AUDIO_DIR = "./clipped_4sec"
OUTPUT_DIR = "./Datasets/FD-0.3/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

SAMPLE_RATE = 22050
N_FFT = 2048
HOP_LENGTH = 256
N_MELS = 128

def process_file(file):
    try:
        if file.lower().endswith('.wav'):
            audio_path = os.path.join(AUDIO_DIR, file)
            y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, res_type='kaiser_fast', mono=True)
            S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS)
            S_dB = librosa.power_to_db(S, ref=np.max)
            key = os.path.splitext(file)[0]
            return key, S_dB
    except Exception as e:
        print(f"\nSkipped {file}: {str(e)}")
        return None

# Get list of files first for accurate tqdm progress
files = [f for f in os.listdir(AUDIO_DIR) if f.lower().endswith('.wav')]
print(f"Processing {len(files)} audio files...")


batch_size = 500
for i in tqdm(range(0, len(files), batch_size)):
    batch = files[i:i+batch_size]
    results = Parallel(n_jobs=2)(delayed(process_file)(f) for f in batch)
    with h5py.File(os.path.join(OUTPUT_DIR, 'spectrograms.h5'), 'a') as hf:  # 'a' for append
        for result in results:
            if result is not None:
                key, S_dB = result
                hf.create_dataset(key, data=S_dB)


print(f"\nDone! Spectrograms saved to {OUTPUT_DIR}")

Processing 6141 audio files...


  0%|          | 0/13 [00:05<?, ?it/s]


ValueError: Unable to synchronously create dataset (name already exists)