### Readme
This notebook segments each species' recordings into 1 s segments, discarding the incomplete last segment when a recording's length is not a whole number of seconds.

It also downsamples the recordings (this was originally a TinyML project) and applies a low-pass filter to prevent aliasing.

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import shutil


import scipy.signal
import soundfile as sf

#### Source paths

In [2]:
# Root directory of the dataset: every class is stored in its own sub-directory here.
destin = Path('drive/MyDrive/HumBug')
classes = [
  'an arabiensis',
  'culex pipiens complex',
  'an funestus ss',
  'ae aegypti',
  'background',
]

# Map every class name to the Path of its dedicated folder.
paths = {c: destin / c for c in classes}

# Create whichever class folders are missing, then verify each one is really on disk.
for c, path in paths.items():
  path.mkdir(parents=True, exist_ok=True)
  if path.exists():
    continue
  print(f"Warning: {path} was not created.")

In [4]:
# Class folders as a plain list (dict insertion order); the bare name below displays it.
folderPaths = list(paths.values())
folderPaths

[PosixPath('drive/MyDrive/HumBug/an arabiensis'),
 PosixPath('drive/MyDrive/HumBug/culex pipiens complex'),
 PosixPath('drive/MyDrive/HumBug/an funestus ss'),
 PosixPath('drive/MyDrive/HumBug/ae aegypti'),
 PosixPath('drive/MyDrive/HumBug/background')]

#### Segmentation of a single folder

In [5]:
def segmentFolder(folder: Path, target_sr=16_000, segment_duration=1, exclude_files=()):
  """Resample every recording in ``folder`` and split it into fixed-length WAV segments.

  Each file directly inside ``folder`` is read with soundfile, resampled to
  ``target_sr`` with ``scipy.signal.resample_poly`` (polyphase resampling, which
  low-pass filters the signal to prevent aliasing) and cut into consecutive
  segments of ``segment_duration`` seconds. Trailing samples that do not fill a
  whole segment are discarded. Segments are written to
  ``<folder.parent>/Segmented/<folder.name>_seg/<stem>-Segment_<k>.wav`` with
  ``k`` starting at 1. Segment files that already exist are left untouched.

  A failure on an individual file is reported with ``print`` and does not stop
  the processing of the remaining files.

  Args:
    folder: Directory holding the recordings of one class.
    target_sr: Sample rate (Hz) of the written segments.
    segment_duration: Segment length in seconds; may be fractional (e.g. 0.5).
    exclude_files: File names (not full paths) that must be skipped.

  Raises:
    ValueError: If ``target_sr * segment_duration`` rounds to less than one sample.
  """
  # Loop-invariant. Rounded to an int so that a fractional duration still works
  # with range() and slicing below.
  segment_samples = int(round(target_sr * segment_duration))
  if segment_samples < 1:
    # Fail fast, before any directory is created, instead of logging a
    # division-by-zero error for every single file.
    raise ValueError(
      f"segment_duration={segment_duration} at target_sr={target_sr} "
      "is shorter than one sample"
    )

  output_path = folder.parent/'Segmented'/(folder.name+'_seg')
  output_path.mkdir(parents=True, exist_ok=True)
  skipped_names = set(exclude_files)
  print(f"- Segmenting {folder.name}")

  for i, path in enumerate(folder.iterdir()):
    if path.name in skipped_names:
      continue

    # Lightweight progress report every 500 directory entries.
    if i % 500 == 0:
      print(f"Folder: {folder.name} - Finished: {i}")

    # Sub-directories cannot be recordings; skip them quietly rather than
    # reporting a read error for each of them.
    if not path.is_file():
      continue

    try:
      original_audio, original_sr = sf.read(path)

      # Reduce target_sr/original_sr to the smallest integer up/down factors.
      gcd = np.gcd(original_sr, target_sr)
      up = target_sr // gcd
      down = original_sr // gcd
      resampled_audio = scipy.signal.resample_poly(original_audio, up, down)

      # Floor division drops the incomplete tail, so every slice below is
      # exactly segment_samples long and needs no extra length check.
      num_segments = len(resampled_audio) // segment_samples
      for j in range(num_segments):
        outputFile = output_path/(path.stem + f'-Segment_{j+1}.wav')
        if outputFile.exists():
          continue

        start = j * segment_samples
        sf.write(outputFile, resampled_audio[start:start + segment_samples], target_sr)
    except Exception as e:
      # Best effort: one unreadable file must not abort the whole folder.
      print(f"Error processing {path}: {e}")

  print(f"- {folder.name} is Done.")

#### Run it in parallel

In [6]:
from concurrent.futures import ProcessPoolExecutor as Executor

if __name__ == "__main__":
  # One task per class folder, run on a process pool with the default worker count.
  with Executor() as pool:
    # Drain the result iterator so that an exception raised in a worker surfaces here.
    for _ in pool.map(segmentFolder, folderPaths):
      pass


- Segmenting an arabiensis- Segmenting culex pipiens complex

Folder: an arabiensis - Finished: 0
Folder: culex pipiens complex - Finished: 0
Folder: an arabiensis - Finished: 500
- an arabiensis is Done.
- Segmenting an funestus ss
Folder: an funestus ss - Finished: 0
- an funestus ss is Done.
- Segmenting ae aegypti
Folder: ae aegypti - Finished: 0
- ae aegypti is Done.
- Segmenting background
Folder: background - Finished: 0
Folder: culex pipiens complex - Finished: 500
- culex pipiens complex is Done.
- background is Done.
