### ReadMe
This code takes the csv file of timestamps and cleans the HumBug DB by deleting the audio segments listed in that csv.


Since I only have the 5 classes in the csv timestamps, this code assumes one directory per class, with all the segmented files named in the format ID-Segment_segmentNumber.wav, for example: 200415-Segment_1.wav

In [None]:
import os
import pandas as pd
from pathlib import Path
from typing import List
from tqdm import tqdm

### Create the paths and load the csv

In [None]:
#get path to segmented folders.
#each folder is named "<type>_seg", where <type> corresponds to the type in the csv
SourcePath = Path("drive/MyDrive/HumBug/Segmented") #set your own
SEGMENT_SUFFIX = "_seg"
destinationPaths = {}
for folder in SourcePath.iterdir():
  # skip stray files and folders that do not follow the "<type>_seg" naming,
  # otherwise they would become bogus class keys and break os.listdir later
  if folder.is_dir() and folder.name.endswith(SEGMENT_SUFFIX):
    destinationPaths[folder.name[:-len(SEGMENT_SUFFIX)]] = folder
destinationPaths

{'culex pipiens complex': PosixPath('drive/MyDrive/HumBug/Segmented/culex pipiens complex_seg'),
 'an funestus ss': PosixPath('drive/MyDrive/HumBug/Segmented/an funestus ss_seg'),
 'ae aegypti': PosixPath('drive/MyDrive/HumBug/Segmented/ae aegypti_seg'),
 'background': PosixPath('drive/MyDrive/HumBug/Segmented/background_seg'),
 'an arabiensis': PosixPath('drive/MyDrive/HumBug/Segmented/an arabiensis_seg')}

In [None]:
#code to print number of files in each folder
for t, folder in destinationPaths.items():
  print(folder.name, len(os.listdir(folder)))

culex pipiens complex_seg 7911
an funestus ss_seg 7235
ae aegypti_seg 1283
background_seg 5555
an arabiensis_seg 14422


In [None]:
#read csv
df = pd.read_csv("Cleaned Mosquitoes.csv")
df

Unnamed: 0,id,type,timeStamp
0,222312,an arabiensis,21-22
1,220055,culex pipiens complex,3-4
2,220218,culex pipiens complex,9-10
3,220393,ae aegypti,0-1
4,220945,culex pipiens complex,0-1
...,...,...,...
3233,221037,culex pipiens complex,0-1
3234,220376,an arabiensis,20-21
3235,220124,an funestus ss,0-1
3236,221661,culex pipiens complex,2-3


### Function that deletes the files listed in df from the class folders

In [None]:
def cleanHumBug(paths: dict, csv: pd.DataFrame, dryRun: bool) -> List[Path]:
  """Delete the segment files listed in the csv from their class folders.

  Each row names one file: the part of 'timeStamp' after the dash is the
  segment number, and the file is looked up as
  paths[row['type']] / "<id>-Segment_<segmentNumber>.wav".

  Args:
    paths: maps a class name (the csv 'type' value) to that class's folder.
    csv: rows with 'id', 'type' and 'timeStamp' (e.g. "21-22") columns.
    dryRun: when True nothing is removed; matching files are only collected.

  Returns:
    The paths that were deleted (or, in a dry run, would be deleted), each
    listed once. Rows with an unknown type, a malformed timeStamp, no
    matching file, or a failed delete are skipped.
  """
  deleted_files = []
  recorded = set()
  # iterrows() is a generator with no length, so give tqdm the total explicitly
  for index, row in tqdm(csv.iterrows(), total=len(csv)):
    #get name of file. id-Segment_number.wav
    # str() so a missing value (NaN) is reported below instead of crashing the run
    timeStamp = str(row['timeStamp'])
    _, dash, segmentNumber = timeStamp.partition('-')
    if not dash:
      print(f"Skipping row {index}: malformed timeStamp {timeStamp!r}")
      continue

    file_name = f"{row['id']}-Segment_{segmentNumber}.wav"
    cls = row['type']
    try:
      file_path = paths[cls] / file_name
    except KeyError:
      print(f"KeyError: {cls} not found in paths")
      continue

    # A duplicate csv row must not be counted twice; without this a dry run
    # would report more files than the real run can actually delete.
    if file_path in recorded:
      continue

    try:
      if not file_path.exists():
        continue
      if not dryRun:
        file_path.unlink()
    except OSError as e:
      print(f"Error deleting {file_path}: {e}")
      continue

    # Record only after the delete succeeded (or was skipped by the dry run)
    recorded.add(file_path)
    deleted_files.append(file_path)

  return deleted_files

In [None]:
# Dry run: nothing is removed. Report how many csv rows match a file on disk
# so the effect can be checked before the real deletion.
deleted = cleanHumBug(destinationPaths, df, dryRun=True)
print(f"Dry run: {len(deleted)} files would be deleted ({len(df)} csv rows)")

3238it [00:01, 2186.89it/s]


### Make a copy of the segments - deleting is risky

In [None]:
import shutil

# Back up the whole SourcePath tree into a sibling folder.
# The copy is made only once: an existing backup folder is left untouched.
CopyPath = SourcePath.parent / "Redundant Segments"

if CopyPath.exists():
  print("Backup folder already exists.")
else:
  shutil.copytree(SourcePath, CopyPath)

Backup folder already exists.


### Delete Bad Segments

In [None]:
# Real run (dryRun=False): every matching segment file is unlinked from disk.
# `deleted` holds the paths cleanHumBug reports as removed.
deleted = cleanHumBug(destinationPaths, df, dryRun=False)

3238it [00:09, 347.49it/s]


In [None]:
#recheck the number of files in each class
for t, folder in destinationPaths.items():
  print(folder.name, len(os.listdir(folder)))

culex pipiens complex_seg 6733
an funestus ss_seg 6801
ae aegypti_seg 1139
background_seg 5555
an arabiensis_seg 12940
