# Identifiy multiple objects and produce mixed audios

## Installing packages

Following code has been excuted in local terminal

In [None]:
#!git clone https://github.com/RoySheffer/im2wav.git
#%cd im2wav
#!pip install -r requirements.txt

fatal: destination path 'im2wav' already exists and is not an empty directory.


### Code for process images

In [2]:
import os
from PIL import Image
from ultralytics import YOLO
from pathlib import Path
import subprocess
from pydub import AudioSegment
import sys
from collections import defaultdict

## Crop Images

In [28]:
#  Create directories for cropped images
def setup_directories(crops_dir):
    os.makedirs(crops_dir, exist_ok=True)
    
# Using YOLOv8 for animal detection and cropping
def detect_and_crop_animals(image_path, crops_dir, yolo, threshold=0.5):
    img = Image.open(image_path).convert('RGB')
    results = yolo(image_path)[0]

    TARGET_CLASSES = {"dog", "cat", "cow", "sheep", "horse", "elephant", "bird", "lion"}
    detections = []
    for box, conf, cls_id in zip(results.boxes.xyxy.cpu(), results.boxes.conf.cpu(), results.boxes.cls.cpu()):
        name = yolo.names[int(cls_id)]
        if name in TARGET_CLASSES and conf > threshold:
            detections.append((name, conf.item(), box.tolist()))

    best_detections = {}
    for name, conf, box in detections:
        if name not in best_detections or conf > best_detections[name][0]:
            best_detections[name] = (conf, box)

    if not best_detections:
        print(f"No valid animals detected in {image_path}.")
        return []

    saved_crop_paths = []
    for idx, (name, (conf, box)) in enumerate(best_detections.items()):
        x1, y1, x2, y2 = map(int, box)
        crop = img.crop((x1, y1, x2, y2))
        crop_filename = f"{Path(image_path).stem}_{name}{idx}.jpg"
        crop_path = os.path.join(crops_dir, crop_filename)
        crop.save(crop_path)
        saved_crop_paths.append(crop_path)
        print(f"Animal Identified: {name}. Crop saved: {crop_path}")
    
    return saved_crop_paths

def process_folder(folder_path, crops_dir, threshold=0.5, has_lion=False):
    os.makedirs(crops_dir, exist_ok=True)
    if has_lion: 
        yolo = YOLO("E:/LSE/ST311/ST311-Group-Project/YOLOv8-Experiments/best.pt")
    else:
        yolo = YOLO('yolov8x.pt')
    image_extensions = {".jpg", ".jpeg", ".png", ".bmp"}

    for file in os.listdir(folder_path):
        if Path(file).suffix.lower() in image_extensions:
            image_path = os.path.join(folder_path, file)
            print(f"\nProcessing: {image_path}")
            detect_and_crop_animals(image_path, crops_dir, yolo, threshold=threshold)

In [None]:
# Crop images
folder_path = Path("E:/LSE/ST311/ST311-Group-Project/Mixed animals")
crops_dir = Path("E:/LSE/ST311/ST311-Group-Project/mixed_crop")

process_folder(folder_path, crops_dir, threshold=0.3)
# To expanding the dataset, we use the same method added the crops of elephant_bird, horse_sheep, and cat_bird


Processing: E:\LSE\ST311\ST311-Group-Project\Mixed animals\cow_dog1.jpg

image 1/1 E:\LSE\ST311\ST311-Group-Project\Mixed animals\cow_dog1.jpg: 448x640 1 dog, 1 cow, 62.6ms
Speed: 3.3ms preprocess, 62.6ms inference, 1.6ms postprocess per image at shape (1, 3, 448, 640)
Animal Identified: cow. Crop saved: E:\LSE\ST311\ST311-Group-Project\mixed_crop\cow_dog1_cow0.jpg
Animal Identified: dog. Crop saved: E:\LSE\ST311\ST311-Group-Project\mixed_crop\cow_dog1_dog1.jpg

Processing: E:\LSE\ST311\ST311-Group-Project\Mixed animals\cow_dog2.jpg

image 1/1 E:\LSE\ST311\ST311-Group-Project\Mixed animals\cow_dog2.jpg: 448x640 1 dog, 4 cows, 61.4ms
Speed: 2.0ms preprocess, 61.4ms inference, 7.1ms postprocess per image at shape (1, 3, 448, 640)
Animal Identified: cow. Crop saved: E:\LSE\ST311\ST311-Group-Project\mixed_crop\cow_dog2_cow0.jpg
Animal Identified: dog. Crop saved: E:\LSE\ST311\ST311-Group-Project\mixed_crop\cow_dog2_dog1.jpg

Processing: E:\LSE\ST311\ST311-Group-Project\Mixed animals\cow_d

Crop images with lions(a species that yolov8 hasn't been trained on)

In [29]:
# Crop images
folder_path = Path("E:/LSE/ST311/ST311-Group-Project/Mixed animals/elephant_lion")
crops_dir = Path("E:/LSE/ST311/ST311-Group-Project/mixed_crop/elephant_lion")

process_folder(folder_path, crops_dir, threshold=0.3)


Processing: E:\LSE\ST311\ST311-Group-Project\Mixed animals\elephant_lion\elephant_lion1.jpg

image 1/1 E:\LSE\ST311\ST311-Group-Project\Mixed animals\elephant_lion\elephant_lion1.jpg: 416x640 1 cow, 1 elephant, 1 zebra, 151.8ms
Speed: 1.8ms preprocess, 151.8ms inference, 2.4ms postprocess per image at shape (1, 3, 416, 640)
Animal Identified: elephant. Crop saved: E:\LSE\ST311\ST311-Group-Project\mixed_crop\elephant_lion\elephant_lion1_elephant0.jpg
Animal Identified: cow. Crop saved: E:\LSE\ST311\ST311-Group-Project\mixed_crop\elephant_lion\elephant_lion1_cow1.jpg

Processing: E:\LSE\ST311\ST311-Group-Project\Mixed animals\elephant_lion\elephant_lion3.jpg

image 1/1 E:\LSE\ST311\ST311-Group-Project\Mixed animals\elephant_lion\elephant_lion3.jpg: 640x512 1 elephant, 150.7ms
Speed: 2.7ms preprocess, 150.7ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 512)
Animal Identified: elephant. Crop saved: E:\LSE\ST311\ST311-Group-Project\mixed_crop\elephant_lion\elephant_lion3_el

Original YOLO model successfully identified and cropped elephants, however neither failed to detect lions nor misclassified some lion as cows

In [27]:
# Crop images
folder_path = Path("E:/LSE/ST311/ST311-Group-Project/Mixed animals/elephant_lion")
crops_dir = Path("E:/LSE/ST311/ST311-Group-Project/mixed_crop/elephant_lion")

process_folder(folder_path, crops_dir, threshold=0.3, has_lion=True)


Processing: E:\LSE\ST311\ST311-Group-Project\Mixed animals\elephant_lion\elephant_lion1.jpg

image 1/1 E:\LSE\ST311\ST311-Group-Project\Mixed animals\elephant_lion\elephant_lion1.jpg: 416x640 2 lions, 47.0ms
Speed: 4.8ms preprocess, 47.0ms inference, 4.0ms postprocess per image at shape (1, 3, 416, 640)
Animal Identified: lion. Crop saved: E:\LSE\ST311\ST311-Group-Project\mixed_crop\elephant_lion\elephant_lion1_lion0.jpg

Processing: E:\LSE\ST311\ST311-Group-Project\Mixed animals\elephant_lion\elephant_lion2.jpg

image 1/1 E:\LSE\ST311\ST311-Group-Project\Mixed animals\elephant_lion\elephant_lion2.jpg: 384x640 2 lions, 40.1ms
Speed: 4.3ms preprocess, 40.1ms inference, 4.4ms postprocess per image at shape (1, 3, 384, 640)
Animal Identified: lion. Crop saved: E:\LSE\ST311\ST311-Group-Project\mixed_crop\elephant_lion\elephant_lion2_lion0.jpg

Processing: E:\LSE\ST311\ST311-Group-Project\Mixed animals\elephant_lion\elephant_lion3.jpg

image 1/1 E:\LSE\ST311\ST311-Group-Project\Mixed anima

By loading the YOLO model finetuned on a lion-specific dataset, we successfully extracted high-quality lion crops. However, this finetuning process caused the model to lose its ability to recognize other animal species. This situation called catastrophic forgetting, which occurs when the model is updated with new information at the expense of previously learned knowledge. 

In future work, given adequate computational resources, we plan to explore more balanced and comprehensive datasets that allow us to finetune YOLO on additional animal classes while preserving its original recognition capabilities.

In [30]:
# Clip embeddings

def run_clip_for_each_folder(mixed_crop_dir, output_pickle_dir, img_clip_script_path):
    os.makedirs(output_pickle_dir, exist_ok=True)

    for folder_name in os.listdir(mixed_crop_dir):
        folder_path = os.path.join(mixed_crop_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue

        print(f"\nProcessing folder: {folder_name}")
        
        image_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path)
                       if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

        if not image_paths:
            print(f"  No images found in {folder_path}. Skipping.")
            continue

        path_list_txt = os.path.join(output_pickle_dir, f"{folder_name}_paths.txt")
        with open(path_list_txt, 'w') as f:
            for path in image_paths:
                f.write(path + '\n')

        try:
            result = subprocess.run(
                [
                    sys.executable,
                    img_clip_script_path,
                    "-save_dir", output_pickle_dir,
                    "-path_list", path_list_txt,
                    "-file_name", folder_name
                ],
                check=True,
                capture_output=True,
                text=True
            )
            print(result.stdout)
        except subprocess.CalledProcessError as e:
            print(f"Failed to generate CLIP for {folder_name}")
            print("stdout:", e.stdout)
            print("stderr:", e.stderr)

In [32]:
print(os.getcwd())
os.chdir(Path("E:/LSE/ST311/ST311-Group-Project/im2wav/run"))
print(os.getcwd())

E:\LSE\ST311\ST311-Group-Project\im2wav\run
E:\LSE\ST311\ST311-Group-Project\im2wav\run


In [None]:
output_pickle_dir = Path("E:/LSE/ST311/ST311-Group-Project/clip_pickles")
img_clip_script_path = Path("E:/LSE/ST311/ST311-Group-Project/im2wav/Data/preprocess/img_clip.py")

run_clip_for_each_folder(crops_dir, output_pickle_dir, img_clip_script_path)
# To expanding the dataset, we use the same method added the crops of elephant_bird, horse_sheep, and elephant_lion


Processing folder: cat_bird
{'save_dir': 'E:\\LSE\\ST311\\ST311-Group-Project\\clip_pickles', 'path_list': 'E:\\LSE\\ST311\\ST311-Group-Project\\clip_pickles\\cat_bird_paths.txt', 'single_pickle': False, 'file_name': 'cat_bird'}
image features class:  cat_bird1_bird1 (1, 512)
image features class:  cat_bird1_cat0 (1, 512)
image features class:  cat_bird2_bird1 (1, 512)
image features class:  cat_bird2_cat0 (1, 512)
Saved all features into E:\LSE\ST311\ST311-Group-Project\clip_pickles\cat_bird.pickle


Processing folder: cow_dog
{'save_dir': 'E:\\LSE\\ST311\\ST311-Group-Project\\clip_pickles', 'path_list': 'E:\\LSE\\ST311\\ST311-Group-Project\\clip_pickles\\cow_dog_paths.txt', 'single_pickle': False, 'file_name': 'cow_dog'}
image features class:  cow_dog1_cow0 (1, 512)
image features class:  cow_dog1_dog1 (1, 512)
image features class:  cow_dog2_cow0 (1, 512)
image features class:  cow_dog2_dog1 (1, 512)
image features class:  cow_dog5_cow0 (1, 512)
image features class:  cow_dog5_dog1

## Inference

In [3]:
def run_sample_generation(run_dir, pkl_file, sample_script_path, experiment_name, run_script_dir):
    '''
    Generate audio from crops using the im2wav model
    '''
    print("Generating audio from crops...")
    try:
        subprocess.run(
            [
                "python", sample_script_path,
                "-bs", "2",
                "-wav_per_object", "1",
                "-experiment_name", experiment_name,
                "-CLIP_dict", pkl_file,
                "-models", "im2wav",
                "-save_dir", run_dir
            ],
            check=True,
            cwd=run_script_dir,
                capture_output=True,
                text=True             
        )
    except subprocess.CalledProcessError as e:
        print("Subprocess failed with error:\n")
        print(">>> STDOUT:\n", e.stdout)
        print(">>> STDERR:\n", e.stderr)
        raise

    audios_dir = os.path.join(run_dir, f"/{experiment_name}/k_top0_p_top0/im2wav/l1")
    return audios_dir

In [4]:
print(os.getcwd())
os.chdir(Path("E:/LSE/ST311/ST311-Group-Project/im2wav/run"))
print(os.getcwd())

e:\LSE\ST311\ST311-Group-Project
E:\LSE\ST311\ST311-Group-Project\im2wav\run


In [43]:
animal_name = "cow_horse"

sample_script_path = os.path.abspath("E:/LSE/ST311/ST311-Group-Project/im2wav/models/sample.py")
run_script_dir = os.path.dirname(sample_script_path)
run_dir = os.path.abspath("E:/LSE/ST311/ST311-Group-Project/audio_output")
pickle_path = os.path.abspath(f"E:/LSE/ST311/ST311-Group-Project/clip_pickles/{animal_name}.pickle")
run_sample_generation(run_dir, pickle_path, sample_script_path, animal_name, run_script_dir)

Generating audio from crops...


'E:/cow_horse/k_top0_p_top0/im2wav/l1'

## Mix audios

In [46]:
from collections import defaultdict
from pydub import AudioSegment

def mix_wav_files(audios_dir, output_audio_dir):
    os.makedirs(output_audio_dir, exist_ok=True)

    wav_files = [f for f in os.listdir(audios_dir) if f.endswith('.wav')]

    grouped = defaultdict(list)
    for wav in wav_files:
        parts = wav.split("_")
        if len(parts) >= 3:
            group_key = parts[1] + "_" + parts[2]  # 'cat_bird1'
            grouped[group_key].append(wav)

    if not grouped:
        print(f"No groups found in: {audios_dir}")
        return

    for group_key, files in grouped.items():
        if len(files) < 2:
            print(f"Skipping group '{group_key}' (only {len(files)} file)")
            continue

        base_audio = AudioSegment.from_file(os.path.join(audios_dir, files[0]))
        for wav in files[1:]:
            new_audio = AudioSegment.from_file(os.path.join(audios_dir, wav))
            base_audio = base_audio.overlay(new_audio)

        output_path = os.path.join(output_audio_dir, f"mixed_{group_key}.wav")
        base_audio.export(output_path, format="wav")
        print(f"Mixed: {output_path}")

def mix_all_in_audio_output(audio_output_dir):
    for exp_name in os.listdir(audio_output_dir):
        exp_path = os.path.join(audio_output_dir, exp_name)
        if not os.path.isdir(exp_path):
            continue

        l1_path = os.path.join(exp_path, "k_top0_p_top0", "im2wav", "l1")
        if not os.path.exists(l1_path):
            print(f"Skip {exp_name}: l1 path not found.")
            continue

        print(f"\nProcessing: {exp_name}")
        output_mix_dir = os.path.join(exp_path, "mixed")
        mix_wav_files(l1_path, output_mix_dir)

In [47]:
mix_all_in_audio_output("E:/LSE/ST311/ST311-Group-Project/audio_output")


Processing: cat_bird
Mixed: E:/LSE/ST311/ST311-Group-Project/audio_output\cat_bird\mixed\mixed_cat_bird1.wav
Mixed: E:/LSE/ST311/ST311-Group-Project/audio_output\cat_bird\mixed\mixed_cat_bird2.wav

Processing: cow_dog
Mixed: E:/LSE/ST311/ST311-Group-Project/audio_output\cow_dog\mixed\mixed_cow_dog1.wav
Mixed: E:/LSE/ST311/ST311-Group-Project/audio_output\cow_dog\mixed\mixed_cow_dog2.wav
Mixed: E:/LSE/ST311/ST311-Group-Project/audio_output\cow_dog\mixed\mixed_cow_dog5.wav

Processing: cow_horse
Mixed: E:/LSE/ST311/ST311-Group-Project/audio_output\cow_horse\mixed\mixed_cow_horse1.wav
Mixed: E:/LSE/ST311/ST311-Group-Project/audio_output\cow_horse\mixed\mixed_cow_horse2.wav
Mixed: E:/LSE/ST311/ST311-Group-Project/audio_output\cow_horse\mixed\mixed_cow_horse3.wav
Mixed: E:/LSE/ST311/ST311-Group-Project/audio_output\cow_horse\mixed\mixed_cow_horse4.wav

Processing: cow_sheep
Mixed: E:/LSE/ST311/ST311-Group-Project/audio_output\cow_sheep\mixed\mixed_cow_sheep2.wav
Mixed: E:/LSE/ST311/ST311-G