In [1]:
import cv2
import numpy as np
from depth_estimation import load_depth_model, estimate_depth
from ultralytics import YOLO
from PIL import Image

In [19]:
def load_model():
    # Load a pre-trained YOLOv8 model
    model = YOLO('yolov8n.pt')
    return model

In [20]:
def detect_objects(image, model, conf_threshold=0.5):
    """
    Detect objects in an image using YOLOv8
    
    Args:
        image: Can be a file path, PIL Image, or numpy array
        model: YOLO model from load_model()
        conf_threshold: Confidence threshold for detections
        
    Returns:
        Dictionary with boxes, labels, scores
    """
    # Process the image depending on its type
    if isinstance(image, str):
        # No need to preprocess if it's a path - YOLO handles this
        pass
    elif not isinstance(image, (Image.Image, np.ndarray)):
        raise ValueError("Invalid image input. Provide a PIL Image, file path, or numpy array")
    
    # Run inference with YOLO
    results = model(image, conf=conf_threshold)
    
    # Process results into a similar format as you were using before
    boxes = []
    labels = []
    scores = []
    
    # Extract detection information
    for r in results:
        for box in r.boxes:
            # Get coordinates
            x1, y1, x2, y2 = box.xyxy[0].tolist()  # Convert to list
            boxes.append([x1, y1, x2, y2])
            
            # Get class and confidence
            cls = int(box.cls[0])
            labels.append(r.names[cls])
            scores.append(float(box.conf[0]))
    
    return {
        "boxes": boxes,
        "labels": labels,
        "scores": scores
    }

In [21]:
def test_depth_estimation(image_path):
    # Load the depth model
    midas_model = load_depth_model()

    # Read the image
    image = cv2.imread(image_path)

    if image is None:
        print("Error: Unable to load image")
        return

    # Perform depth estimation
    depth_map = estimate_depth(image, midas_model)

    # Resize depth map to match the original image size
    depth_map_resized = cv2.resize(depth_map, (image.shape[1], image.shape[0]))

    # color map
    # depth_map_color = cv2.applyColorMap(depth_map, cv2.COLORMAP_JET)

    # Display the original image and the depth map
    cv2.imshow("Original Image", image)
    cv2.imshow("Depth Map", depth_map_resized)

    cv2.waitKey(0)  # Wait until any key is pressed
    cv2.destroyAllWindows()

In [28]:
image_path = "C:/Users/tonyl/Downloads/Screenshot 2025-04-18 135242.png"
# test_depth_estimation(image_path)
depth_model = load_depth_model()
frame = cv2.imread(image_path)
depth_map = estimate_depth(frame, depth_model)

In [29]:
object_detect = load_model()
results = detect_objects(image_path, object_detect, conf_threshold=0.3)


image 1/1 C:\Users\tonyl\Downloads\Screenshot 2025-04-18 135242.png: 384x640 1 person, 1 bed, 35.9ms
Speed: 1.1ms preprocess, 35.9ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)


In [30]:
#Print object detection + depth results
for box, label, score in zip(results["boxes"], results["labels"], results["scores"]):
    x1, y1, x2, y2 = [int(coord) for coord in box]

    # Clamp coordinates to be within depth_map size
    x1 = max(0, min(x1, depth_map.shape[1] - 1))
    x2 = max(0, min(x2, depth_map.shape[1] - 1))
    y1 = max(0, min(y1, depth_map.shape[0] - 1))
    y2 = max(0, min(y2, depth_map.shape[0] - 1))

    # Slice the depth map for the region inside the box
    object_depth = depth_map[y1:y2, x1:x2]

    if object_depth.size > 0:
        average_depth = object_depth.mean()
        print(f"{label} ({score:.2f}) → Avg depth: {average_depth:.2f}")
    else:
        print(f"{label} has an empty depth region — box might be out of bounds")

person (0.93) → Avg depth: 118.78
bed (0.36) → Avg depth: 14.55


In [1]:
from kokoro_audio import text_to_audio, load_kokoro_model

In [2]:
kokoro = load_kokoro_model()



  WeightNorm.apply(module, name, dim)


In [None]:
# testing kokoro
text = "Once upon a midnight dreary, while I pondered, " \
"weak and weary, over many a quaint and curious volume of forgotten lore— " \
"while I nodded, nearly napping, suddenly there came a tapping, as of " \
"someone gently rapping, rapping at my chamber door. 'Tis some visitor,' " \
"I muttered, 'tapping at my chamber door— only this and nothing more.'"
audio = text_to_audio(kokoro, text, output_dir="./audio_output", file_name="output")

0 Once upon a midnight dreary, while I pondered, weak and weary, over many a quaint and curious volume of forgotten lore— while I nodded, nearly napping, suddenly there came a tapping, as of someone gently rapping, rapping at my chamber door. 'Tis some visitor,' I muttered, 'tapping at my chamber door— only this and nothing more.' wˈʌns əpˈɑn ɐ mˈɪdnˌIt dɹˈɪɹi, wˌIl ˌI pˈɑndəɹd, wˈik ænd wˈɪɹi, ˈOvəɹ mˈɛni ɐ kwˈAnt ænd kjˈʊɹiəs vˈɑljˌum ʌv fəɹɡˈɑtn lˈɔɹ— wˌIl ˌI nˈɑdᵻd, nˈɪɹli nˈæpɪŋ, sˈʌdᵊnli ðɛɹ kˈAm ɐ tˈæpɪŋ, æz ʌv sˈʌmwˌʌn ʤˈɛntli ɹˈæpɪŋ, ɹˈæpɪŋ æt mI ʧˈAmbəɹ dˈɔɹ. ”tˈiz sˌʌm vˈɪzəTəɹ,” ˌI mˈʌTəɹd, ”tˈæpɪŋ æt mI ʧˈAmbəɹ dˈɔɹ— ˈOnli ðɪs ænd nˈʌθɪŋ mˈɔɹ.”


In [None]:
#Decoding base64 audio
import base64
import json

with open('output.json', 'r') as file:
    audio_base64 = json.load(file)['audio_base64']

# Decode the base64 string into bytes
audio_bytes = base64.b64decode(audio_base64)

# Save the decoded audio to a file 
with open("output_audio.wav", "wb") as f:
    f.write(audio_bytes)

UklGRlTNAgBXQVZFZm10IBAAAAABAAEAwF0AAIC7AAACABAAZGF0YTDNAgD//wAA//8AAP///////wAAAAD//wAA/////wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA//8AAP//AAAAAAAAAAD//wAA//8AAAAA//8AAP//AAAAAP//AAAAAAAA//8AAAAA/////wAAAAAAAP////8AAAAAAAD/////AAD//wAA//8AAP////8AAAAAAAD//wAA//8AAP////8AAP//AAD//wAAAAAAAAAA//8AAP//AAAAAAAA/////wAAAAD//wAAAAAAAAAA//8AAP////8AAP//AAD/////AAD//wAA//8AAAAAAAD//////////wAA//8AAAAAAAAAAP//AAD//////////wAA