# In [None]:
import cv2
from ultralytics import YOLO
import numpy as np
from gtts import gTTS
from playsound import playsound
import time
import os
import threading
from datetime import datetime

# ============== Voice-alert de-duplication state (initial values) ====================
# Number of seconds compared against the time elapsed since the last alert.
alert_interval = 3
# Label of the most recently spoken alert; None until something has been announced.
last_alert_label = None
# time.time() value recorded for the most recent alert; 0 means "never alerted".
last_alert_time = 0

# Multithreaded wrapper around speak()
def speak_multi_thread(text):
    """Run ``speak(text)`` on a background daemon thread and return immediately.

    The thread is created with ``daemon=True``, so it does not keep the
    interpreter alive once the main thread finishes.
    """
    worker = threading.Thread(target=speak, args=(text,), daemon=True)
    worker.start()

def speak(text):
    """Synthesize *text* as Korean speech with gTTS and play it.

    The audio is written to a temporary MP3 in the current working directory,
    played with ``playsound`` and then deleted. Any failure is reported on
    stdout instead of being raised (best-effort behaviour).

    Args:
        text: The sentence to speak.
    """
    # speak() is launched on several threads at once; a single shared
    # "temp.mp3" would let one thread overwrite or delete the clip another
    # thread is still playing, so each live thread gets its own file name.
    filename = f"temp_{threading.get_ident()}.mp3"
    try:
        tts = gTTS(text=text, lang='ko')
        tts.save(filename)
        playsound(filename)
    except Exception as e:
        print("TTS error:", e)
    finally:
        # Clean up even when synthesis or playback failed, so stale clips
        # do not accumulate on disk.
        try:
            os.remove(filename)
        except FileNotFoundError:
            pass  # nothing was written (e.g. synthesis failed before save)
        except OSError as e:
            print("TTS error:", e)
# =============================================

# ============== MiDaS setup ====================
# Depth-estimation network loaded through OpenCV's DNN module.
midas_model = cv2.dnn.readNet("model-small.onnx")
# Number of frames processed so far; the main loop uses it to refresh the
# depth map only every few frames.
frame_idx = 0
# Most recent depth map. The main loop tests `depth_map is None`, so the
# cache must be initialised under that exact name (it was previously created
# as `depth`, leaving `depth_map` undefined until its first assignment).
depth_map = None

def estimate_depth(frame):
    """Run the MiDaS network on *frame* and return a frame-sized depth map.

    The frame is scaled by 1/255, resized to 256x256 and channel-swapped
    (``swapRB=True``) before inference. The first element of the network
    output is min-max normalized to the range [0, 1] and resized to the
    frame's own dimensions (``frame.shape[1]`` by ``frame.shape[0]``).

    Args:
        frame: Input image array (presumably a BGR frame from
            ``cv2.VideoCapture`` -- confirm against the caller).

    Returns:
        The normalized depth map, resized to match *frame*.
    """
    net_input = cv2.dnn.blobFromImage(
        frame, 1/255.0, (256, 256), swapRB=True, crop=False)
    midas_model.setInput(net_input)
    prediction = midas_model.forward()
    raw_depth = prediction[0, :, :]
    normalized = cv2.normalize(raw_depth, None, 0, 1, cv2.NORM_MINMAX)
    # cv2.resize takes the target size as (width, height).
    target_size = (frame.shape[1], frame.shape[0])
    return cv2.resize(normalized, target_size)
# =============================================

# Class names (as reported by the YOLO model) that trigger a warning.
Danger_classes = ['car', 'bus', 'truck', 'person', 'bicycle', 'traffic light']

cap = cv2.VideoCapture('SeoulWalk.mp4')
yolo_model = YOLO('yolov8n.pt')
if not cap.isOpened():
    print("비디오 열기 실패")
    exit()

# Cached depth map; defined here so the `is None` check in the loop never
# depends on short-circuit evaluation to avoid an undefined name.
depth_map = None
# Time of the last spoken alert per label. A single "last label" cannot
# throttle alerts when two classes appear in the same frame: each one would
# look "new" relative to the other and a TTS thread would start every frame.
last_alert_times = {}

try:
    while True:
        ret, frame = cap.read()
        # Stop before touching the frame: at end of stream `frame` is None
        # and must not be passed to the depth estimator.
        if not ret:
            break

        # ============== MiDaS ====================
        # Depth estimation is refreshed only every 5th frame; the frames in
        # between reuse the cached map.
        if frame_idx % 5 == 0 or depth_map is None:
            depth_map = estimate_depth(frame)
        map_height, map_width = depth_map.shape[:2]
        # =============================================

        results = yolo_model(frame, conf=0.5, verbose=False)[0]

        for box in results.boxes:
            cls_id = int(box.cls[0])
            label = yolo_model.names[cls_id]
            if label not in Danger_classes:
                continue

            x1, y1, x2, y2 = map(int, box.xyxy[0])

            # ============== MiDaS ====================
            # Sample the depth map at the box centre, clamped so a box that
            # touches the frame border cannot index outside the map.
            cx = min(max((x1 + x2) // 2, 0), map_width - 1)
            cy = min(max((y1 + y2) // 2, 0), map_height - 1)
            depth_score = depth_map[cy, cx]
            if depth_score > 0.6:
                cv2.putText(frame, f"{label} near!", (x1, y1 - 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
            # =============================================
            cv2.putText(frame, f"{label} detected!", (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)

            # ============== Voice alert ======================
            # NOTE(review): the spoken message says the object is near, but
            # it fires for every detection, not only when depth_score > 0.6
            # -- confirm whether it should be gated on the depth check.
            now = time.time()
            previous = last_alert_times.get(label)
            if previous is None or now - previous > alert_interval:
                speak_multi_thread(f"{label}가 가까이 있습니다 조심하세요")
                last_alert_times[label] = now
                # Keep the module-level "most recent alert" state current.
                last_alert_label = label
                last_alert_time = now
            # =============================================

        cv2.imshow("GuidePath", frame)
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            break

        # ============== MiDaS ====================
        frame_idx += 1
        # =============================================
finally:
    # Release the capture and close the window even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()