In [1]:
import cv2
import time
import torch
import numpy as np
import mediapipe as mp
import matplotlib.pyplot as plt
from ultralytics import YOLO
from ultralytics.utils.plotting import Annotator

In [2]:
# Load the detector from the 'yolov8n.pt' weights file; Cellphone_predictor uses this object.
cellphone_model = YOLO('yolov8n.pt')

# Prefer the first CUDA device when torch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [3]:
MARGIN = 10  # padding, in pixels, added around each detected hand's bounding box


def _draw_hand_boxes(image, hands):
    """Run ``hands`` on ``image`` and return an annotated copy of the frame."""
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Only the temporary RGB copy is flagged read-only. The previous version flagged the
    # caller's array and re-enabled the flag on a different array, which left the
    # caller's frame permanently non-writeable.
    rgb.flags.writeable = False
    results_hands = hands.process(rgb)

    # Draw on a copy so the caller's untouched frame can still be fed to other detectors.
    annotated = image.copy()
    height, width = annotated.shape[:2]

    # multi_hand_landmarks is falsy when nothing was found; iterate over nothing then.
    for hand_landmarks in results_hands.multi_hand_landmarks or ():
        xs = np.array([landmark.x for landmark in hand_landmarks.landmark])
        ys = np.array([landmark.y for landmark in hand_landmarks.landmark])

        # Landmark coordinates are scaled by the frame size to obtain pixel positions.
        label_x = int(xs.min() * width)
        left = label_x - MARGIN
        right = int(xs.max() * width) + MARGIN
        top = int(ys.min() * height) - MARGIN
        bottom = int(ys.max() * height) + MARGIN

        cv2.rectangle(annotated, (left, top), (right, bottom), (0, 255, 0), 2)
        cv2.putText(annotated, "Hand detected", (label_x, top), cv2.FONT_HERSHEY_DUPLEX, 1, (88, 205, 54), 1, cv2.LINE_AA)
    return annotated


def Hand_detector(image, hands=None):
    """Detect hands in a frame and draw a labelled box around each one.

    Args:
        image: Frame to analyse. It is converted with ``COLOR_BGR2RGB`` before
            detection, so it is expected in OpenCV's BGR channel order. The array
            is neither modified nor re-flagged.
        hands: Optional, already constructed ``mp.solutions.hands.Hands`` object to
            reuse across frames. When omitted, a detector is created and closed
            for this single call (the original behaviour).

    Returns:
        A new array of the same shape as ``image`` with a green rectangle and a
        "Hand detected" label drawn for every detected hand. When no hand is
        found the returned array is an unannotated copy.
    """
    if hands is not None:
        return _draw_hand_boxes(image, hands)
    with mp.solutions.hands.Hands(static_image_mode=True, max_num_hands=2, model_complexity=1, min_detection_confidence=0.2, min_tracking_confidence=0.5) as owned_hands:
        return _draw_hand_boxes(image, owned_hands)

In [4]:
def Cellphone_predictor(image, annotated_image):
    """Detect a cell phone in ``image`` and draw its box onto ``annotated_image``.

    Args:
        image: Frame passed to ``cellphone_model.predict``.
        annotated_image: Frame the detection box and class label are drawn on.

    Returns:
        The annotated frame as returned by ``Annotator.result()``. When nothing
        is detected no box is drawn.
    """
    # Use GPU 0 only when CUDA is actually available; a hard-coded device index
    # makes the call fail on CPU-only machines.
    target_device = 0 if torch.cuda.is_available() else "cpu"
    # classes=67 restricts the detector to one class id (presumably COCO "cell phone";
    # the label actually drawn is looked up in cellphone_model.names below).
    cell_phone_results = cellphone_model.predict(image, conf=0.2, device=target_device, classes=67, max_det=1)

    # Created before the loop so the return below is valid even for an empty result list.
    annotator = Annotator(annotated_image)
    for result in cell_phone_results:
        for box in result.boxes:
            corners = box.xyxy[0]  # box coordinates in (left, top, right, bottom) format
            class_id = int(box.cls)
            annotator.box_label(corners, cellphone_model.names[class_id])
    return annotator.result()

In [5]:
# Capture from the default webcam for up to 60 seconds (or until 'q' is pressed),
# annotate each frame with hand and cell-phone detections, show it, and record it.
video = cv2.VideoCapture(0)
frame_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
# get() yields 0 when the backend cannot report the property; a writer opened
# with 0 FPS produces an unusable file, so fall back to a nominal rate.
frame_rate = video.get(cv2.CAP_PROP_FPS) or 30.0
# 'mp4v' matches the .mp4 container; the former 'XVID' tag belongs to AVI files.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_video = cv2.VideoWriter('live3.mp4', fourcc, frame_rate, (frame_width, frame_height))
frame_no = 0
timeout = time.time() + 60
try:
    while video.isOpened():
        ret, image = video.read()
        if not ret:
            print("Video is over")
            break
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        if time.time() > timeout:
            break
        frame_no += 1
        # Mirror the frame horizontally before detection and display.
        image = cv2.flip(image, 1)
        annotated_image = Hand_detector(image)
        annotated_image = Cellphone_predictor(image, annotated_image)
        cv2.imshow("Live", annotated_image)
        output_video.write(annotated_image)
finally:
    # Release the camera, finalise the file and close the window even if a frame fails.
    video.release()
    output_video.release()
    cv2.destroyAllWindows()
print(frame_no)


0: 480x640 (no detections), 190.4ms
Speed: 4.5ms preprocess, 190.4ms inference, 36.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 5.7ms
Speed: 3.0ms preprocess, 5.7ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 5.4ms
Speed: 2.4ms preprocess, 5.4ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 7.6ms
Speed: 0.9ms preprocess, 7.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 15.0ms
Speed: 2.0ms preprocess, 15.0ms inference, 2.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 11.8ms
Speed: 0.0ms preprocess, 11.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 11.1ms
Speed: 0.0ms preprocess, 11.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 7.4ms
Speed: 1.0ms preprocess, 7.4ms infere

467


In [6]:
# Frame counter incremented once per processed frame by the capture loop above.
frame_no

467

In [10]:
import cv2
import mediapipe as mp
# For webcam input:
cap = cv2.VideoCapture(0)
index = 0  # number of frames displayed
try:
  # Build the Hands solution once and reuse it for every frame: constructing it
  # inside the loop reloads the model per frame and prevents tracking from
  # carrying over between consecutive frames.
  with mp.solutions.hands.Hands(model_complexity=0, min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
    while cap.isOpened():
      success, image = cap.read()
      if not success:
        break
      # The detector is fed an RGB conversion of the captured BGR frame. Marking
      # that copy as not writeable lets it be passed by reference.
      rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
      rgb.flags.writeable = False
      results = hands.process(rgb)
      # Draw the hand annotations on the (still writeable) BGR frame.
      if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
          mp.solutions.drawing_utils.draw_landmarks(
              image,
              hand_landmarks,
              mp.solutions.hands.HAND_CONNECTIONS,
              mp.solutions.drawing_styles.get_default_hand_landmarks_style(),
              mp.solutions.drawing_styles.get_default_hand_connections_style())
      # Flip the image horizontally for a selfie-view display.
      cv2.imshow('MediaPipe Hands', cv2.flip(image, 1))
      index += 1
      # Esc (key code 27) stops the loop.
      if cv2.waitKey(5) & 0xFF == 27:
        break
finally:
  # Free the camera and close the window even when a frame raises.
  cap.release()
  cv2.destroyAllWindows()

In [9]:
# Counter incremented once per displayed frame by the MediaPipe webcam loop.
index

770

In [7]:
# Inspect the landmark list of one detected hand. NOTE(review): this name is only
# bound at notebook scope by the `for hand_landmarks in ...` loop of the webcam cell,
# so this cell fails with NameError unless that loop has seen at least one hand.
hand_landmarks

landmark {
  x: 0.21603435277938843
  y: 0.8123367428779602
  z: 2.970191870232952e-09
}
landmark {
  x: 0.28310590982437134
  y: 0.7474789619445801
  z: -0.00993076991289854
}
landmark {
  x: 0.3358166813850403
  y: 0.6385415196418762
  z: -0.028876740485429764
}
landmark {
  x: 0.3607523441314697
  y: 0.5558997392654419
  z: -0.054721999913454056
}
landmark {
  x: 0.3649657964706421
  y: 0.48662590980529785
  z: -0.08116442710161209
}
landmark {
  x: 0.27100589871406555
  y: 0.5281548500061035
  z: -0.017298365011811256
}
landmark {
  x: 0.3003007471561432
  y: 0.46127185225486755
  z: -0.053502582013607025
}
landmark {
  x: 0.33891406655311584
  y: 0.45266544818878174
  z: -0.07519225031137466
}
landmark {
  x: 0.37056729197502136
  y: 0.46306687593460083
  z: -0.08923202008008957
}
landmark {
  x: 0.2264578640460968
  y: 0.5355290174484253
  z: -0.042621124535799026
}
landmark {
  x: 0.2677033841609955
  y: 0.4610220491886139
  z: -0.08758321404457092
}
landmark {
  x: 0.3186665177

In [7]:
# Frames processed in a 60-second run (presumably the loop counter), by configuration:
#  355 - without mobile detection
#  355 - without mobile detection, using NumPy arrays
#  578 - without hand detection
#  683 - without hand detection, using NumPy arrays
#  897 - without hand and mobile detection
# 1201 - without hand and mobile detection, using NumPy arrays
# 1044 - no display; nothing except head orientation and face
# 1597 - only the cellphone detector

 Calibration

Record each of the following states for 5 seconds:

Eye Open

Eye Close

Eye 80% closed

For each category, take the average value over all of its frames and plot the graphs; use the average for the 80%-closed category as the cutoff.
