In [1]:
# This is the Distraction module. The notebook runs as-is; the only thing you may need to change is the path of the saved video, so that multiple runs do not overwrite each other.
# You can integrate this code with the Drowsiness module, but the combined pipeline will be limited by the speed at which this code runs.
# The hand detector is a bit slow (only about 5 - 10 frames per second on average) and slows everything down; try to optimize it if possible.

In [2]:
# Importing libraries 

import cv2                                                # opencv for image and video applications
import time                                               # time for timed functions in the below modules
import numpy as np                                        # for all mathematical operations and array operations
import mediapipe as mp                                    # for all mediapipe landmarker functions 4
from ultralytics import YOLO                              # Importing YOLO model from ultralytics
from ultralytics.utils.plotting import Annotator          # Importing annotator from ultralytics

In [3]:
# Cellphone detector: the YOLOv8 nano weights are used because the model is small and still gives the required detection results
CELLPHONE_WEIGHTS = 'yolov8n.pt'
cellphone_model = YOLO(CELLPHONE_WEIGHTS)

In [4]:
# The hand-detection code is adapted from MediaPipe: https://github.com/google/mediapipe/blob/master/mediapipe/python/solutions/hands.py
# The function Hand_detector detects one or both hands in a frame.

# Input:
# image - original image frame as a numpy array

MARGIN = 10                                                                     # Padding in pixels added around the hand bounding box (and between the box and its label)

_hand_landmarker = None                                                         # MediaPipe Hands instance, created on first use and shared by every Hand_detector call


def _get_hand_landmarker():
    """Return the shared MediaPipe Hands instance, creating it on the first call."""
    global _hand_landmarker
    if _hand_landmarker is None:
        # Building the Hands graph is expensive; doing it once instead of once per frame
        # removes most of the per-frame overhead. The configuration itself is unchanged.
        _hand_landmarker = mp.solutions.hands.Hands(static_image_mode=True, max_num_hands=2, model_complexity=1, min_detection_confidence=0.2, min_tracking_confidence=0.5)
    return _hand_landmarker


def Hand_detector(image):
    """Detect up to two hands in a frame and draw a labelled box around each.

    Args:
        image: original image frame as a numpy array in BGR channel order
            (it is converted with ``cv2.COLOR_BGR2RGB`` before detection).
            The array is neither drawn on nor has its flags modified.

    Returns:
        A copy of ``image`` with a green rectangle and a "Hand detected" label
        for every detected hand. If no hand is found the copy is unannotated.
    """
    label = "Hand detected"
    font = cv2.FONT_HERSHEY_DUPLEX
    font_scale = 1
    font_thickness = 1

    hands = _get_hand_landmarker()

    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)                          # MediaPipe expects RGB input
    rgb_image.flags.writeable = False                                           # Read-only hint is set on the array actually handed to MediaPipe, not on the caller's frame
    results_hands = hands.process(rgb_image)                                    # Hand landmarks are detected

    annotated = image.copy()                                                    # Draw on a copy so the caller keeps a clean frame for other detectors
    if not results_hands.multi_hand_landmarks:                                  # No hands detected
        return annotated

    height, width = image.shape[:2]                                             # Frame dimensions, used to scale the normalized landmarks
    (_, label_height), _ = cv2.getTextSize(label, font, font_scale, font_thickness)

    for hand_landmarks in results_hands.multi_hand_landmarks:                   # One entry per detected hand
        xs = [landmark.x for landmark in hand_landmarks.landmark]               # Normalized x coordinates of all landmarks
        ys = [landmark.y for landmark in hand_landmarks.landmark]               # Normalized y coordinates of all landmarks

        left = int(min(xs) * width); right = int(max(xs) * width)               # Tight bounding box in pixels
        top = int(min(ys) * height); bottom = int(max(ys) * height)

        # putText anchors the text at its bottom-left corner, so clamp the anchor to keep
        # the label inside the frame when the hand touches the top or left edge.
        text_x = max(left, 0)
        text_y = max(top - MARGIN, label_height)

        cv2.rectangle(annotated, (left - MARGIN, top - MARGIN), (right + MARGIN, bottom + MARGIN), (0, 255, 0), 2)      # Bounding box, extended by MARGIN on every side
        cv2.putText(annotated, label, (text_x, text_y), font, font_scale, (88, 205, 54), font_thickness, cv2.LINE_AA)   # Label above the box

    return annotated

# Output: the image frame as a numpy array, now annotated with the detected hands

In [5]:
# The function Cellphone_predictor performs cellphone object detection.
# Inputs:
# image - the original image frame, used for detection
# annotated_image - the image frame on which the detection is drawn

def Cellphone_predictor(image, annotated_image, device=0):
    """Detect a cellphone in ``image`` and draw its labelled box on ``annotated_image``.

    Args:
        image: original image frame passed to the YOLO model for detection.
        annotated_image: image frame on which the detection box is drawn
            (may already carry annotations from other detectors).
        device: device forwarded to ``cellphone_model.predict``. The default
            ``0`` keeps the original behaviour (GPU 0); pass e.g. ``"cpu"``
            on a machine without a GPU.

    Returns:
        The annotated image returned by the ultralytics ``Annotator``. When
        nothing is detected it carries no additional box.
    """
    cell_phone_results = cellphone_model.predict(image,                                                   # The YOLO model for cellphone detection:
                                                 conf=0.2,                                                # detections with at least 20% confidence are accepted,
                                                 device=device,                                           # inference device (default 0 = GPU),
                                                 classes=67,                                              # only class 67 (cellphone) is detected,
                                                 max_det=1)                                               # and at most one cellphone per frame

    # Created before the loop so that annotator.result() is valid even when
    # predict() yields no result objects (it used to be unbound in that case).
    annotator = Annotator(annotated_image)

    for r in cell_phone_results:                                                                          # One result object per input image
        for box in r.boxes:                                                                               # One box per detected cellphone
            b = box.xyxy[0]                                                                               # Box coordinates in (left, top, right, bottom) format
            c = box.cls                                                                                   # Class index of the detection
            annotator.box_label(b, cellphone_model.names[int(c)])                                         # Draw the box with the class name as label

    return annotator.result()

# Output: the annotated image as a numpy array, with any detected cellphone boxed and labelled

In [6]:
# This cell records one minute of live video with hand and cellphone detection. Combined with face detection, it becomes much easier to build
# full distraction detection based on cellphone use, the face turning away, and hand movements in front of the face.
# Remember to change the path where the video is saved; that is the only thing that needs to be changed.

OUTPUT_VIDEO_PATH = 'live3.mp4'                                                                                           # Change this path to keep recordings from earlier runs
RECORD_SECONDS = 60                                                                                                       # The loop stops by itself after this many seconds
FALLBACK_FPS = 30.0                                                                                                       # Used when the camera does not report a usable frame rate

video = cv2.VideoCapture(0)                                                                                               # Initializing video capturing through webcam or any connected camera
frame_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))                                                                    # frame width of video captured
frame_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))                                                                  # frame height of video captured
frame_rate = video.get(cv2.CAP_PROP_FPS)                                                                                  # rate at which the camera reports frames
if not frame_rate or frame_rate <= 0:                                                                                     # some cameras report 0 FPS, which the video writer cannot use
    frame_rate = FALLBACK_FPS
fourcc = cv2.VideoWriter_fourcc(*'mp4v')                                                                                  # 'mp4v' matches the .mp4 container ('XVID' is meant for .avi files)
output_video = cv2.VideoWriter(OUTPUT_VIDEO_PATH, fourcc, frame_rate, (frame_width, frame_height))                        # video storage creator
frame_no = 0                                                                                                              # Number of frames processed and written

# NOTE(review): frames are written at the camera's reported FPS, but the detection loop is slower than that
# (the recorded run of this cell produced 355 frames in 60 s), so the saved video plays back faster than real time.

try:
    if not video.isOpened():
        print("Could not open the camera")
    elif not output_video.isOpened():
        print("Could not open the output video for writing; frames will be displayed but not saved")

    timeout = time.monotonic() + RECORD_SECONDS                                                                           # monotonic clock: unaffected by system clock adjustments
    while video.isOpened() and time.monotonic() < timeout:
        ret, image = video.read()                                                                                         # Video is read frame by frame after each loop
        if not ret:                                                                                                       # If no frame is returned the loop stops
            print("Video is over")
            break
        frame_no += 1                                                                                                     # Updating loop index
        image = cv2.flip(image, 1)                                                                                        # Flipping the image horizontally
        annotated_image = Hand_detector(image)                                                                            # Hand detection
        annotated_image = Cellphone_predictor(image, annotated_image)                                                     # Cellphone detection
        cv2.imshow("Live", annotated_image)                                                                               # displaying annotated image
        output_video.write(annotated_image)                                                                               # the annotated images are stored in the file
        if cv2.waitKey(1) & 0xFF == ord('q'):                                                                             # waitKey also refreshes the window; pressing 'q' stops the recording
            break
finally:
    # Always free the camera, finalize the file and close the window, even if a detector
    # raises or the cell is interrupted; otherwise the camera stays locked until the kernel restarts.
    video.release()                                                                                                       # release the camera
    output_video.release()                                                                                                # release storage annotated frames video
    cv2.destroyAllWindows()                                                                                               # destroys all video windows
print(frame_no)                                                                                                           # Number of frames processed during the recording


0: 480x640 (no detections), 206.8ms
Speed: 18.0ms preprocess, 206.8ms inference, 96.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 12.0ms
Speed: 5.7ms preprocess, 12.0ms inference, 3.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 14.6ms
Speed: 3.5ms preprocess, 14.6ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 9.9ms
Speed: 8.1ms preprocess, 9.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 13.9ms
Speed: 4.3ms preprocess, 13.9ms inference, 2.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 16.9ms
Speed: 1.9ms preprocess, 16.9ms inference, 2.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 13.6ms
Speed: 4.4ms preprocess, 13.6ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 15.5ms
Speed: 0.0ms preprocess, 15.5ms

355


In [10]:
# import cv2
# import mediapipe as mp
# # For webcam input:
# cap = cv2.VideoCapture(0)
# index = 0
# while cap.isOpened():
#   success, image = cap.read()
#   if not success:
#     break
#   with mp.solutions.hands.Hands(model_complexity=0, min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
#     # To improve performance, optionally mark the image as not writeable to pass by reference.
#     image.flags.writeable = False
#     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
#     results = hands.process(image)
#     # Draw the hand annotations on the image.
#     image.flags.writeable = True
#     image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
#     if results.multi_hand_landmarks:
#       for hand_landmarks in results.multi_hand_landmarks:
#         mp.solutions.drawing_utils.draw_landmarks(
#             image,
#             hand_landmarks,
#             mp.solutions.hands.HAND_CONNECTIONS,
#             mp.solutions.drawing_styles.get_default_hand_landmarks_style(),
#             mp.solutions.drawing_styles.get_default_hand_connections_style())
#     # Flip the image horizontally for a selfie-view display.
#     cv2.imshow('MediaPipe Hands', cv2.flip(image, 1))
#     index += 1
#     if cv2.waitKey(5) & 0xFF == 27:
#       break
# cap.release()
# cv2.destroyAllWindows()