In [1]:
import cv2
import time
import numpy
import mediapipe as mp
import numpy as np
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume
from comtypes import CLSCTX_ALL
from ctypes import POINTER, cast

In [7]:
# --- Configuration ---
# Resolution requested from the webcam, in pixels.
cam_width = 1280
cam_height = 880
# Overlay colours. Frames are converted back with COLOR_RGB2BGR before drawing,
# so the tuples are given in BGR order.
PINK, RED, GREEN = (255, 0, 255), (0, 0, 255), (0, 255, 0)
# Font for the on-screen text, plus marker radius and line width.
font = cv2.FONT_HERSHEY_SIMPLEX
point_radius = line_thickness = 8
# Thumb-to-index distance span that np.interp maps onto the volume range.
thumb_idx_dist_range = (50, 250)
# The lock can only toggle while the lock distance is below this value.
lock_dist_lim = 20

In [3]:
# create volume interface
# Look up the speaker device through pycaw and activate its IAudioEndpointVolume
# interface; `volume` is the handle the main loop calls SetMasterVolumeLevel on.
devices = AudioUtilities.GetSpeakers()
audio_interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
# Cast the activated interface to a typed IAudioEndpointVolume pointer.
volume = cast(audio_interface, POINTER(IAudioEndpointVolume))
# GetVolumeRange() yields three values; only the first two are kept and later
# used as the output range of np.interp in the main loop.
# NOTE(review): presumably these bounds are in decibels - confirm against the pycaw docs.
min_vol, max_vol, _ = volume.GetVolumeRange()

In [4]:
# Shared MediaPipe hand-landmark detector; at most one hand is reported per frame.
hand_detector = mp.solutions.hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    model_complexity=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

In [5]:
def get_2d_dist(landmark1, landmark2):
    """Return the Euclidean distance between two (x, y) points.

    Each argument must unpack into exactly two numeric coordinates.
    """
    (ax, ay), (bx, by) = landmark1, landmark2
    dx, dy = bx - ax, by - ay
    squared_norm = np.square(dx) + np.square(dy)
    return np.sqrt(squared_norm)

In [None]:
def get_3d_dist(landmark1, landmark2):
    """Return the Euclidean distance between two (x, y, z) points.

    Each argument must unpack into exactly three numeric coordinates.
    """
    (ax, ay, az), (bx, by, bz) = landmark1, landmark2
    dx, dy, dz = bx - ax, by - ay, bz - az
    squared_norm = np.square(dx) + np.square(dy) + np.square(dz)
    return np.sqrt(squared_norm)

In [8]:
# Gesture loop state.
landmarks_list = []    # pixel coordinates of the tracked landmarks in the current frame
volume_locked = True   # while True, the thumb/index distance does not change the volume
prev_dist = 0          # lock distance measured on the previous frame that contained a hand
# MediaPipe hand landmark indices, in the order they are stored in landmarks_list:
#   landmarks_list[0] -> 4  thumb tip
#   landmarks_list[1] -> 8  index fingertip
#   landmarks_list[2] -> 10 middle finger PIP joint
#   landmarks_list[3] -> 12 middle fingertip
tracked_landmark_ids = (4, 8, 10, 12)
# Minimum frame-to-frame change of the lock distance required to toggle the lock.
lock_speed_lim = 30
# NOTE(review): every landmark is shifted down by this many pixels; the reason is
# not visible in this notebook - confirm whether the offset is still wanted.
landmark_y_offset = 30
# Give up after this many consecutive failed reads instead of spinning forever.
max_failed_reads = 30
failed_reads = 0

# setup input stream, in this case, webcam
stream = cv2.VideoCapture(0)
stream.set(cv2.CAP_PROP_FRAME_WIDTH, cam_width)
stream.set(cv2.CAP_PROP_FRAME_HEIGHT, cam_height)

try:
    # Main loop: runs until 'q' is pressed or the camera stops delivering frames.
    while True:
        prev_img_load_time = time.time()
        ret, img = stream.read()
        if not ret:
            # Tolerate transient failures, but do not loop forever on a dead
            # camera (the 'q' check below is never reached without a frame).
            failed_reads += 1
            if failed_reads >= max_failed_reads:
                print(f'No frame received after {max_failed_reads} attempts, stopping.')
                break
            continue
        failed_reads = 0

        # optional, you can just flip your camera
        # img = cv2.flip(img, 0)
        # The detector is fed an RGB copy; drawing happens on the original BGR frame.
        rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        results = hand_detector.process(rgb_img)

        landmarks_list.clear()
        if results.multi_hand_landmarks:
            # Landmark coordinates are normalised, so scale them by the size of
            # the frame actually delivered (the camera may ignore the requested size).
            frame_height, frame_width = img.shape[:2]
            hand_landmarks = results.multi_hand_landmarks[0].landmark
            for landmark_id in tracked_landmark_ids:
                point = hand_landmarks[landmark_id]
                landmarks_list.append((int(point.x * frame_width),
                                       int(point.y * frame_height + landmark_y_offset)))

            thumb_idx_dist = get_2d_dist(landmarks_list[0], landmarks_list[1])
            lock_dist = get_2d_dist(landmarks_list[2], landmarks_list[3])
            # Toggle the lock when the middle fingertip is close to its own PIP
            # joint (finger curled) and the distance changed fast enough since
            # the previous frame.
            if lock_dist < lock_dist_lim and np.abs(lock_dist - prev_dist) > lock_speed_lim:
                volume_locked = not volume_locked
            # Only allow adjust volume if lock is off
            if not volume_locked:
                volume_level = np.interp(thumb_idx_dist, thumb_idx_dist_range, (min_vol, max_vol))
                volume.SetMasterVolumeLevel(volume_level, None)
                cv2.putText(img, f'Volume: {volume_level}', (30, 150), font, 0.5, PINK)
            prev_dist = lock_dist

            cv2.putText(img, f'Volume lock: {volume_locked}', (30, 120), font, 0.5,
                        color=RED if volume_locked else GREEN)
            cv2.putText(img, f'Thumb Index Dist: {round(thumb_idx_dist, 2)}', (30, 60), font, 0.5, PINK)
            cv2.putText(img, f'Mid lock Dist: {round(lock_dist, 2)}', (30, 90), font, 0.5, PINK)
            for landmark in landmarks_list:
                cv2.circle(img, landmark, point_radius, RED, cv2.FILLED)
            cv2.line(img, landmarks_list[0], landmarks_list[1], GREEN, line_thickness)

        # Draw FPS (guard against a zero interval from a coarse system clock)
        elapsed = time.time() - prev_img_load_time
        fps = int(1 / elapsed) if elapsed > 0 else 0
        cv2.putText(img, f'FPS : {fps}', (30, 30), font, 0.5, PINK)
        # Show the frame
        cv2.imshow('Hand Detector', img)
        if cv2.waitKey(1) == ord('q'):
            break
finally:
    # Always free the camera and close the window, even if a frame raised.
    stream.release()
    cv2.destroyAllWindows()

In [None]:
class VolumeAdjuster:
    """Control the system master volume with webcam hand gestures.

    Typical use: create the object, call ``setup()`` to open the audio endpoint
    and the webcam, ``run()`` to process frames until 'q' is pressed, then
    ``stop()`` to release the camera and close the preview window.

    Gestures (one hand):
      * the thumb-tip to index-tip distance is interpolated onto the volume range;
      * quickly curling the middle finger (tip close to its PIP joint) toggles
        the volume lock.
    """

    # MediaPipe hand landmark indices, in the order stored in ``landmarks_list``:
    # thumb tip, index fingertip, middle finger PIP joint, middle fingertip.
    TRACKED_LANDMARK_IDS = (4, 8, 10, 12)

    def __init__(self):
        """Create the hand detector and set display/gesture parameters.

        No audio or camera resource is opened here; that happens in ``setup()``.
        """
        self.volume = None
        self.vol_range = None
        self.stream = None
        self.hand_detector = mp.solutions.hands.Hands(
                                model_complexity=1,
                                min_detection_confidence=0.5,
                                min_tracking_confidence=0.5,
                                static_image_mode=False,
                                max_num_hands=1)
        # display
        self.colors = {
            'pink' : (255, 0, 255),
            'red' : (0, 0, 255),
            'green' : (0, 255, 0)
        }
        self.font = cv2.FONT_HERSHEY_SIMPLEX
        self.point_radius = 8
        self.line_thickness = 8
        # finger range
        self.thumb_idx_dist_range = (50, 300)
        # Was 1: with integer pixel coordinates a distance below 1 means the two
        # points coincide, so the lock could never toggle. 20 matches the value
        # used by the script version of this loop.
        self.lock_dist_lim = 20
        # Minimum frame-to-frame change of the lock distance needed to toggle.
        self.lock_speed_lim = 30
        # NOTE(review): every landmark is shifted down by this many pixels; the
        # reason is not visible in this notebook - confirm it is still wanted.
        self.landmark_y_offset = 30
        # Stop run() after this many consecutive failed frame reads.
        self.max_failed_reads = 30
        # attr for hand detector
        self.landmarks_list = []
        self.volume_locked = True
        self.prev_dist = 0

    def setup(self):
        """Open the speaker volume interface and the webcam (device 0)."""
        # audio setup
        devices = AudioUtilities.GetSpeakers()
        audio_interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
        self.volume = cast(audio_interface, POINTER(IAudioEndpointVolume))
        min_vol, max_vol, _ = self.volume.GetVolumeRange()
        self.vol_range = (min_vol, max_vol)
        # camera setup
        self.stream = cv2.VideoCapture(0)
        self.cam_size = (1280, 880)
        cam_width, cam_height = self.cam_size
        self.stream.set(cv2.CAP_PROP_FRAME_WIDTH, cam_width)
        # Fixed: the height used to be set from cam_size[0] (the width).
        self.stream.set(cv2.CAP_PROP_FRAME_HEIGHT, cam_height)

    def run(self):
        """Process webcam frames until 'q' is pressed or the camera stops.

        Raises:
            RuntimeError: if ``setup()`` has not been called first.
        """
        if self.stream is None or self.volume is None:
            raise RuntimeError('VolumeAdjuster.setup() must be called before run()')

        pink = self.colors['pink']
        red = self.colors['red']
        green = self.colors['green']
        failed_reads = 0

        while True:
            prev_img_load_time = time.time()
            ret, img = self.stream.read()
            if not ret:
                # Tolerate transient failures, but do not spin forever on a
                # dead camera (the 'q' check is never reached without a frame).
                failed_reads += 1
                if failed_reads >= self.max_failed_reads:
                    print(f'No frame received after {self.max_failed_reads} attempts, stopping.')
                    break
                continue
            failed_reads = 0

            # optional, you can just flip your camera
            # img = cv2.flip(img, 0)
            # The detector is fed an RGB copy; drawing happens on the BGR frame.
            rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            results = self.hand_detector.process(rgb_img)

            self.landmarks_list.clear()
            if results.multi_hand_landmarks:
                # Landmark coordinates are normalised, so scale by the size of
                # the frame actually delivered by the camera.
                frame_height, frame_width = img.shape[:2]
                hand_landmarks = results.multi_hand_landmarks[0].landmark
                for landmark_id in self.TRACKED_LANDMARK_IDS:
                    point = hand_landmarks[landmark_id]
                    self.landmarks_list.append(
                        (int(point.x * frame_width),
                         int(point.y * frame_height + self.landmark_y_offset)))
                points = self.landmarks_list

                thumb_idx_dist = get_2d_dist(points[0], points[1])
                lock_dist = get_2d_dist(points[2], points[3])
                # Toggle the lock when the middle fingertip is close to its own
                # PIP joint and the distance changed fast enough since the
                # previous frame.
                if (lock_dist < self.lock_dist_lim
                        and np.abs(lock_dist - self.prev_dist) > self.lock_speed_lim):
                    self.volume_locked = not self.volume_locked
                # Only allow adjust volume if lock is off
                if not self.volume_locked:
                    volume_level = np.interp(thumb_idx_dist, self.thumb_idx_dist_range, self.vol_range)
                    self.volume.SetMasterVolumeLevel(volume_level, None)
                    cv2.putText(img, f'Volume: {volume_level}', (30, 60), self.font, 0.5, pink)
                self.prev_dist = lock_dist

                cv2.putText(img, f'Volume lock: {self.volume_locked}', (30, 90), self.font, 0.5,
                            color=red if self.volume_locked else green)
                cv2.circle(img, points[0], self.point_radius, red, cv2.FILLED)
                cv2.circle(img, points[1], self.point_radius, red, cv2.FILLED)
                cv2.line(img, points[0], points[1], green, self.line_thickness)

            # Draw FPS (guard against a zero interval from a coarse system clock)
            elapsed = time.time() - prev_img_load_time
            fps = int(1 / elapsed) if elapsed > 0 else 0
            cv2.putText(img, f'FPS : {fps}', (30, 30), self.font, 0.5, pink)
            # Show the frame
            cv2.imshow('frame', img)
            if cv2.waitKey(1) == ord('q'):
                break

    def stop(self):
        """Release the webcam (if it was opened) and close all OpenCV windows."""
        if self.stream is not None:
            self.stream.release()
            self.stream = None
        cv2.destroyAllWindows()