In [1]:
import cv2
import numpy as np
import math
import mediapipe as mp

In [2]:
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume

In [3]:
# Get the speaker device from pycaw and activate its endpoint-volume
# COM interface on it.
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)

# Cast the activated interface to an IAudioEndpointVolume pointer; `volume`
# is the handle the rest of the notebook uses to read and set the level.
volume = cast(interface, POINTER(IAudioEndpointVolume))

In [4]:
# Volume range reported by the audio endpoint. The original note recorded
# roughly -65 to 0 on the author's machine; other devices may report
# different limits, so always read them from the endpoint.
vol_Range = volume.GetVolumeRange()
minV, maxV = vol_Range[0], vol_Range[1]

In [5]:
# Optional checks on the audio endpoint (left disabled; uncomment to run):
# volume.GetMute()
# volume.GetMasterVolumeLevel()

In [6]:
# Short aliases for the two MediaPipe modules used in this notebook:
#   mpHands - the hands solution (provides Hands and HAND_CONNECTIONS)
#   mpDraw  - drawing utilities (provides draw_landmarks)
mpHands = mp.solutions.hands
mpDraw = mp.solutions.drawing_utils

In [7]:
# Single shared hand detector; findHands() calls hands.process() on each frame.
hands = mpHands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.5,
)

In [8]:
def findHands(img, draw=True):
    """Run the hand detector on a frame and optionally draw the result.

    Args:
        img: Frame in BGR channel order; it is converted to RGB before being
            passed to the module-level ``hands`` detector.
        draw: When true, the landmarks and connections of every detected hand
            are drawn onto ``img`` in place.

    Returns:
        A ``(img, results)`` tuple: the same frame object (possibly annotated)
        and the raw object returned by ``hands.process``.
    """
    results = hands.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    detected = results.multi_hand_landmarks
    if draw and detected:
        for hand_landmarks in detected:
            mpDraw.draw_landmarks(img, hand_landmarks, mpHands.HAND_CONNECTIONS)

    return img, results

In [9]:
def findPosition(img, results, draw=True, handNo=0):
    """Convert one detected hand's normalized landmarks to pixel coordinates.

    Args:
        img: Frame the landmarks were detected on; only ``img.shape[:2]``
            (height, width) is read.
        results: Detector output exposing ``multi_hand_landmarks``, a sequence
            of hands (each with a ``landmark`` sequence of points carrying
            normalized ``x`` / ``y``), or a falsy value when nothing was found.
        draw: Accepted for interface compatibility; this function does not
            draw anything.
        handNo: Index of the hand to read from ``multi_hand_landmarks``.
            Defaults to the first hand.

    Returns:
        A list of ``[landmark_index, x_px, y_px]`` entries, or an empty list
        when no hand was detected or ``handNo`` is out of range.
    """
    detected = results.multi_hand_landmarks
    # Nothing detected, or the requested hand is not present in this frame.
    if not detected or not 0 <= handNo < len(detected):
        return []

    h, w = img.shape[:2]

    # Landmarks are normalized to [0, 1]; scale by the frame size and truncate
    # to integer pixel positions.
    return [
        [idx, int(lm.x * w), int(lm.y * h)]
        for idx, lm in enumerate(detected[handNo].landmark)
    ]

In [10]:
# Gesture volume control loop.
# The pixel distance between the thumb tip and the index-finger tip is mapped
# onto the endpoint's volume range and shown as an on-screen bar.
# Press 'q' in the preview window to quit.

# Indices into lmList for the two fingertips used for the pinch gesture.
THUMB_TIP_ID = 4
INDEX_TIP_ID = 8

# Pinch distance range in pixels (closed -> open), found by observation.
HAND_RANGE = [50, 300]
# Pixel rows of the on-screen bar: y=400 is the empty end, y=150 the full end.
BAR_EMPTY_Y, BAR_FULL_Y = 400, 150

cap = cv2.VideoCapture(0)

# Request a 1080x1080 capture size.
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1080)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)

vol = 0
volBar = 400
volPer = 0

# try/finally guarantees the camera and the window are released even if a
# frame fails to process or the cell is interrupted.
try:
    while True:
        success, img = cap.read()
        if not success or img is None:
            # Without this check a failed grab would pass None to cvtColor.
            print('Could not read a frame from the camera; stopping.')
            break

        img, results = findHands(img)

        lmList = findPosition(img, results)
        if len(lmList) != 0:
            # thumb tip
            x1, y1 = lmList[THUMB_TIP_ID][1], lmList[THUMB_TIP_ID][2]
            # index-finger tip
            x2, y2 = lmList[INDEX_TIP_ID][1], lmList[INDEX_TIP_ID][2]

            # midpoint of the line joining the two fingertips
            cx, cy = (x1 + x2) // 2, (y1 + y2) // 2

            cv2.circle(img, (x1, y1), 5, (0, 0, 0), cv2.FILLED)
            cv2.circle(img, (x2, y2), 5, (0, 0, 0), cv2.FILLED)
            cv2.circle(img, (cx, cy), 5, (0, 0, 0), cv2.FILLED)

            cv2.line(img, (x1, y1), (x2, y2), (230, 85, 73), 3)

            length = math.hypot(x2 - x1, y2 - y1)

            # Map the pinch distance onto the endpoint volume range, the bar
            # height, and a 0-100 percentage. np.interp clamps values that
            # fall outside HAND_RANGE to the end points.
            vol = np.interp(length, HAND_RANGE, [minV, maxV])
            volBar = np.interp(length, HAND_RANGE, [BAR_EMPTY_Y, BAR_FULL_Y])
            volPer = np.interp(length, HAND_RANGE, [0, 100])
            volume.SetMasterVolumeLevel(vol, None)

            # Fingers closer than the minimum distance: recolor the midpoint
            # green as a visual cue.
            if length < HAND_RANGE[0]:
                cv2.circle(img, (cx, cy), 5, (0, 255, 0), cv2.FILLED)

        # Bar outline, filled portion, and percentage label. These are drawn
        # every frame, so the last values persist while no hand is visible.
        cv2.rectangle(img, (50, BAR_FULL_Y), (85, BAR_EMPTY_Y), (0, 255, 0), 3)
        cv2.rectangle(img, (50, int(volBar)), (85, BAR_EMPTY_Y), (0, 255, 0), cv2.FILLED)
        cv2.putText(img, f'{int(volPer)}%', (40, 450), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 3)
        cv2.imshow('Gesture Volume Control', img)
        if cv2.waitKey(1) & 0xff == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()