In [None]:
import numpy as np
import sounddevice as sd
import cv2
import mediapipe as mp
import threading
from collections import deque
import time

# Sample rate handed to the sounddevice output stream and used to convert
# frame counts into time when synthesizing the tone.
SAMPLE_RATE = 44100
# NOTE(review): not referenced in the code visible here — presumably an
# intended audio block length in seconds; confirm before relying on it.
BUFFER_DURATION = 0.01  
# Number of recent fingertip positions averaged per hand (deque maxlen).
SMOOTHING_WINDOW = 5
# NOTE(review): the two thresholds below are not referenced in the code
# visible here — presumably minimum hand movement (in normalized image
# coordinates) before pitch/volume is updated; confirm or remove.
PITCH_MOVEMENT_THRESHOLD = 0.005
VOLUME_MOVEMENT_THRESHOLD = 0.02
# Shared state: written by the video loop in start_theremin() and read by the
# audio callback in audio_loop().
current_freq = 440  # target tone frequency (displayed as Hz)
current_volume = 0.5  # target amplitude; the video loop clips it to [0.0, 1.0]
# Set to True to make audio_loop() leave its polling loop and close the stream.
stop_audio = False

# MediaPipe hand-tracking solution module and a single module-level detector
# instance configured for video (non-static) input with up to two hands.
# NOTE: start_theremin() closes this detector when it exits.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2,
                       min_detection_confidence=0.7, min_tracking_confidence=0.7)
# Helper used to overlay the detected hand skeleton on each frame.
mp_draw = mp.solutions.drawing_utils

# Rolling windows of recent index-fingertip coordinates, averaged to smooth
# jitter: x positions of the "Left" hand (pitch) and y positions of the
# "Right" hand (volume).
left_hand_history = deque(maxlen=SMOOTHING_WINDOW)
right_hand_history = deque(maxlen=SMOOTHING_WINDOW)

def map_position_to_freq(x):
    """Linearly map a horizontal position to a tone frequency.

    A position of 0 yields 100 and a position of 1 yields 1100; values
    outside that range are extrapolated, not clamped.
    """
    lowest = 100
    span = 1000
    return lowest + span * x

def map_position_to_volume(y):
    """Invert a vertical position so that smaller y gives a larger volume.

    Returns ``1.0 - y`` without clamping; callers clip the result themselves.
    """
    inverted = 1.0 - y
    return inverted

def draw_indicators(frame, freq, volume, index_tip_left, index_tip_right):
    """Overlay the pitch/volume read-outs and fingertip markers on ``frame``.

    Draws in place on ``frame``:
      * a green tick on the bottom edge at the x position matching ``freq``
        (inverse of the 100 + 1000 * x mapping), with a "Pitch" label;
      * a red tick on the left edge at the y position matching ``volume``,
        with a "Volume" label;
      * a filled green circle at ``index_tip_left`` and a filled red circle at
        ``index_tip_right`` when those are given.

    ``index_tip_left`` / ``index_tip_right`` are ``(x, y)`` pairs that are
    scaled by the frame width and height, or a falsy value to skip the marker.
    """
    rows, cols, _channels = frame.shape
    green = (0, 255, 0)
    red = (0, 0, 255)
    font = cv2.FONT_HERSHEY_SIMPLEX

    # Pitch marker along the bottom edge.
    pitch_col = int((freq - 100) / 1000 * cols)
    cv2.line(frame, (pitch_col, rows - 30), (pitch_col, rows), green, 4)
    pitch_label = f'Pitch: {int(freq)} Hz'
    cv2.putText(frame, pitch_label, (pitch_col + 10, rows - 35), font, 0.7, green, 2)

    # Volume marker along the left edge (louder is higher on screen).
    volume_row = int((1 - volume) * rows)
    cv2.line(frame, (30, volume_row), (0, volume_row), red, 4)
    volume_label = f'Volume: {int(volume * 100)}%'
    cv2.putText(frame, volume_label, (35, volume_row - 10), font, 0.7, red, 2)

    # Fingertip dots: left hand first (green), then right hand (red).
    for tip, colour in ((index_tip_left, green), (index_tip_right, red)):
        if not tip:
            continue
        centre = (int(tip[0] * cols), int(tip[1] * rows))
        cv2.circle(frame, centre, 10, colour, -1)

def _render_tone_block(phase, start_freq, end_freq, start_vol, end_vol, frames):
    """Synthesize one block of a sine tone with linearly ramped pitch and volume.

    The oscillator phase is obtained by accumulating the per-sample phase
    increment ``2*pi*f/SAMPLE_RATE``, so the waveform stays continuous both
    inside the block and across consecutive blocks even while the frequency
    is changing.

    Args:
        phase: Oscillator phase (radians) at the first sample of this block.
        start_freq: Frequency at the first sample.
        end_freq: Frequency at the last sample.
        start_vol: Amplitude at the first sample.
        end_vol: Amplitude at the last sample.
        frames: Number of samples to generate.

    Returns:
        ``(wave, next_phase)`` where ``wave`` is a 1-D array of ``frames``
        samples and ``next_phase`` is the phase, wrapped to ``[0, 2*pi)``,
        at which the following block must start.
    """
    freq_ramp = np.linspace(start_freq, end_freq, frames)
    vol_ramp = np.linspace(start_vol, end_vol, frames)

    # Phase advance contributed by each sample at its instantaneous frequency.
    phase_steps = 2 * np.pi * freq_ramp / SAMPLE_RATE
    # Phase at the start of each sample: sample 0 sits exactly at `phase`.
    sample_phases = phase + np.cumsum(phase_steps) - phase_steps

    wave = np.sin(sample_phases) * vol_ramp
    # Wrap to keep the running phase small and avoid float precision loss.
    next_phase = float(np.mod(phase + phase_steps.sum(), 2 * np.pi))
    return wave, next_phase


def audio_loop():
    """Play a continuous sine tone that follows ``current_freq``/``current_volume``.

    Opens a mono float32 ``sd.OutputStream`` whose callback renders each audio
    block by ramping from the previous block's frequency/volume to the current
    global targets. Blocks (polling every 10 ms) until the global
    ``stop_audio`` flag becomes true, then closes the stream.
    """
    # Oscillator state carried from one callback invocation to the next.
    phase = 0.0
    last_freq = current_freq
    last_vol = current_volume

    def callback(outdata, frames, time_info, status):
        nonlocal phase, last_freq, last_vol

        # Read each shared target exactly once: another thread may update the
        # globals at any time, and the value ramped to in this block must be
        # the same value the next block ramps from.
        target_freq = current_freq
        target_vol = current_volume

        wave, phase = _render_tone_block(
            phase, last_freq, target_freq, last_vol, target_vol, frames
        )
        outdata[:] = wave.reshape(-1, 1).astype(np.float32)

        last_freq = target_freq
        last_vol = target_vol

    with sd.OutputStream(callback=callback, samplerate=SAMPLE_RATE, channels=1, dtype='float32'):
        while not stop_audio:
            time.sleep(0.01)

def start_theremin():
    """Run the webcam theremin until 'q' is pressed or the camera stops.

    Starts ``audio_loop`` on a background thread, then reads frames from the
    default camera, mirrors them, and runs MediaPipe hand tracking on each.
    The smoothed x position of the hand labelled "Left" sets the global
    ``current_freq``; the smoothed y position of the hand labelled "Right"
    sets the global ``current_volume`` (clipped to [0, 1]).

    On exit — whether by key press, camera failure, or an exception — the
    audio thread is signalled to stop and joined, and the camera, windows and
    the module-level hand detector are released.
    """
    global current_freq, current_volume, stop_audio

    stop_audio = False
    left_hand_history.clear()
    right_hand_history.clear()

    audio_thread = threading.Thread(target=audio_loop)
    cap = cv2.VideoCapture(0)

    try:
        audio_thread.start()
        print("Theremin started. Left hand = pitch (green), Right hand = volume (red). Press 'q' to quit.")

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Mirror the image so on-screen movement matches the player's.
            frame = cv2.flip(frame, 1)
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(rgb_frame)

            index_tip_left = None
            index_tip_right = None

            if results.multi_hand_landmarks and results.multi_handedness:
                for hand_landmarks, handedness in zip(results.multi_hand_landmarks, results.multi_handedness):
                    mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                    label = handedness.classification[0].label
                    index_tip = hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP]
                    x, y = index_tip.x, index_tip.y

                    if label == "Left":
                        # Pitch: moving average of the fingertip's x position.
                        left_hand_history.append(x)
                        smoothed_x = np.mean(left_hand_history)
                        current_freq = map_position_to_freq(smoothed_x)
                        index_tip_left = (smoothed_x, y)

                    elif label == "Right":
                        # Volume: moving average of the fingertip's y position.
                        right_hand_history.append(y)
                        smoothed_y = np.mean(right_hand_history)
                        new_volume = map_position_to_volume(smoothed_y)
                        current_volume = np.clip(new_volume, 0.0, 1.0)
                        index_tip_right = (x, smoothed_y)

            draw_indicators(frame, current_freq, current_volume, index_tip_left, index_tip_right)
            cv2.imshow("Virtual Theremin", frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    finally:
        # Always stop the tone, not only on 'q': otherwise a camera failure or
        # an exception would leave the non-daemon audio thread playing forever.
        stop_audio = True
        if audio_thread.is_alive():
            audio_thread.join()
        cap.release()
        cv2.destroyAllWindows()
        # NOTE: this closes the module-level detector; it must be re-created
        # (e.g. by re-running the setup code) before calling this again.
        hands.close()
        print("Theremin stopped.")

if __name__ == "__main__":
    start_theremin()

Theremin started. Left hand = pitch (green), Right hand = volume (red). Press 'q' to quit.
Theremin stopped.
