In [8]:
import os, time, cv2, dlib, numpy as np
from collections import deque, Counter
from IPython.display import display, clear_output
from PIL import Image
from pathlib import Path

import random, subprocess, threading

"""
Here we point to the dlib landmark model file.
Before we do anything else, check that the landmark model file actually exists.
If it doesn't, stop immediately and tell the user what’s missing.
"""

BASE_DIR = Path.cwd()
PREDICTOR_PATH = BASE_DIR / "models" / "shape_predictor_68_face_landmarks.dat"

assert PREDICTOR_PATH.exists(), (
    "Model file not found. Please download "
    "shape_predictor_68_face_landmarks.dat and place it in the models/ folder."
)

"""
Here we create:
1) a face detector (finds faces in an image)
2) a landmark predictor (returns 68 landmark points for a detected face)
"""
detector  = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(str(PREDICTOR_PATH))

"""
Here we store the landmark indices we care about.
Dlib’s 68-point scheme is fixed, so these indexes map to specific facial regions.
"""

IDX = {
    "mouth_left": 48, "mouth_right": 54,
    "mouth_in_top": 62, "mouth_in_bot": 66,
    "eye_L_outer": 36, "eye_R_outer": 45,
    "eye_L": [36,37,38,39,40,41],
    "eye_R": [42,43,44,45,46,47],
    "brow_L": [17,18,19,20,21],
    "brow_R": [22,23,24,25,26],
}

# --- helpers ---
def shape_to_np(shape):
    #Here we convert dlib’s landmark object into a simple array. That makes it easier to do vector math.
    return np.array([(p.x, p.y) for p in shape.parts()], dtype=np.float32)

def dist(a, b):
    # Here we compute Euclidean distance between two 2D points.
    return float(np.linalg.norm(a - b))

def center(pts):
    # Here we get the average position of a group of points (useful for “center of mouth”, for instance)
    return np.mean(pts, axis=0)

def eye_ear(p):
    # Here we compute the Eye Aspect Ratio (EAR).
    # EAR goes down when the eye closes and goes up when the eye opens wide.
    # Formula uses two vertical distances divided by the horizontal distance.
    A = dist(p[1], p[5]); B = dist(p[2], p[4]); C = dist(p[0], p[3])
    return (A + B) / (2.0*C + 1e-6)

# Here we define colors per emotion label
COL = {
    "neutral":   (255,255,255),
    "happy":     (0,255,0),
    "surprised": (0,255,255),
    "sad":       (255,255,0),
    "angry":     (0,0,255),
}

# Here we keep a small pool of possible lines for each label.
# The app picks one randomly so it doesn’t repeat the same sentence every time.
REACTIONS = {
    "happy": [
        "That smile looks good on you.",
        "You look happy right now.",
        "Someone's in a good mood.",
        "Love that energy."
    ],
    "sad": [
        "Oh, you seem sad.",
        "You look a little down. You okay?",
        "You look extra sad. Has anything happened?",
        "That face says I need a hug."
    ],
    "angry": [
        "Whoa, that's an angry face.",
        "You look annoyed. Should I back away?",
        "Okay, okay, I get it. You're mad.",
        "That glare is intense."
    ],
    "surprised": [
        "You look so surprised.",
        "Big eyes. What happened?",
        "That was a real shock face.",
        "Whoa, did I scare you?"
    ],
    "neutral": [
        "You're chill.",
        "Calm mode.",
        "You look focused.",
        "Very composed right now."
    ]
}

"""
Feature meanings:
mw = mouth_width
mo = mouth_open
bg = brow_gap (avg brow-eye vertical distance)
curve = mouth corners vs center (<0 smile, >0 frown)
eye_open = EAR (higher = eyes wider)
dbr = delta_brow: outer minus inner brow gap, positive means inner brows pulled down
"""

# Here we store the last N predicted labels, then take the most common label for smoothing.
# Smoothing reduces flicker when your face is near a threshold.
smooth_labels = deque(maxlen=12)

# Here we track when we last spoke so we don’t spam audio constantly.
# We set the minimum time between comments. The only exception: if the emotion label changes, we allow speaking sooner.
last_spoken_time = 0.0
last_spoken_label = None
cooldown_seconds = 8.0

# Here we keep all emotion thresholds in one place so they are easy to tune.
# These values were tuned to my face and may need adjustment for other users.
TH = {
    "surprise_mo": 0.10, "surprise_bg": 0.24, "surprise_ear": 0.26,
    "angry_bg": 0.24, "angry_ear": 0.32, "angry_mo": 0.10, "angry_dbr": 0.01,
    "happy_curve": -0.05, "happy_mw": 0.60, "happy_mo": 0.28,
    "sad_curve": 0.001, "sad_mw": 0.45, "sad_mo": 0.10,
}


def label_from_feats(mw, mo, bg, curve, eye_open, dbr):
    # Here we map face features -> an emotion label using simple rules.

    # surprised
    # wide mouth + brows raised + eyes wide
    if mo > TH["surprise_mo"] and bg > TH["surprise_bg"] and eye_open > TH["surprise_ear"]:
        return "surprised"
    
    # angry
    # lowered brows + narrower eyes + closed mouth
    # OR furrowed brow signal (dbr) with a closed mouth.
    if (bg < TH["angry_bg"] and eye_open < TH["angry_ear"] and mo < TH["angry_mo"]) or \
       (dbr > TH["angry_dbr"] and mo < TH["angry_mo"]):
        return "angry"
    
    # happy
    # smile curve (corners up) + wide mouth + not too open
    if curve < TH["happy_curve"] and mw > TH["happy_mw"] and mo < TH["happy_mo"]:
        return "happy"
    
    # sad
    # corners down + narrower mouth + closed mouth
    if curve > TH["sad_curve"] and mw < TH["sad_mw"] and mo < TH["sad_mo"]:
        return "sad"
    
    # neutral
    # Here we fall back to neutral if nothing matches.
    return "neutral"


def speak_async(sentence):
    # Here we speak using macOS `say` without blocking the camera loop.
    # We run `say` in a background thread so the main loop stays real-time.
    def _do_say(s):
        subprocess.call(["say", s])
    threading.Thread(target=_do_say, args=(sentence,), daemon=True).start()

def maybe_say(label):
    global last_spoken_time, last_spoken_label

    now = time.time()

    # if you dislike neutral chatter, you can uncomment this:
    # if label == "neutral":
    #     return
    
    # Here we allow speech if:
    # - enough time passed since last comment, OR
    # - the emotion label changed (so we can react quickly to a new expression).
    time_ok = (now - last_spoken_time) > cooldown_seconds

    changed_ok = (label != last_spoken_label)
    
    # Here we pick a random sentence from the pool and speak it.
    if (time_ok or changed_ok) and (label in REACTIONS):
        sentence = random.choice(REACTIONS[label])
        speak_async(sentence)
        last_spoken_time = now
        last_spoken_label = label



In [9]:
# Here we open the default camera (index 0).
# On macOS, CAP_AVFOUNDATION tends to be the most reliable backend for OpenCV.
cap = cv2.VideoCapture(0, cv2.CAP_AVFOUNDATION)
print("opened:", cap.isOpened())

try:
    # Here we do one warm-up read.
    # Some webcams return an empty/low-quality first frame while exposure/focus settles.
    ok, frame = cap.read()
    print("got first frame:", ok)

    while True:
        ok, frame = cap.read()
        if not ok:
            # Instead of breaking (which kills the loop),
            # Just skip this iteration and try again next tick
            # Sometimes mac returns a bad frame when focus changes
            time.sleep(0.01)
            continue

        # Mirror like a selfie. Purely stylistic choice.
        frame = cv2.flip(frame, 1)
        
        # Here we convert to grayscale because dlib's detector works on single-channel images.
        gray  = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Here we detect faces. The second argument is the upsample count:
        # 0 is faster, higher values can detect smaller faces but cost more CPU.
        rects = detector(gray, 0)

        for rect in rects:
            # Here we extract the bounding box and draw it for visual debugging.
            x1,y1,x2,y2 = rect.left(), rect.top(), rect.right(), rect.bottom()
            cv2.rectangle(frame, (x1,y1), (x2,y2), (0,255,0), 2)

            # 68 landmarks
            pts = shape_to_np(predictor(gray, rect))

            # Normalize by distance between outer eye corners
            span = dist(pts[IDX["eye_L_outer"]], pts[IDX["eye_R_outer"]]) + 1e-6

            # Features

            # Here we compute mouth metrics:
            # - mouth_width: smile/grin tends to widen the mouth
            # - mouth_open: surprise/talking tends to increase inner lip distance
            mouth_width = dist(pts[IDX["mouth_left"]], pts[IDX["mouth_right"]]) / span
            mouth_open  = dist(pts[IDX["mouth_in_top"]], pts[IDX["mouth_in_bot"]]) / span

            # Here we compute eye openness using EAR (Eye Aspect Ratio).
            # EAR drops when the eye closes and rises when the eye opens wide.
            eyeL_pts, eyeR_pts = pts[36:42], pts[42:48]
            earL, earR = eye_ear(eyeL_pts), eye_ear(eyeR_pts)
            eye_open = 0.5*(earL + earR)

            # Here we compute approximate centers for eyes and brows.
            # We use these to estimate "brow gap" (raised brows vs lowered brows).
            eyeL_c, eyeR_c   = center(pts[IDX["eye_L"]]), center(pts[IDX["eye_R"]])
            browL_c, browR_c = center(pts[IDX["brow_L"]]), center(pts[IDX["brow_R"]])

            # Here we estimate brow raise/lower by measuring vertical distance from brow to eye.
            # Higher values typically mean "raised brows" (surprise), lower values mean "compressed brows".
            brow_gap = 0.5 * (
                abs(browL_c[1] - eyeL_c[1]) / span +
                abs(browR_c[1] - eyeR_c[1]) / span
            )

            # Here we measure "brow compression" for an anger-ish signal.
            # We compare inner brow gap vs outer brow gap:
            # - inner brow pulling down tends to shrink the inner gap more than the outer gap
            inner_gap = (abs(pts[21,1]-eyeL_c[1]) + abs(pts[22,1]-eyeR_c[1]))/(2*span)
            outer_gap = (abs(pts[17,1]-eyeL_c[1]) + abs(pts[26,1]-eyeR_c[1]))/(2*span)
            delta_brow = outer_gap - inner_gap

            # Mouth curvature: <0 smile, >0 frown
            center_y  = 0.5*(pts[51,1] + pts[57,1])
            corners_y = 0.5*(pts[48,1] + pts[54,1])
            curve     = (corners_y - center_y) / span

            # classify
            # Here we classify the current frame using the heuristic rules.
            raw = label_from_feats(
                mouth_width, mouth_open, brow_gap,
                curve, eye_open, delta_brow
            )

            # Here we smooth the output across multiple frames.
            # We store the last N labels and take the most common one to reduce flicker.
            smooth_labels.append(raw)
            lbl = Counter(smooth_labels).most_common(1)[0][0]

            # Speak if needed (async)
            maybe_say(lbl)

            # Here we draw the final label above the face box.
            cv2.putText(frame, lbl, (x1, y1-10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, COL[lbl], 2)
            
            # Here we display the live feature values so threshold tuning is easier.
            cv2.putText(
                frame,
                f"w:{mouth_width:.2f} o:{mouth_open:.2f} b:{brow_gap:.2f} ear:{eye_open:.2f} Δb:{delta_brow:.2f}",
                (10,50),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.7,
                (255,255,255),
                2
            )
            # Here we draw all 68 landmarks as small dots (visual sanity check).
            for (px,py) in pts.astype(int):
                cv2.circle(frame, (px,py), 2, (0,255,255), -1)

        # Display face count
        cv2.putText(frame, f"faces: {len(rects)}", (10,25),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255,255,255), 2)

        # Show in notebook
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        clear_output(wait=True)
        display(Image.fromarray(rgb))

        # Tiny sleep so notebook doesn't get spammed
        time.sleep(0.005)

finally:
    # Here we always release the camera, even if the loop crashes or the notebook is interrupted.
    cap.release()
    cv2.destroyAllWindows()
    print("camera released")

camera released


KeyboardInterrupt: 