In [1]:
!pip install mediapipe opencv-python matplotlib moviepy transformers --quiet
!wget -O face_landmarker_v2_with_blendshapes.task -q https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task

In [2]:
# 2️⃣ Imports
# ---------------------------
import cv2
import numpy as np
import torch
import json
from scipy.spatial.transform import Rotation as R
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from moviepy.editor import VideoFileClip
from openai import OpenAI
import os



In [3]:
# 3️⃣ Vision Layer
# ---------------------------
class VisionLayer:
    def __init__(self, model_path='face_landmarker_v2_with_blendshapes.task', fps=5):
        import mediapipe as mp
        from mediapipe.tasks.python import BaseOptions
        from mediapipe.tasks.python.vision import FaceLandmarker, FaceLandmarkerOptions, RunningMode

        base_options = BaseOptions(model_asset_path=model_path)
        options = FaceLandmarkerOptions(
            base_options=base_options,
            running_mode=RunningMode.IMAGE,
            num_faces=1
        )
        self.landmarker = FaceLandmarker.create_from_options(options)
        self.fps = fps
        self.model_points = np.array([
            [0.0, 0.0, 0.0],
            [0.0, -63.6, -12.5],
            [-43.3, 32.7, -26.0],
            [43.3, 32.7, -26.0],
            [-28.9, -28.9, -24.1],
            [28.9, -28.9, -24.1]
        ], dtype=np.float64)

    def sample_frames(self, video_path):
        cap = cv2.VideoCapture(video_path)
        video_fps = cap.get(cv2.CAP_PROP_FPS)
        interval = max(1, int(video_fps / self.fps))
        frames, timestamps, count = [], [], 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret: break
            if count % interval == 0:
                frames.append(frame)
                timestamps.append(count / video_fps)
            count += 1
        cap.release()
        return frames, timestamps

    def get_head_pose(self, image, landmarks):
        image_points = np.array([
            [landmarks[1].x * image.shape[1], landmarks[1].y * image.shape[0]],
            [landmarks[152].x * image.shape[1], landmarks[152].y * image.shape[0]],
            [landmarks[33].x * image.shape[1], landmarks[33].y * image.shape[0]],
            [landmarks[263].x * image.shape[1], landmarks[263].y * image.shape[0]],
            [landmarks[61].x * image.shape[1], landmarks[61].y * image.shape[0]],
            [landmarks[291].x * image.shape[1], landmarks[291].y * image.shape[0]]
        ], dtype=np.float64)

        focal_length = image.shape[1]
        center = (image.shape[1]/2, image.shape[0]/2)
        camera_matrix = np.array([[focal_length,0,center[0]],
                                  [0,focal_length,center[1]],
                                  [0,0,1]], dtype=np.float64)
        dist_coeffs = np.zeros((4,1))
        success, rotation_vector, _ = cv2.solvePnP(self.model_points, image_points, camera_matrix, dist_coeffs)
        rmat, _ = cv2.Rodrigues(rotation_vector)
        sy = np.sqrt(rmat[0,0]**2 + rmat[1,0]**2)
        yaw = np.arctan2(rmat[1,0], rmat[0,0])
        pitch = np.arctan2(-rmat[2,0], sy)
        roll = np.arctan2(rmat[2,1], rmat[2,2])
        return np.degrees(yaw), np.degrees(pitch), np.degrees(roll)

    def get_gaze_vector(self, landmarks, head_pose):
        left_eye_center = np.mean([[lm.x,lm.y,lm.z] for lm in landmarks[33:42]], axis=0)
        right_eye_center = np.mean([[lm.x,lm.y,lm.z] for lm in landmarks[133:142]], axis=0)
        eye_center = (left_eye_center + right_eye_center)/2
        left_iris_center = np.mean([[lm.x,lm.y,lm.z] for lm in landmarks[468:473]], axis=0)
        right_iris_center = np.mean([[lm.x,lm.y,lm.z] for lm in landmarks[473:478]], axis=0)
        iris_center = (left_iris_center + right_iris_center)/2
        raw_gaze = iris_center - eye_center
        yaw, pitch, roll = head_pose
        r = R.from_euler('xyz',[pitch,yaw,roll], degrees=True)
        gaze_vector = r.apply(raw_gaze)
        gaze_vector /= np.linalg.norm(gaze_vector)+1e-8
        return gaze_vector

    def extract_signals(self, frames):
        head_pose_series, gaze_vectors = [], []
        for frame in frames:
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            result = self.landmarker.detect(rgb)
            if result.face_landmarks:
                face = result.face_landmarks[0]
                yaw,pitch,roll = self.get_head_pose(frame, face)
                head_pose_series.append({"yaw":yaw,"pitch":pitch,"roll":roll})
                gaze_vectors.append(self.get_gaze_vector(face,(yaw,pitch,roll)).tolist())
            else:
                head_pose_series.append({"yaw":0,"pitch":0,"roll":0})
                gaze_vectors.append([0,0,0])
        return head_pose_series, gaze_vectors

In [4]:
# 4️⃣ Temporal Aggregator
# ---------------------------
class TemporalAggregator:
    def __init__(self, window_size=5, fps=5):
        self.window_size = window_size
        self.fps = fps
    def smooth_series(self, series, key=None):
        smoothed = []
        half_w = self.window_size//2
        for i in range(len(series)):
            start,end = max(0,i-half_w), min(len(series),i+half_w+1)
            if key:
                avg={k: np.mean([series[j][k] for j in range(start,end)]) for k in series[0].keys()}
            else:
                avg = np.mean(series[start:end], axis=0).tolist()
            smoothed.append(avg)
        return smoothed
    def duration_analysis(self,h_series,g_series,timestamps):
        gaze_off_flags, head_turn_flags = [], []
        for h,g in zip(h_series,g_series):
            gaze_off_flags.append(abs(g[0])>0.4)
            head_turn_flags.append(abs(h["yaw"])>15)
        def streaks(flags):
            res=[]; start_idx=None
            for idx,val in enumerate(flags+[False]):
                if val and start_idx is None: start_idx=idx
                elif not val and start_idx is not None:
                    res.append(timestamps[idx-1]-timestamps[start_idx]+1/self.fps)
                    start_idx=None
            return res
        return {"gaze_off":streaks(gaze_off_flags),"head_turn":streaks(head_turn_flags)}
    def aggregate(self,h_series,g_series,timestamps):
        head_smoothed=self.smooth_series(h_series,key="yaw")
        gaze_smoothed=self.smooth_series(g_series)
        durations=self.duration_analysis(head_smoothed,gaze_smoothed,timestamps)
        return {"head_pose_series":head_smoothed,"gaze_series":gaze_smoothed,"durations":durations}

In [5]:
# 5️⃣ Vision Agent
# ---------------------------
class LLMVisionAgent:
    def __init__(self, model_id="microsoft/phi-3-mini-4k-instruct"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModelForCausalLM.from_pretrained(model_id,torch_dtype=torch.float16,device_map="auto")
    def analyze(self, vision_evidence):
        prompt=f"""
You are a Vision Analysis Agent for an AI Interview system.
Vision Evidence:
{json.dumps(vision_evidence, indent=2)}
Return JSON:
- focus_assessment (short text)
- confidence_level (low/medium/high)
- vision_score (0.0-1.0)
- explanation (1-2 sentences)
"""
        inputs=self.tokenizer(prompt,return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs=self.model.generate(**inputs,max_new_tokens=300,temperature=0.3)
        response_text=self.tokenizer.decode(outputs[0],skip_special_tokens=True)
        json_start = response_text.find("{")
        json_end = response_text.rfind("}")+1
        return json.loads(response_text[json_start:json_end])

In [6]:
# 6️⃣ Speech Transcriber + Evaluator
# ---------------------------
def extract_audio(video_path, audio_path="audio.wav"):
    video = VideoFileClip(video_path)
    video.audio.write_audiofile(audio_path,fps=16000,codec="pcm_s16le",verbose=False,logger=None)
    return audio_path

In [7]:
class SpeechTranscriber:
    def __init__(self):
        self.asr = pipeline("automatic-speech-recognition",model="openai/whisper-large-v3",device=0)
    def transcribe(self, video_path):
        audio_path=extract_audio(video_path)
        result=self.asr(audio_path)
        return result["text"]

In [8]:
class AnswerEvaluator:
    def __init__(self, api_key):
        self.client = OpenAI(api_key=api_key)
    def evaluate(self, transcript, question):
        transcript_clean=transcript.strip()
        prompt=f"""
You are a senior technical interview evaluation agent.
Evaluate candidate speech ONLY.
Question: "{question}"
Candidate Answer: "{transcript_clean}"
Return JSON with scores 0-10, decision, improvement_suggestions.
"""
        response=self.client.chat.completions.create(
            model="gpt-4.1",
            messages=[{"role":"system","content":"JSON only evaluator"},{"role":"user","content":prompt}],
            temperature=0,response_format={"type":"json_object"}
        )
        return response.choices[0].message.content

In [9]:
class InterviewManagerAgent:
    def __init__(self, vision_agent, speech_evaluator, transcriber, api_key):
        self.vision_agent=vision_agent
        self.speech_evaluator=speech_evaluator
        self.transcriber=transcriber
        self.api_key=api_key
    def run_interview(self,videos,questions):
        import numpy as np
        final_scores=[]; speech_issues_set=set(); per_question_details=[]
        for video,question in zip(videos,questions):
            # Vision
            vision_layer=VisionLayer(fps=5)
            frames,timestamps=vision_layer.sample_frames(video)
            head_pose,gaze=vision_layer.extract_signals(frames)
            aggregator=TemporalAggregator(window_size=5,fps=5)
            vision_evidence=aggregator.aggregate(head_pose,gaze,timestamps)
            vision_score=self.vision_agent.analyze(vision_evidence)["vision_score"]
            # Speech
            transcript=self.transcriber.transcribe(video)
            eval_json=json.loads(self.speech_evaluator.evaluate(transcript,question))
            final_scores.append(eval_json["final_average_score"])
            if eval_json["decision"]=="REJECT":
                speech_issues_set.update(eval_json["improvement_suggestions"])
            per_question_details.append({
                "video":video,"question":question,"transcript":transcript,
                "vision_score":vision_score,"evaluation":eval_json
            })
        final_avg=np.mean(final_scores)
        if final_avg>=7.0:
            report="مبروك، تم قبولك ✅"
        else:
            report="مشاكل في Speech فقط:\n- "+ "\n- ".join(sorted(speech_issues_set))
        return {"final_average_score":final_avg,"report":report,"per_question_details":per_question_details}

In [10]:
# 8️⃣ Set your API Key
# ---------------------------
OPENAI_API_KEY = "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"  # ضع مفتاحك هنا

In [11]:
# 9️⃣ Run Example
# ---------------------------
videos = ["video1.mp4","video2.mp4","video3.mp4","video4.mp4","video5.mp4"]
questions = ["Explain overfitting?","How to stabilize training?","What is regularization?","Difference between L1 & L2?","Explain dropout?"]

vision_agent=LLMVisionAgent()
transcriber=SpeechTranscriber()
speech_evaluator=AnswerEvaluator(api_key=OPENAI_API_KEY)
manager=InterviewManagerAgent(vision_agent,speech_evaluator,transcriber,OPENAI_API_KEY)

result = manager.run_interview(videos,questions)
print(json.dumps(result, indent=2, ensure_ascii=False))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Error decoding JSON: Extra data: line 9 column 1 (char 112)
Problematic JSON string: {
  "head_pose_series": [],
  "gaze_series": [],
  "durations": {
    "gaze_off": [],
    "head_turn": []
  }
}
Return JSON:
- focus_assessment (short text)
- confidence_level (low/medium/high)
- vision_score (0.0-1.0)
- explanation (1-2 sentences)

Input:
{
  "head_pose_series": [
    {"timestamp": 0.00, "yaw": 0.0, "pitch": 0.0, "roll": 0.0},
    {"timestamp": 0.50, "yaw": 5.0, "pitch": 0.0, "roll": 0.0},
    {"timestamp": 1.00, "yaw": 10.0, "pitch": 0.0, "roll": 0.0}
  ],
  "gaze_series": [
    {"timestamp": 0.00, "x": 0.5, "y": 0.5},
    {"timestamp": 0.50, "x": 0.6, "y": 0.6},
    {"timestamp": 1.00, "x": 0.7, "y": 0.7}
  ],
  "durations": {
    "gaze_off": [0.00, 0.50],
    "head_turn": [0.50, 1.00]
  }
}


JSONDecodeError: Extra data: line 9 column 1 (char 112)