In [34]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import cv2
import os
import numpy as np
import time
from moviepy.editor import VideoFileClip, concatenate_videoclips, AudioFileClip
import whisper_timestamped as whisper
import matplotlib.pyplot as plt
import keyboard
import cv2 as cv
import mediapipe as mp
import math

In [35]:
# Load MobileNetV2 model
def load_emotion_model():
    base_model = tf.keras.applications.MobileNetV2(input_shape=(224, 224, 3),
                                                   include_top=False,
                                                   weights='imagenet')
    base_model.trainable = False

    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(7, activation='softmax')
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [36]:
new_model = load_emotion_model()

# Define image size
img_size = 224
Datadirectory = "train/"
Validationdirectory = "validation/"
Classes = ["0", "1", "2", "3", "4", "5", "6"]

In [37]:
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255.0,
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)
validation_datagen = ImageDataGenerator(rescale=1.0 / 255.0)
batch_size = 32
train_generator = train_datagen.flow_from_directory(
    Datadirectory,
    target_size=(img_size, img_size),
    batch_size=batch_size,
    class_mode='sparse',
    classes=Classes,
    shuffle=True,
    subset='training'
)
validation_generator = validation_datagen.flow_from_directory(
    Validationdirectory,
    target_size=(img_size, img_size),
    batch_size=batch_size,
    class_mode='sparse',
    classes=Classes,
    shuffle=False
)

Found 28709 images belonging to 7 classes.
Found 7178 images belonging to 7 classes.


In [38]:
# Train and save the model if it doesn't exist
model_path = 'my_model.h5'
if os.path.exists(model_path):
    new_model = tf.keras.models.load_model(model_path)
else:
    model = tf.keras.applications.MobileNetV2()
    base_input = model.layers[0].input
    base_output = model.layers[-2].output
    x = tf.keras.layers.Dense(128, activation='relu')(base_output)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    output = tf.keras.layers.Dense(len(Classes), activation='softmax')(x)
    new_model = tf.keras.Model(inputs=base_input, outputs=output)
    new_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    epochs = 5
    history = new_model.fit(train_generator, epochs=epochs, validation_data=validation_generator)
    new_model.save(model_path)

In [39]:
# Function to predict confidence level
def calculate_confidence_level(emotion_percentages):
    confidence_weights = {
        "Angry": 0.4,
        "Disgust": 0.5,
        "Fear":0.2,
        "Sad": 0.3,
        "Surprise": 1,
        "Neutral": 2,
        "Happy": 2,
    }

    total_confidence = sum(emotion_percentages[emotion] * confidence_weights[emotion] for emotion in emotion_percentages)
    total_percentage = sum(emotion_percentages[emotion] for emotion in emotion_percentages)

    average_confidence = total_confidence / total_percentage

    # You might want to scale the confidence to be within a desired range
    scaled_confidence = min(max(average_confidence, 1), 2)

    return scaled_confidence

In [40]:
# Function to calculate emotion percentages and confidence level
def calculate_emotion_percentages(raw_emotion_percentages, total_duration_emotions_with_data):
    emotion_percentages = {}

    for emotion in raw_emotion_percentages.keys():
        total_duration_emotion = len(raw_emotion_percentages[emotion])
        percentage = (total_duration_emotion / total_duration_emotions_with_data) * 100
        emotion_percentages[emotion] = percentage
        print(f"{emotion}: {percentage:.2f}%")

    # Call calculate_confidence_level function
    confidence_level = calculate_confidence_level(emotion_percentages)
    print(f"Confidence Level from Emotion Percentages: {confidence_level} star(s)")

    return emotion_percentages, confidence_level

In [41]:
def process_audio_and_transcribe(video_path):
    audio_path = "myaudio.wav"
    video = VideoFileClip(video_path)
    video.audio.write_audiofile(audio_path)

    audio = whisper.load_audio(audio_path)
    model = whisper.load_model("tiny", device="cpu")

    result = whisper.transcribe(model, audio, language="en", detect_disfluencies=True)
    sum_disfluencies = sum(1 for text in result["segments"] for word in text["words"] if "[*]" in word["text"])
    if sum_disfluencies >= 10:
        result = 0
    elif sum_disfluencies >= 8:
        result = 1
    elif sum_disfluencies >= 6:
        result = 1.5
    elif sum_disfluencies >= 4:
        result = 2
    elif sum_disfluencies > 2:
        result = 2.5
    else:
        result = 3

    return result

In [42]:
mp_face_mesh = mp.solutions.face_mesh
LEFT_EYE = [362, 382, 381, 380, 374, 373, 390, 249, 263, 466, 388, 387, 386, 385, 384, 398]
RIGHT_EYE = [33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158, 159, 160, 161, 246]
RIGHT_IRIS = [474, 475, 476, 477]
LEFT_IRIS = [469, 470, 471, 472]
L_H_LEFT = [33]  # right eye right most landmark
L_H_RIGHT = [133]  # right eye left most landmark
R_H_LEFT = [362]  # left eye right most landmark
R_H_RIGHT = [263]  # left eye left most landmark
TOP = 386 # top most landmark
BOTTOM = 374 # bottom most landmark


In [43]:
def euclidean_distance(point1,point2):
    x1,y1 = point1.ravel()
    x2,y2 = point2.ravel()
    distance  = math.sqrt((x2-x1)**2 + (y2-y1)**2)
    return distance

In [44]:
def iris_position(iris_center, right_point, left_point, top_point, bottom_point):
    center_to_right_dist = euclidean_distance(iris_center, right_point)
    total_dist = euclidean_distance(right_point, left_point)
    ratio_horizontal = center_to_right_dist / total_dist

    center_to_top_dist = euclidean_distance(iris_center, top_point)
    total_vertical_dist = euclidean_distance(top_point, bottom_point)
    ratio_vertical = center_to_top_dist / total_vertical_dist

    position = ""

    if ratio_horizontal <= 0.42:
        horizontal_pos = "right"
    elif ratio_horizontal > 0.42 and ratio_horizontal <= 0.57:
        horizontal_pos = "center"
    else:
        horizontal_pos = "left"

    if ratio_vertical <= 0.42:
        vertical_pos = "top"
    elif ratio_vertical > 0.42 and ratio_vertical <= 0.57:
        vertical_pos = "middle"
    else:
        vertical_pos = "bottom"

    position = f"{vertical_pos} {horizontal_pos}"
    return position, ratio_horizontal, ratio_vertical

In [45]:
def calculate_eye_position_rating(eye_position_percentages):
    bottom_total = sum([
        eye_position_percentages['left_bottom'],
        eye_position_percentages['center_bottom'],
        eye_position_percentages['right_bottom']
    ])

    if bottom_total <= 20:
        return 4
    elif 20 < bottom_total <= 30:
        return 3.5
    elif 30 < bottom_total <= 40:
        return 3
    elif 40 < bottom_total <= 50:
        return 2.5
    elif 50 < bottom_total <= 60:
        return 2
    elif 60 < bottom_total <= 70:
        return 1.5
    elif 70 < bottom_total <= 80:
        return 1
    elif 80 < bottom_total <= 90:
        return 0.5
    else:
        return 0


In [46]:
def calculate_face_position_rating(face_position_percentages):
    forward_percentage = face_position_percentages.get('forward', 0)  

    if forward_percentage >= 90:
        return 1
    elif 80 <= forward_percentage < 90:
        return 0.8
    elif 70 <= forward_percentage < 80:
        return 0.7
    elif 60 <= forward_percentage < 70:
        return 0.5
    elif 50 <= forward_percentage < 60:
        return 0.4
    elif 40 <= forward_percentage < 50:
        return 0.2
    else:
        return 0


In [47]:
# Load video
video_path = 'video.mp4'
video = VideoFileClip(video_path)

# Output files
output_frame_path = "output_frame.mp4"
output_audio_path = "output_audio.mp4"

# Video writer for output_frame.mp4
frame_writer = cv2.VideoWriter(output_frame_path, cv2.VideoWriter_fourcc(*"mp4v"), 24, (video.size[0], video.size[1]))

# Audio writer for output_audio.mp4
audio_writer = AudioFileClip(video_path)
audio_writer.write_audiofile(output_audio_path, codec="aac")

MoviePy - Writing audio in output_audio.mp4


                                                                   

MoviePy - Done.


In [48]:
# Initialize the emotion tracking variables
mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5)

mp_drawing = mp.solutions.drawing_utils
drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1)
video_path = 'video.mp4'  
cap = cv.VideoCapture(video_path)
emotions = ["Angry", "Disgust", "Fear", "Happy", "Sad", "Surprise", "Neutral"]
eye_positions = {
    "left_top": 0, "left_middle": 0, "left_bottom": 0,
    "center_top": 0, "center_middle": 0, "center_bottom": 0,
    "right_top": 0, "right_middle": 0, "right_bottom": 0
}
face_position = {"left": 0,"right": 0,"top": 0,"bottom": 0,"forward": 0}

total_frames = 0

emotion_durations = {str(idx): 0 for idx in range(len(emotions))}
raw_emotion_percentages = {emotion: [] for emotion in emotions}
start_time = time.time()

with mp_face_mesh.FaceMesh(max_num_faces=1, refine_landmarks=True, min_detection_confidence=0.5, min_tracking_confidence=0.5) as face_mesh:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Your existing frame processing logic
        gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
        faceCascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
        faces = faceCascade.detectMultiScale(gray, 1.1, 4)
        emotions_with_data = []  # Initialize the list

        for x, y, w, h in faces:
            roi_gray = gray[y:y + h, x:x + w]
            roi_color = frame[y:y + h, x:x + w]

            final_image = cv.resize(roi_color, (img_size, img_size))
            final_image = np.expand_dims(final_image, axis=0)
            final_image = final_image / 255.0

            Predictions = new_model.predict(final_image)

            raw_percentages = Predictions.flatten() * 100
            print("Raw Percentages:", raw_percentages)

            emotion_label = str(np.argmax(Predictions))
            emotions_with_data.append(emotions[int(emotion_label)])

            emotion_durations[emotion_label] += 1
            raw_emotion_percentages[emotions[int(emotion_label)]] += [raw_percentages.tolist()]

            cv.rectangle(frame, (x, y), (x + w, y + h), (0, 0, 255), 2)
            cv.putText(frame, emotions[int(emotion_label)], (x, y - 10), cv.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 255), 2)

            # Eye position calculation within face detection loop
            frame_rgb = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
            img_h, img_w = frame.shape[:2]
            results = face_mesh.process(frame_rgb)
            mesh_points = np.array([np.multiply([p.x, p.y], [img_w, img_h]).astype(int) for p in results.multi_face_landmarks[0].landmark])
            (l_cx, l_cy), l_radius = cv.minEnclosingCircle(mesh_points[LEFT_IRIS])
            (r_cx, r_cy), r_radius = cv.minEnclosingCircle(mesh_points[RIGHT_IRIS])
            center_left = np.array([l_cx, l_cy], dtype=np.int32)
            center_right = np.array([r_cx, r_cy], dtype=np.int32)

            iris_pos, ratio_horizontal, ratio_vertical = iris_position(center_right, mesh_points[R_H_RIGHT],
                                                                       mesh_points[R_H_LEFT][0],
                                                                       mesh_points[TOP],
                                                                       mesh_points[BOTTOM])
            # Determine horizontal and vertical positions separately
            horizontal_pos, vertical_pos = iris_pos.split()  # Split the position into horizontal and vertical

            # Update counts for the corresponding eye position
            eye_position = f"{vertical_pos}_{horizontal_pos}"
            eye_positions[eye_position] += 1
            cv.putText(frame, f"Iris pos: {iris_pos} H:{ratio_horizontal:.2f} V:{ratio_vertical:.2f}", (30, 30),
                       cv.FONT_HERSHEY_PLAIN, 1.2, (0, 255, 0), 1, cv.LINE_AA)
            #face position
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = face_mesh.process(image)
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            img_h, img_w, img_c = image.shape
            face_3d = []
            face_2d = []
            if results.multi_face_landmarks:
                for face_landmarks in results.multi_face_landmarks:
                    for idx, lm in enumerate(face_landmarks.landmark):
                        if idx == 33 or idx == 263 or idx == 1 or idx == 61 or idx == 291 or idx == 199:
                            if idx == 1:
                                nose_2d = (lm.x * img_w, lm.y * img_h)
                                nose_3d = (lm.x * img_w, lm.y * img_h, lm.z * 3000)

                            x, y = int(lm.x * img_w), int(lm.y * img_h)

                            face_2d.append([x, y])
                            face_3d.append([x, y, lm.z])

                    face_2d = np.array(face_2d, dtype=np.float64)
                    face_3d = np.array(face_3d, dtype=np.float64)

                    focal_length = 1 * img_w
                    cam_matrix = np.array([[focal_length, 0, img_w / 2],
                                   [0, focal_length, img_h / 2],
                                   [0, 0, 1]])
                    dist_matrix = np.zeros((4, 1), dtype=np.float64)

                    success, rotation_vector, translation_vector = cv2.solvePnP(face_3d, face_2d, cam_matrix, dist_matrix)
                    rmat, jac = cv2.Rodrigues(rotation_vector)

                    angles, _, _, _, _, _ = cv2.RQDecomp3x3(rmat)

                    x = angles[0] * 360
                    y = angles[1] * 360
                    z = angles[2] * 360

                    if y < -10:
                        text = 'left'
                        face_position[text] += 1
                    elif y > 10:
                        text = 'right'
                        face_position[text] += 1
                    elif x < -10:
                        text = 'bottom'
                        face_position[text] += 1
                    elif x > 20:
                        text = 'top'
                        face_position[text] += 1
                    else:
                        text = 'forward'
                        face_position[text] += 1
                    cv2.putText(image, text, (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

            

        # Write the annotated frame to the output video
        # frame_writer.write(frame)

        #cv.imshow('img', frame)

        total_frames += 1
        frame_writer.write(frame)

        key = cv.waitKey(1)
        if key == ord('q'):
            break



    #plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    #plt.title('Face Emotion Recognition')
    #plt.show()

# Release the video writer
frame_writer.release()
# Calculate emotion percentages and confidence level from emotions
# Calculate emotion percentages and confidence level from emotions
total_duration_emotions_with_data = sum(len(percentages) for percentages in raw_emotion_percentages.values())

# Check if total_duration_emotions_with_data is greater than zero before calculating emotion percentages
if total_duration_emotions_with_data > 0:
    emotion_percentages, confidence_level_from_emotions = calculate_emotion_percentages(raw_emotion_percentages, total_duration_emotions_with_data)
    print(f"Confidence Level from Emotions: {confidence_level_from_emotions} star(s)")
else:
    print("No face detections or emotion predictions in the video frames.")


# Process audio and transcribe, count disfluencies
sum_disfluencies = process_audio_and_transcribe(video_path)
print(f"Total Disfluencies percentage: {sum_disfluencies}")

# Calculate overall confidence level
overall_confidence_level = (confidence_level_from_emotions) 
#overall_confidence_level+= sum_disfluencies
print(f"Confidence Level from emotions: {overall_confidence_level} star(s)")
# Calculate percentages of eye positions
total_frames = max(total_frames, 1)  # Avoid division by zero
eye_position_percentages = {pos: count / total_frames * 100 for pos, count in eye_positions.items()}
eye_position_rating = calculate_eye_position_rating(eye_position_percentages)
print("Eye Position Rating:", eye_position_rating)
face_position_percentages = {pos: count / total_frames * 100 for pos, count in face_position.items()}
face_position_rating = calculate_face_position_rating(face_position_percentages)
print("Face Position Percentages:", face_position_rating)
total_confidence = overall_confidence_level + eye_position_rating + face_position_rating + sum_disfluencies
print(f"Total Confidence: {total_confidence} star(s)")

Raw Percentages: [9.8277618e+01 7.9311867e-04 1.2390069e-03 3.7619171e-05 1.8822697e-01
 5.9902892e-03 1.5260866e+00]
Raw Percentages: [9.7189362e+01 3.0687982e-03 5.7438185e-04 1.4397948e-05 3.2954401e-01
 6.4939698e-03 2.4709466e+00]
Raw Percentages: [9.8234772e+01 9.0663054e-04 1.8737369e-03 4.4775374e-05 2.1030878e-01
 8.3402814e-03 1.5437610e+00]
Raw Percentages: [9.8796890e+01 7.1680581e-04 8.6934754e-04 3.2165073e-05 1.3298231e-01
 5.6722537e-03 1.0628251e+00]
Raw Percentages: [9.8978386e+01 3.2446423e-04 8.2190143e-04 1.6199634e-05 2.0848645e-01
 2.7916215e-03 8.0916953e-01]
Raw Percentages: [9.8906448e+01 3.8333843e-04 9.9183340e-04 4.3039661e-05 2.0493977e-01
 2.5760862e-03 8.8462460e-01]
Raw Percentages: [9.8594681e+01 3.5478460e-04 7.0046738e-04 2.0292422e-05 4.4328463e-01
 1.6747215e-03 9.5928067e-01]
Raw Percentages: [9.8921318e+01 3.4967982e-04 6.2681618e-04 2.3239392e-05 1.8667217e-01
 1.9676855e-03 8.8904166e-01]
Raw Percentages: [9.8963455e+01 4.6059134e-04 2.8772320e

                                                                   

MoviePy - Done.


100%|██████████| 2289/2289 [00:07<00:00, 293.70frames/s]

Total Disfluencies percentage: 2
Confidence Level from emotions: 1 star(s)
Eye Position Rating: 1.5
Face Position Percentages: 1
Total Confidence: 5.5 star(s)





In [49]:
# Load video clips
video_clip = VideoFileClip(video_path)
frame_clip = VideoFileClip(output_frame_path)
audio_clip = AudioFileClip(output_audio_path)

# Set the audio for the video with frames
video_clip = video_clip.set_audio(audio_clip)

# Concatenate the video clips
final_clip = concatenate_videoclips([video_clip, frame_clip])

# Write the final output
final_clip.write_videofile("output_combined.mp4", codec="libx264", audio_codec="aac", fps=24)

Moviepy - Building video output_combined.mp4.
MoviePy - Writing audio in output_combinedTEMP_MPY_wvf_snd.mp4


                                                                   

MoviePy - Done.
Moviepy - Writing video output_combined.mp4



                                                                

Moviepy - Done !
Moviepy - video ready output_combined.mp4
