In [6]:
import os
import cv2
import torch
import numpy as np
from facenet_pytorch import MTCNN, InceptionResnetV1
from PIL import Image
from torchvision import transforms

# Check if GPU is available and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load pre-trained FaceNet model for face recognition
facenet = InceptionResnetV1(pretrained='vggface2').eval().to(device)

# Load pre-trained MTCNN model for face detection
mtcnn = MTCNN(keep_all=True, device=device)

# Transformation for FaceNet
transform = transforms.Compose([
    transforms.Resize((160, 160)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def preprocess_image(img):
    img = img.convert("RGB")
    return transform(img).unsqueeze(0).to(device)

def extract_facenet_embedding(img):
    inputs = preprocess_image(img)
    with torch.no_grad():
        embedding = facenet(inputs)
    return embedding.cpu().numpy().flatten()

# Load images from "face" folder and extract FaceNet embeddings
face_folder = "face"
known_face_encodings = []
known_face_names = []

for filename in os.listdir(face_folder):
    if filename.endswith(".jpg") or filename.endswith(".png"):
        img_path = os.path.join(face_folder, filename)
        img = Image.open(img_path)
        face_encoding = extract_facenet_embedding(img)
        known_face_encodings.append(face_encoding)
        known_face_names.append(filename)

def detect_faces_and_recognize(frame):
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    boxes, _ = mtcnn.detect(rgb_frame)
    
    face_names = []
    if boxes is not None:
        for box in boxes:
            (startX, startY, endX, endY) = box.astype(int)
            face_image = rgb_frame[startY:endY, startX:endX]
            face_image_pil = Image.fromarray(face_image).resize((160, 160))
            face_encoding = extract_facenet_embedding(face_image_pil)

            name = "Unknown"
            distances = np.linalg.norm(known_face_encodings - face_encoding, axis=1)
            if len(distances) > 0:
                min_distance_index = np.argmin(distances)
                if distances[min_distance_index] < 0.6:  # Adjust threshold as needed
                    name = known_face_names[min_distance_index]
            face_names.append((box, name))
    return face_names

input_video_path = "C:/Users/User/Documents/Face/A Special Message For Goli - Taarak Mehta Ka Ooltah Chashmah - Full Episode - Ep 3899 - 11 Oct 2023.mp4"
output_video_path = "output_video_with_faces.mp4"
output_video_25_path = "output_video_25.mp4"
output_video_50_path = "output_video_50.mp4"

cap = cv2.VideoCapture(input_video_path)
frame_width = int(cap.get(3))
frame_height = int(cap.get(4))
fps = int(cap.get(cv2.CAP_PROP_FPS))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

out = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc('M','J','P','G'), fps, (frame_width, frame_height))
out_25 = cv2.VideoWriter(output_video_25_path, cv2.VideoWriter_fourcc('M','J','P','G'), fps, (frame_width, frame_height))
out_50 = cv2.VideoWriter(output_video_50_path, cv2.VideoWriter_fourcc('M','J','P','G'), fps, (frame_width, frame_height))

frame_count = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    frame_count += 1
    if frame_count % 5 != 0:  # Skip frames to reduce processing load
        continue

    # Print progress
    if frame_count % (total_frames // 100) == 0:
        print(f"Processing: {frame_count / total_frames:.2%} done")

    face_names = detect_faces_and_recognize(frame)
    
    for (box, name) in face_names:
        (startX, startY, endX, endY) = box.astype(int)
        if name != "Unknown":
            cv2.rectangle(frame, (startX, startY), (endX, endY), (0, 255, 0), 2)
            cv2.putText(frame, name, (startX, startY - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)

    out.write(frame)
    if frame_count <= total_frames // 4:
        out_25.write(frame)
    if frame_count <= total_frames // 2:
        out_50.write(frame)

    # Save intermediate output at 25% and 50%
    if frame_count == total_frames // 4:
        if out_25 is not None:
            out_25.release()
            out_25 = None
            print("25% of the video processed. Intermediate video saved.")
    if frame_count == total_frames // 2:
        if out_50 is not None:
            out_50.release()
            out_50 = None
            print("50% of the video processed. Intermediate video saved.")

cap.release()
out.release()
cv2.destroyAllWindows()
print("100% of the video processed. Final video saved.")


  0%|          | 0.00/107M [00:00<?, ?B/s]

Processing: 4.99% done
Processing: 9.98% done
Processing: 14.96% done
Processing: 19.95% done
Processing: 24.94% done
Processing: 29.93% done
Processing: 34.92% done
Processing: 39.90% done
Processing: 44.89% done
Processing: 49.88% done
Processing: 54.87% done
Processing: 59.86% done
Processing: 64.84% done
Processing: 69.83% done
Processing: 74.82% done
Processing: 79.81% done
Processing: 84.79% done
Processing: 89.78% done
Processing: 94.77% done
Processing: 99.76% done
100% of the video processed. Final video saved.


In [32]:
import os
import cv2
import torch
import numpy as np
from torchvision import models, transforms
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity

# Check if GPU is available and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load pre-trained VGG19 model for feature extraction
vgg19 = models.vgg19(weights=models.VGG19_Weights.IMAGENET1K_V1).features.to(device).eval()

# Transformation for VGG19
vgg_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def preprocess_image_vgg(img):
    img = img.convert("RGB")
    return vgg_transform(img).unsqueeze(0).to(device)

def extract_vgg19_features(img):
    inputs = preprocess_image_vgg(img)
    with torch.no_grad():
        features = vgg19(inputs)
    return features.cpu().numpy().flatten()

def detect_faces(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)
    return faces

# Load images from "faci" folder and extract VGG19 embeddings
faci_folder = "faci"
known_face_encodings = []
known_face_names = []

for filename in os.listdir(faci_folder):
    if filename.endswith(".jpg") or filename.endswith(".png"):
        img_path = os.path.join(faci_folder, filename)
        img = Image.open(img_path)
        face_encoding = extract_vgg19_features(img)
        known_face_encodings.append(face_encoding)
        known_face_names.append(filename)

def match_face(face_encoding, known_face_encodings, threshold=0.78):
    similarities = cosine_similarity([face_encoding], known_face_encodings)
    max_similarity_index = np.argmax(similarities)
    return similarities, max_similarity_index, similarities[0, max_similarity_index] > threshold

input_video_path = "video.mp4"
output_video_path = "output_video.mp4"

cap = cv2.VideoCapture(input_video_path)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

frame_count = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    frame_count += 1

    # Detect faces
    faces = detect_faces(frame)
    
    if len(faces) > 0:
        matches_found = False
        print(f"Faces found in frame {frame_count}: {len(faces)}")
        
        for (x, y, w, h) in faces:
            face = frame[y:y+h, x:x+w]
            face_pil = Image.fromarray(cv2.cvtColor(face, cv2.COLOR_BGR2RGB)).resize((224, 224))
            face_encoding = extract_vgg19_features(face_pil)

            similarities, max_similarity_index, is_match = match_face(face_encoding, np.array(known_face_encodings))
            print(f"Feature similarities: {similarities}")
            print(f"Closest match index: {max_similarity_index}, Similarity: {similarities[0, max_similarity_index]}")

            if is_match:
                matches_found = True
                print(f"Match found for {known_face_names[max_similarity_index]} in frame {frame_count}")
                out.write(frame)  # Write the frame to the output video

        if not matches_found:
            print(f"No matches found in frame {frame_count}")
    else:
        print(f"No faces found in frame {frame_count}")

cap.release()
out.release()
print(f"Trimmed video saved as {output_video_path}")


Faces found in frame 1: 1
Feature similarities: [[0.27913696 0.2819138  0.27046984 0.26780456 0.27205536 0.3736211
  0.36383042 0.31601077 0.2615561  0.29567403 0.28593904 0.27747852
  0.2739157  0.27632213 0.2806271  0.2949504  0.28132322 0.27697328
  0.2851987  0.28639337 0.33479935 0.3978713  0.31601077 0.27747852
  0.28132322]]
Closest match index: 21, Similarity: 0.3978712856769562
No matches found in frame 1
Faces found in frame 2: 1
Feature similarities: [[0.24776015 0.25405496 0.23191759 0.23612726 0.23670548 0.3715729
  0.37960324 0.30594626 0.2257971  0.27320352 0.25974083 0.24030891
  0.2387128  0.24967657 0.24910931 0.26611197 0.23832844 0.23127164
  0.23492064 0.23876901 0.29595247 0.35087955 0.30594626 0.24030891
  0.23832844]]
Closest match index: 6, Similarity: 0.37960323691368103
No matches found in frame 2
Faces found in frame 3: 1
Feature similarities: [[0.22839874 0.2296932  0.20682198 0.21298906 0.21529554 0.33735406
  0.35197333 0.27623287 0.20623393 0.25113562 0.