In [2]:
!pip install torch torchvision torchaudio

Collecting torch
  Downloading torch-2.6.0-cp39-cp39-win_amd64.whl.metadata (28 kB)
Collecting torchvision
  Downloading torchvision-0.21.0-cp39-cp39-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Downloading torchaudio-2.6.0-cp39-cp39-win_amd64.whl.metadata (6.7 kB)
Collecting networkx (from torch)
  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.6.0-cp39-cp39-win_amd64.whl (204.1 MB)
   ---------------------------------------- 0.0/204.1 MB ? eta -:--:--
   ---------------------------------------- 0.5/204.1 MB 4.2 MB/s eta 0:00:49
   ---------------------------------------- 1.3/204.1 MB 4.2 MB/s eta 0:00:49
   ---------------------------------------- 2.1/204.1 MB 4.9 MB/s eta 0:00:42
    --------------------------------------- 3.9/204.1 MB 5.5 MB/s eta 0:00:37
    --------------------------------------- 5.0/204.1 MB 5.3 MB/s eta 0:00:38
   - --


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import numpy as np
import cv2
import torch
import joblib
from PIL import Image, ImageEnhance, ImageFilter
from transformers import CLIPProcessor, CLIPModel
from sklearn.neural_network import MLPClassifier

# Choose the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained CLIP checkpoint, move the model onto the chosen device,
# and load the processor that prepares images for that same checkpoint.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = clip_model.to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")




In [9]:
# Load pre-trained MLP models for age and gender prediction.
# Paths use forward slashes: "\m" inside a regular string literal is an invalid
# escape sequence (Python warns about it), and a backslash-separated path only
# resolves on Windows, whereas "/" works on every platform.
# NOTE: joblib.load unpickles arbitrary Python objects - only load model files
# that come from a trusted source.
age_model = joblib.load("models/mlp_age_raw.pkl")
gender_model = joblib.load("models/mlp_gender_raw.pkl")

# Initialize YuNet for face detection
yunet = cv2.FaceDetectorYN.create(
    model='models/face_detection_yunet_2023mar.onnx',
    config='',
    input_size=(320, 320),  # Initial size only; real_time_prediction sets it to the camera frame size
    score_threshold=0.9,  # Adjust this threshold as needed
    nms_threshold=0.3,
    top_k=5000
)


In [10]:

def preprocess_image(img):
    """
    Normalise an image crop to a fixed-size float array.

    The array is wrapped in a PIL image, converted to 3-channel "RGB" mode,
    resized to 128x128 with Lanczos resampling, and scaled from 0-255 to
    0.0-1.0. No contrast enhancement or noise reduction is performed here.

    Args:
        img: Image array accepted by ``PIL.Image.fromarray``.

    Returns:
        numpy.ndarray: Float array of shape (128, 128, 3) with values in [0, 1].
    """
    # NOTE(review): real_time_prediction passes OpenCV crops, which are
    # BGR-ordered; convert("RGB") does not swap channels - confirm this
    # matches how the downstream models were trained.
    resized = (
        Image.fromarray(img)
        .convert("RGB")
        .resize((128, 128), Image.Resampling.LANCZOS)
    )
    pixels = np.array(resized)
    return pixels / 255.0

def extract_features(img):
    """
    Extract an image embedding using the CLIP model.

    Args:
        img: Image array with values scaled to [0, 1]; it is multiplied by 255
            and cast to uint8 before being handed to the CLIP processor.

    Returns:
        numpy.ndarray: The CLIP image features with singleton dimensions
        squeezed out, as a CPU NumPy array regardless of the model's device.
    """
    pil_img = Image.fromarray((img * 255).astype(np.uint8))
    inputs = clip_processor(images=pil_img, return_tensors="pt")
    # The processor produces CPU tensors; move them to the model's device so a
    # CUDA-resident model does not fail with a device-mismatch error.
    inputs = inputs.to(clip_model.device)
    with torch.no_grad():
        image_features = clip_model.get_image_features(**inputs)
    # Tensor.numpy() only works on CPU tensors, so copy back first.
    return image_features.squeeze().cpu().numpy()

def predict_age_gender(features):
    """
    Run the age and gender MLPs on one feature vector.

    Args:
        features: A single feature vector; it is wrapped in a one-element
            batch before being passed to each model's ``predict``.

    Returns:
        tuple: ``(age_pred, gender_pred)`` where ``age_pred`` is the age
        model's first prediction and ``gender_pred`` is "Male" when the gender
        model's first prediction equals 0, otherwise "Female".
    """
    batch = [features]
    age_pred = age_model.predict(batch)[0]
    gender_label = gender_model.predict(batch)[0]
    if gender_label == 0:
        gender_pred = "Male"
    else:
        gender_pred = "Female"
    return age_pred, gender_pred

def real_time_prediction():
    """
    Show a live side-by-side comparison of "Raw" and "DIP" age/gender predictions.

    Frames are read from camera index 0. For every frame:
      * Left panel ("Raw"): the whole frame is resized to 128x128, embedded
        with extract_features and classified; the result is drawn top-left.
      * Right panel ("DIP"): each face found by YuNet is cropped, passed
        through preprocess_image, embedded and classified; a box and a label
        are drawn for each face.

    The loop ends when a frame cannot be read or the user presses 'q'. The
    camera and every OpenCV window are released even if an error occurs.
    """
    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            print("Could not open camera 0; nothing to display.")
            return

        detector_size = None
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Size the detector from the frame actually delivered: the capture
            # properties can report 0 or differ from the real frame, and
            # detect() expects the frame to match the configured input size.
            frame_height, frame_width = frame.shape[:2]
            if detector_size != (frame_width, frame_height):
                detector_size = (frame_width, frame_height)
                yunet.setInputSize(detector_size)

            _, faces = yunet.detect(frame)
            frame_raw = frame.copy()
            frame_dip = frame.copy()

            # Raw processing on the entire frame (without DIP and face detection)
            # NOTE(review): OpenCV frames are BGR-ordered and are passed on
            # without channel conversion - confirm that matches training.
            frame_raw_resized = cv2.resize(frame_raw, (128, 128)) / 255.0
            features_raw = extract_features(frame_raw_resized)
            age_raw, gender_raw = predict_age_gender(features_raw)
            cv2.putText(frame_raw, f"Raw: {age_raw}, {gender_raw}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

            if faces is not None:
                for face in faces:
                    x, y, w, h = map(int, face[:4])

                    # Clamp the box to the frame: a detection can extend past
                    # the borders, and a negative start index would wrap
                    # around and yield a wrong or empty crop.
                    x0, y0 = max(x, 0), max(y, 0)
                    x1, y1 = min(x + w, frame_width), min(y + h, frame_height)
                    if x1 <= x0 or y1 <= y0:
                        continue  # Nothing of this face is inside the frame
                    face_crop = frame[y0:y1, x0:x1]

                    # DIP processing (includes face detection step)
                    face_dip = preprocess_image(face_crop)
                    features_dip = extract_features(face_dip)
                    age_dip, gender_dip = predict_age_gender(features_dip)
                    cv2.rectangle(frame_dip, (x0, y0), (x1, y1), (0, 255, 0), 2)
                    # Keep the label on-screen when the face touches the top edge
                    label_y = max(y0 - 10, 15)
                    cv2.putText(frame_dip, f"DIP: {age_dip}, {gender_dip}", (x0, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

            # Combine both frames for comparison
            combined_frame = np.hstack((frame_raw, frame_dip))
            cv2.imshow('Raw (Left) vs DIP (Right) - Age & Gender Prediction', combined_frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the camera and close windows, even after an exception,
        # so the device is not left locked for the rest of the kernel session.
        cap.release()
        cv2.destroyAllWindows()

In [11]:

# Start the webcam demo only when this code is executed as the main program,
# not when it is imported as a module. Press 'q' in the video window to stop.
if __name__ == "__main__":
    real_time_prediction()