In [11]:
import os
import cv2
import numpy as np
from PIL import Image as PILImage
from deepface import DeepFace
from mtcnn import MTCNN
import pandas as pd
from pdf2image import convert_from_path

# Function to extract AD ID from the PDF filename
def extract_ad_id_from_filename(filename):
    """Return the part of ``filename`` that precedes its first underscore.

    When the name contains no underscore the whole name is returned unchanged.
    """
    ad_id, _separator, _remainder = filename.partition('_')
    return ad_id

def convert_pdf_to_images(pdf_path, output_folder):
    """Render every page of a PDF and save each page as a JPEG file.

    Files are named ``<pdf name without extension>_page_<n>.jpg`` with ``n``
    starting at 1, and are written into ``output_folder``.

    Args:
        pdf_path: Path of the PDF passed to ``convert_from_path``.
        output_folder: Directory that receives the page images; it is
            created when it does not exist yet.

    Returns:
        List of the written image paths, in page order.
    """
    os.makedirs(output_folder, exist_ok=True)
    # splitext removes only the final extension, so a name such as
    # "ad.2020.07.pdf" keeps its inner dots and cannot collide with
    # "ad.2021.01.pdf" the way a split on the first dot would.
    stem = os.path.splitext(os.path.basename(pdf_path))[0]
    image_paths = []
    for page_number, page in enumerate(convert_from_path(pdf_path), start=1):
        image_path = os.path.join(output_folder, f"{stem}_page_{page_number}.jpg")
        page.save(image_path, "JPEG")
        image_paths.append(image_path)
    return image_paths

def validate_and_load_face_from_array(face_array, face_path):
    """Write a face crop to disk via PIL and hand it back as an array.

    The array is turned into a PIL image, converted to mode "RGB", saved at
    ``face_path`` and converted back to a NumPy array.

    Returns:
        The NumPy array of the saved image, or ``None`` when any of those
        steps raises (the error is printed, not propagated).
    """
    loaded = None
    try:
        pil_face = PILImage.fromarray(face_array)
        pil_face = pil_face.convert("RGB")
        pil_face.save(face_path)
        loaded = np.array(pil_face)
    except Exception as e:
        print(f"Failed to validate face: {e}")
        loaded = None
    return loaded

def adjust_gender(gender_data):
    """Map a gender score dictionary to a category code.

    Reads the "Man" and "Woman" entries (missing entries count as 0).

    Returns:
        2 when "Man" is above 50, 1 when "Woman" is above 50, 5 when both
        are exactly 50 (labelled Non-Binary), otherwise the string
        "Unknown".
    """
    man_score = gender_data.get("Man", 0)
    woman_score = gender_data.get("Woman", 0)

    if man_score > 50:
        return 2  # Man
    if woman_score > 50:
        return 1  # Woman
    # Exact tie at 50/50 is coded as Non-Binary; anything else is undecided.
    return 5 if man_score == 50 == woman_score else "Unknown"

def detect_and_analyze(image_path, ad_id, page_number, output_folder="output_faces", detector=None):
    """Detect faces on one page image and estimate age, gender and race.

    Each detected face whose visible area is at least 10 px wide and 15 px
    high is cropped, resized to 224x224, saved as a JPEG in ``output_folder``
    and analysed with ``DeepFace.analyze``.

    Args:
        image_path: Path of the page image, read with ``cv2.imread``.
        ad_id: Identifier used as the prefix of the saved face file names.
        page_number: Page number embedded in the saved face file names.
        output_folder: Directory for the face crops; created if missing.
        detector: Optional object exposing ``detect_faces(image)`` to reuse
            across calls. A new ``MTCNN`` is created when omitted.

    Returns:
        ``{"Faces": [...]}`` where every entry holds the keys "Age",
        "Gender", "Race" and "Image Path". The list is empty when the image
        cannot be read or no usable face is found.
    """
    os.makedirs(output_folder, exist_ok=True)
    results = {"Faces": []}

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports an unreadable file by returning None, not by raising.
        print(f"Could not read image: {image_path}")
        return results

    if detector is None:
        detector = MTCNN()
    # OpenCV loads pixels in BGR order while MTCNN is documented for RGB input.
    detections = detector.detect_faces(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    image_height, image_width = image.shape[:2]
    for i, detection in enumerate(detections):
        x, y, w, h = detection['box']
        # Clip the box to the image on all four sides so the crop is neither
        # shifted (negative origin) nor empty (box outside the image).
        left, top = max(0, x), max(0, y)
        right, bottom = min(image_width, x + w), min(image_height, y + h)
        if right - left < 10 or bottom - top < 15:
            continue

        face = image[top:bottom, left:right]
        face_resized = cv2.resize(face, (224, 224))
        face_path = os.path.join(output_folder, f"{ad_id}_{page_number}_face_{i}.jpg")
        # The helper saves through PIL, which reads the channels as RGB; convert
        # first so the stored JPEG does not have red and blue swapped.
        validated_face = validate_and_load_face_from_array(
            cv2.cvtColor(face_resized, cv2.COLOR_BGR2RGB), face_path
        )
        if validated_face is None:
            continue

        # Hand DeepFace the same BGR pixel order the original crop had.
        face_bgr = cv2.cvtColor(validated_face, cv2.COLOR_RGB2BGR)
        try:
            analysis = DeepFace.analyze(face_bgr, actions=["age", "gender", "race"], enforce_detection=False)
            results["Faces"].append({
                "Age": analysis[0].get("age"),
                "Gender": adjust_gender(analysis[0].get("gender", {})),
                "Race": analysis[0].get("dominant_race"),
                "Image Path": face_path
            })
        except Exception as e:
            print(f"DeepFace error: {e}")

    return results

def process_and_save_images_to_csv(pdf_folder, output_csv="output_faces_with_images.csv"):
    """Analyse the faces in every PDF of a folder and write them to a CSV.

    Each file in ``pdf_folder`` whose name ends in ".pdf" (case-insensitive)
    is converted to page images; every page is passed to
    ``detect_and_analyze``. One CSV row is written per analysed face with the
    columns "AD ID", "Faces on Image", "Age", "Gender", "Race" and
    "Image Path". "Faces on Image" is the number of analysed faces on the
    page the face came from. Page images and face crops are stored in the
    "output_faces" directory.

    Args:
        pdf_folder: Directory that is scanned for PDF files.
        output_csv: Path of the CSV file to write; its parent directory is
            created when missing.
    """
    faces_folder = "output_faces"
    os.makedirs(faces_folder, exist_ok=True)

    # Create the CSV's directory before the long-running analysis: a missing
    # directory would otherwise only surface in to_csv, after all work is done.
    csv_dir = os.path.dirname(output_csv)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)

    data = []
    # os.listdir gives no ordering guarantee; sort for a reproducible row order.
    for pdf_file in sorted(os.listdir(pdf_folder)):
        if not pdf_file.lower().endswith('.pdf'):
            continue
        ad_id = extract_ad_id_from_filename(pdf_file)
        print(f"Processing PDF: {pdf_file} (AD ID: {ad_id})")
        pdf_path = os.path.join(pdf_folder, pdf_file)
        image_paths = convert_pdf_to_images(pdf_path, faces_folder)
        for page_number, image_path in enumerate(image_paths, start=1):
            result = detect_and_analyze(image_path, ad_id, page_number, faces_folder)
            faces = result["Faces"]
            face_count = len(faces)
            data.extend(
                [
                    ad_id,
                    face_count,
                    face_result["Age"],
                    face_result["Gender"],
                    face_result["Race"],
                    face_result["Image Path"],
                ]
                for face_result in faces
            )

    df = pd.DataFrame(data, columns=["AD ID", "Faces on Image", "Age", "Gender", "Race", "Image Path"])
    df.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")

if __name__ == "__main__":
    # Hard-coded Windows locations for this run: the folder scanned for PDFs
    # and the CSV file that receives the per-face results.
    pdf_folder = "C:\\Final\\mens health"
    output_csv = "C:\\Final\\csv\\output_faces_with_images.csv"
    process_and_save_images_to_csv(pdf_folder, output_csv)


Processing PDF: 30001-30004 and 30006-30008_NI Kicker 78-2020.pdf (AD ID: 30001-30004 and 30006-30008)


Action: race: 100%|██████████| 3/3 [00:09<00:00,  3.32s/it]  
Action: race: 100%|██████████| 3/3 [00:11<00:00,  3.82s/it]  
Action: race: 100%|██████████| 3/3 [00:07<00:00,  2.46s/it]  
Action: race: 100%|██████████| 3/3 [00:02<00:00,  1.14it/s]  
Action: race: 100%|██████████| 3/3 [00:02<00:00,  1.38it/s]  
Action: race: 100%|██████████| 3/3 [00:05<00:00,  1.68s/it]  
Action: race: 100%|██████████| 3/3 [00:01<00:00,  1.61it/s]  
Action: race: 100%|██████████| 3/3 [00:01<00:00,  1.61it/s]  
Action: race: 100%|██████████| 3/3 [00:09<00:00,  3.29s/it]  
Action: race: 100%|██████████| 3/3 [00:10<00:00,  3.39s/it]  
Action: race: 100%|██████████| 3/3 [00:11<00:00,  3.96s/it]  
Action: race: 100%|██████████| 3/3 [00:10<00:00,  3.53s/it]  
Action: race: 100%|██████████| 3/3 [00:10<00:00,  3.47s/it]  


Processing PDF: 30009-30011_NI_Kicker 89-2020.pdf (AD ID: 30009-30011)


Action: race: 100%|██████████| 3/3 [00:11<00:00,  3.76s/it]  
Action: race: 100%|██████████| 3/3 [00:10<00:00,  3.50s/it]  
Action: race: 100%|██████████| 3/3 [00:09<00:00,  3.26s/it]  
Action: race: 100%|██████████| 3/3 [00:05<00:00,  1.70s/it]  
Action: race: 100%|██████████| 3/3 [00:07<00:00,  2.50s/it]  
Action: race: 100%|██████████| 3/3 [00:04<00:00,  1.55s/it]  
Action: race: 100%|██████████| 3/3 [00:01<00:00,  1.63it/s]  


Processing PDF: 30012_Kicker 94-2020.pdf (AD ID: 30012)


Action: race: 100%|██████████| 3/3 [00:02<00:00,  1.01it/s]  
Action: race: 100%|██████████| 3/3 [00:02<00:00,  1.17it/s]  
Action: race: 100%|██████████| 3/3 [00:03<00:00,  1.05s/it]  
Action: race: 100%|██████████| 3/3 [00:04<00:00,  1.40s/it]  
Action: race: 100%|██████████| 3/3 [00:03<00:00,  1.04s/it]  
Action: race: 100%|██████████| 3/3 [00:03<00:00,  1.04s/it]  
Action: race: 100%|██████████| 3/3 [00:04<00:00,  1.51s/it]  


Processing PDF: 30013-30017_NI Kicker 102-2020.pdf (AD ID: 30013-30017)


Action: race: 100%|██████████| 3/3 [00:11<00:00,  3.98s/it]  
Action: race: 100%|██████████| 3/3 [00:09<00:00,  3.03s/it]  
Action: race: 100%|██████████| 3/3 [00:10<00:00,  3.52s/it]  
Action: race: 100%|██████████| 3/3 [00:10<00:00,  3.57s/it]  
Action: race: 100%|██████████| 3/3 [00:16<00:00,  5.47s/it]  
Action: race: 100%|██████████| 3/3 [00:09<00:00,  3.16s/it]  
Action: race: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]  
Action: race: 100%|██████████| 3/3 [00:02<00:00,  1.06it/s]  
Action: race: 100%|██████████| 3/3 [00:02<00:00,  1.29it/s]  
Action: race: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]  
Action: race: 100%|██████████| 3/3 [00:10<00:00,  3.62s/it]  
Action: race: 100%|██████████| 3/3 [00:12<00:00,  4.13s/it]  


Processing PDF: 31591_sport-bild-2021-07-14 7.pdf (AD ID: 31591)


Action: race: 100%|██████████| 3/3 [00:12<00:00,  4.09s/it]  
Action: race: 100%|██████████| 3/3 [00:17<00:00,  5.86s/it]  


Processing PDF: 31913_MENS-HEALTH_2020-06 49.pdf (AD ID: 31913)
Processing PDF: 31914_MENS-HEALTH_2020-06 65.pdf (AD ID: 31914)


Action: race: 100%|██████████| 3/3 [00:14<00:00,  4.67s/it]  


Processing PDF: 31920_Seiten aus MENS-HEALTH_2020-08 5.pdf (AD ID: 31920)


Action: race: 100%|██████████| 3/3 [00:10<00:00,  3.44s/it]  


Processing PDF: 31925_Seiten aus MENS-HEALTH_2020-08 15.pdf (AD ID: 31925)


Action: race: 100%|██████████| 3/3 [00:17<00:00,  5.76s/it]  
Action: race: 100%|██████████| 3/3 [00:06<00:00,  2.26s/it]  
Action: race: 100%|██████████| 3/3 [00:10<00:00,  3.63s/it]  


Results saved to C:\Final\csv\output_faces_with_images.csv
