In [16]:
import os
import pymupdf
import cv2
import numpy as np
import pandas as pd

## Extract faces from the yearbook pages (up to `FACES_LIMIT` faces per yearbook)

In [8]:
def detect_faces(img, scale_factor=1.3, min_neighbors=3):
    """Detect frontal faces in an image with OpenCV's Haar cascade.

    Parameters
    ----------
    img : numpy.ndarray
        Colour image; it is treated as BGR because it is converted to
        grayscale with ``cv2.COLOR_BGR2GRAY``.
    scale_factor : float, default 1.3
        ``scaleFactor`` forwarded to ``detectMultiScale``.
    min_neighbors : int, default 3
        ``minNeighbors`` forwarded to ``detectMultiScale``.

    Returns
    -------
    The rectangles reported by ``detectMultiScale``; each one unpacks as
    ``(x, y, w, h)``.

    Raises
    ------
    FileNotFoundError
        If the cascade XML file could not be loaded.
    """
    cascade_path = "./haarcascade_frontalface_default.xml"
    detector = cv2.CascadeClassifier(cascade_path)
    # CascadeClassifier does not raise when the file is missing or invalid;
    # it silently stays empty and detectMultiScale then fails with an opaque
    # assertion. Fail early with a message that names the file instead.
    if detector.empty():
        raise FileNotFoundError(f"Could not load Haar cascade from {cascade_path!r}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return detector.detectMultiScale(
        gray, scaleFactor=scale_factor, minNeighbors=min_neighbors
    )

def detect_smiles(img, scale_factor=1.1, min_neighbors=10):
    """Detect smiles in an image with OpenCV's Haar smile cascade.

    Parameters
    ----------
    img : numpy.ndarray
        Colour image; it is treated as BGR because it is converted to
        grayscale with ``cv2.COLOR_BGR2GRAY``.
    scale_factor : float, default 1.1
        ``scaleFactor`` forwarded to ``detectMultiScale``.
    min_neighbors : int, default 10
        ``minNeighbors`` forwarded to ``detectMultiScale``.

    Returns
    -------
    The rectangles reported by ``detectMultiScale`` (empty when no smile is
    found).

    Raises
    ------
    FileNotFoundError
        If the cascade XML file could not be loaded.
    """
    cascade_path = "./haarcascade_smile.xml"
    smile_cascade = cv2.CascadeClassifier(cascade_path)
    # CascadeClassifier does not raise when the file is missing or invalid;
    # it silently stays empty and detectMultiScale then fails with an opaque
    # assertion. Fail early with a message that names the file instead.
    if smile_cascade.empty():
        raise FileNotFoundError(f"Could not load Haar cascade from {cascade_path!r}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return smile_cascade.detectMultiScale(
        gray, scaleFactor=scale_factor, minNeighbors=min_neighbors
    )

In [13]:
years = []
FACES_LIMIT = 20


def _save_faces_from_pdf(pdf_path, images_path, limit):
    """Crop detected faces from the pages of one PDF and save them as PNGs.

    Pages are processed in order and the function stops as soon as ``limit``
    faces have been written. Files are named ``{page.number}_{i}.png`` inside
    ``images_path``. Returns the number of faces saved.
    """
    saved = 0
    # The context manager closes the document even if rendering or detection
    # raises part-way through.
    with pymupdf.open(pdf_path) as pdf:
        for page in pdf:
            pix = page.get_pixmap()
            # The default pixmap is RGB without alpha (3 bytes per pixel).
            rgb = np.frombuffer(buffer=pix.samples, dtype=np.uint8).reshape(
                (pix.height, pix.width, 3)
            )
            # OpenCV (detect_faces' BGR2GRAY and cv2.imwrite) expects BGR, so
            # swap the channels once here; otherwise red and blue are swapped
            # in the saved crops and the grayscale weights are wrong.
            img = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

            for i, (x, y, w, h) in enumerate(detect_faces(img)):
                crop = img[y:y+h, x:x+w]
                cv2.imwrite(images_path + f"{page.number}_{i}.png", crop)
                saved += 1
                if saved >= limit:
                    return saved
    return saved


for file in os.listdir("./yearbooks"):
    # Ignore stray entries (e.g. .DS_Store, checkpoint folders) that would
    # otherwise be opened as PDFs and pollute `years`.
    if not file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join("./yearbooks", file)
    print("Getting faces from", pdf_path)

    # The year is the file name up to the first dot, e.g. "1951.pdf" -> "1951".
    year = file.split(".")[0]
    years.append(year)

    # Create the directory that stores the image of each face.
    images_path = f"./images/{year}/"
    os.makedirs(images_path, exist_ok=True)

    count_faces = _save_faces_from_pdf(pdf_path, images_path, FACES_LIMIT)

Getting faces from ./yearbooks/1951.pdf
Getting faces from ./yearbooks/1931.pdf
Getting faces from ./yearbooks/1971.pdf
Getting faces from ./yearbooks/1961.pdf
Getting faces from ./yearbooks/1911.pdf
Getting faces from ./yearbooks/1921.pdf
Getting faces from ./yearbooks/1941.pdf


In [None]:
# Per-year smile statistics over the face crops saved under ./images/<year>/.
smile_records = []

for year in years:
    files = os.listdir(f"./images/{year}")
    count_images = 0
    count_smiles = 0
    for file in files:
        img = cv2.imread(f"./images/{year}/{file}")
        # cv2.imread returns None for anything it cannot decode (hidden
        # files, sub-directories, ...); skip those instead of crashing and
        # keep them out of the totals.
        if img is None:
            continue
        count_images += 1
        if len(detect_smiles(img)) > 0:
            count_smiles += 1

    smile_records.append({
        "year": year,
        "smile_count": count_smiles,
        "nonsmile_count": count_images - count_smiles,
        # A year with no usable face images has no defined ratio; report NaN
        # rather than raising ZeroDivisionError.
        "smile_factor": count_smiles / count_images if count_images else float("nan"),
    })

# Build the frame once from the collected records (no quadratic concat in a
# loop); explicit columns keep sort_values working even when `years` is empty.
df_smiles = pd.DataFrame(
    smile_records,
    columns=["year", "smile_count", "nonsmile_count", "smile_factor"],
).sort_values(by=['year'], ignore_index=True)

print(df_smiles)


   year  smile_count  nonsmile_count  smile_factor
0  1911            0              20          0.00
1  1921            2              18          0.10
2  1931            3              17          0.15
3  1941            1              19          0.05
4  1951            5              15          0.25
5  1961           10              10          0.50
6  1971            6              14          0.30
