# Setting up

Before continuing, run the hidden cells below, in order, to set up the environment.

In [None]:
import os
from typing import *

In [None]:
!git lfs install

Git LFS initialized.


In [None]:
!git clone https://github.com/Maharshi-Pandya/gemelo-test.git

Cloning into 'gemelo-test'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 3 (delta 0), pack-reused 0[K
Receiving objects: 100% (3/3), 40.43 MiB | 15.05 MiB/s, done.


In [None]:
!unzip /content/gemelo-test/agenet.zip

Archive:  /content/gemelo-test/agenet.zip
   creating: gad/
  inflating: __MACOSX/._gad          
  inflating: gad/.DS_Store           
  inflating: __MACOSX/gad/._.DS_Store  
  inflating: gad/age_deploy.prototxt  
  inflating: __MACOSX/gad/._age_deploy.prototxt  
  inflating: gad/age_net.caffemodel  
  inflating: __MACOSX/gad/._age_net.caffemodel  


In [None]:
!pip install mtcnn

Collecting mtcnn
  Downloading mtcnn-0.1.1-py3-none-any.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mtcnn
Successfully installed mtcnn-0.1.1


# Detect faces

In order to detect faces, this colab uses `MTCNN`, a fast and robust CNN-based architecture that detects the bounding box, eyes, nose, and mouth of a face (if any).

In [None]:
import cv2
import numpy as np
from mtcnn.mtcnn import MTCNN

In [None]:
# Single shared MTCNN detector instance; get_face_bbox() calls detect_faces() on it.
detector = MTCNN()

In [None]:
def read_image(image_path: str):
  """
  Load an image from disk with OpenCV.

  Args:
  `image_path`: Path of the image file

  Returns:
  the value returned by `cv2.imread` for the path

  Raises:
  AssertionError if `cv2.imread` returns None for the path
  """
  image = cv2.imread(image_path)

  # Raise explicitly instead of using a bare `assert`: the check keeps working
  # under `python -O`, and the failure now says which path was at fault.
  # AssertionError is kept so the exception type seen by callers is unchanged.
  if image is None:
    raise AssertionError(f"Could not read image from path: {image_path!r}")

  return image

In [None]:
def get_face_bbox(
    image: np.ndarray,
    face_confidence: float = 0.95,
) -> Optional[List[int]]:
  """
  Detect the face (if any) in a portrait image.

  Args:
  `image`: Image array, passed as-is to the shared MTCNN `detector`
  `face_confidence`: If the confidence is below this threshold, returns no face

  Returns:
  bounding box of the first detected face (the detector's 'box' entry),
  or None when nothing was detected or the confidence is below the threshold
  """

  # NOTE(review): the MTCNN usage examples convert BGR -> RGB before detection,
  # while cv2.imread yields BGR — confirm whether callers should convert first.
  result = detector.detect_faces(image)

  # An image without a face yields an empty result; indexing result[0] without
  # this guard raised IndexError instead of reporting "no face".
  if not result:
    return None

  # given assumption: portrait photo, so only one face
  first_face = result[0]

  if first_face['confidence'] < face_confidence:
    return None

  return first_face['box']

# Age estimation

The AgeNet model is used here. Like the face detector, it has a CNN architecture, and it predicts which age "bin" a face falls into.


> Note: predicting age is really a subjective task. A person may "look" a certain age in a photo, but they might not actually be that age in real life.

The classes are:

1. 0-2
2. 4-6
3. 8-12 (Child until here)
4. 15-20
5. 25-32
6. 38-43
7. 48-53
8. 60-100

In [None]:
# Age-range labels. predict_child_from_face() indexes this list with the argmax
# of the network output, so the order of the entries is significant.
AGE_BUCKETS = ["(0-2)", "(4-6)", "(8-12)", "(15-20)", "(25-32)", "(38-43)", "(48-53)", "(60-100)"]

In [None]:
# NOTE(review): the two names below look swapped relative to their contents:
# `age_weights` holds the .prototxt path and `age_config` holds the .caffemodel
# path. They are passed to readNet as (age_config, age_weights), i.e. the
# .caffemodel path first and the .prototxt path second.
age_weights = "/content/gad/age_deploy.prototxt"
age_config = "/content/gad/age_net.caffemodel"
agenet = cv2.dnn.readNet(age_config, age_weights)

In [None]:
def predict_child_from_face(
    image: np.ndarray,
    bbox: List[int],
    rgb_mean_to_subtract: tuple = (78.4263377603, 87.7689143744, 114.895847746),
    cnn_default_size: tuple = (227, 227),
    scaling_factor: float = 1.0
) -> Union[bool, None]:
  """
  Given a face bbox, predict whether the face falls in one of the child age buckets

  Args:
  `image`: Original image
  `bbox`: Output from get_face_bbox, unpacked as (x, y, width, height)
  `rgb_mean_to_subtract`: Mean passed to `cv2.dnn.blobFromImage`
  `cnn_default_size`: Size passed to `cv2.dnn.blobFromImage`
  `scaling_factor`: Scale factor passed to `cv2.dnn.blobFromImage`

  Returns:
  True if the top-scoring bucket is (0-2), (4-6) or (8-12), False for any
  other bucket, None (not sure) when the bbox does not overlap the image
  """

  startX, startY, w, h = bbox[0], bbox[1], bbox[2], bbox[3]
  endX, endY = startX + w, startY + h

  # Clamp the box to the image. A negative start is interpreted by numpy as an
  # offset from the end of the axis, which silently selects the wrong (usually
  # empty) region when the box extends past the image border.
  img_h, img_w = image.shape[:2]
  startX, startY = max(0, startX), max(0, startY)
  endX, endY = min(img_w, endX), min(img_h, endY)

  # Nothing left to classify: report "not sure" instead of feeding an empty
  # array to the network.
  if endX <= startX or endY <= startY:
    return None

  # ROI of just the face
  face_submat = image[startY:endY, startX:endX]
  faceBlob = cv2.dnn.blobFromImage(
    face_submat,
    scaling_factor,
    cnn_default_size,
    rgb_mean_to_subtract,
    swapRB=False
  )

  # inference
  agenet.setInput(faceBlob)
  preds = agenet.forward()

  # label of the highest-scoring bucket
  index = int(preds[0].argmax())
  age = AGE_BUCKETS[index]

  # child buckets are the ones ending at 2, 6 and 12 years
  return age.endswith(("-2)", "-6)", "-12)"))

# Pipeline for face and child

1. Detect face
2. If a face is found, perform age prediction

In [None]:
# @title Single image pipeline
image_file_path = "/content/charactr_portrait_20240305_161949_0.png" # @param {type:"string"}

def single_pipeline():
  """
  Run face detection and child prediction on the image at the module-level
  `image_file_path` and print the results.

  Prints a hint and returns when the path is empty. Any exception raised while
  processing is caught and reported together with its type and message.
  """
  if image_file_path == "":
    print("Please provide an image file path to work with...")
    return

  try:
    image = read_image(image_file_path)
    face = get_face_bbox(image)
    msg = "\nFace detected in the image? " + ("Yes" if face is not None else "No")
    print(msg)

    if face is None:
      print("No face found. Done.")
      return

    is_child = predict_child_from_face(image, face)
    msg = "Is there a child in the image? " + ("Yes" if is_child else "No")
    print(msg)

  except Exception as e:
    # Detection and inference errors land here too, not only unreadable files,
    # so report the actual cause instead of always blaming the file read.
    print(f"\nError: Cannot process file {image_file_path}: {type(e).__name__}: {e}")


# run
single_pipeline()


Face detected in the image? Yes
Is there a child in the image? Yes


In [None]:
# @title Image folder pipeline
images_folder_path = "" # @param {type:"string"}


def folder_pipeline():
  """
  Run face detection and child prediction on every entry of the folder named
  by the module-level `images_folder_path`, printing the results per file.

  Prints a hint and returns when the path is empty or is not a folder. A
  failure on one file is reported and the loop moves on to the next file.
  """
  if images_folder_path == "":
    print("\nPlease provide a folder path consisting of image files...")
    return

  if not os.path.isdir(images_folder_path):
    print(f"\nError: {images_folder_path} is not a folder")
    return

  # sorted() gives a reproducible processing order; os.listdir order is arbitrary
  for fname in sorted(os.listdir(images_folder_path)):
    print(f"\nRunning pipeline on image {fname}...")

    # os.listdir yields bare names, so the folder must be prepended; reading
    # `fname` alone only worked when the folder was the working directory.
    image_path = os.path.join(images_folder_path, fname)

    try:
      image = read_image(image_path)
      face = get_face_bbox(image)
      msg = "\nFace detected in the image? " + ("Yes" if face is not None else "No")
      print(msg)

      if face is None:
        print("continuing...")
        continue

      is_child = predict_child_from_face(image, face)
      msg = "Is there a child in the image? " + ("Yes" if is_child else "No")
      print(msg)

    except Exception as e:
      # report the actual cause; not every failure is an unreadable file
      print(f"\nError: Cannot process file {fname}: {type(e).__name__}: {e}")
      continue

# run
folder_pipeline()


Please provide a folder path consisting of image files...


# Using a Vision Language Model (GPU)

Given more compute power, one can also use a VLM (vision language model) to carry out both image tasks and get better results.

Since VLMs are trained on large amounts of datasets for both vision and text tokens, they can do a better job at "seeing" an image.

This task assumes a portrait photo with just one person, which narrows down the input space and thus lowers VLM hallucinations.

(why? because transformers' next-token-prediction and hallucinations go hand in hand)

### Load the tiny VLM

In [None]:
!pip install timm einops

Collecting timm
  Downloading timm-0.9.16-py3-none-any.whl (2.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/2.2 MB[0m [31m1.9 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.2/2.2 MB[0m [31m32.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: einops, timm
Successfully installed einops-0.7.0 timm-0.9.16


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

# Hub model id and the revision string passed to from_pretrained below
# (the same pair is reused when loading the tokenizer).
model_id = "vikhyatk/moondream2"
revision = "2024-03-05"
# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): trust_remote_code=True presumably runs model code shipped in the
# repository — keep `revision` pinned so that code cannot change silently.
model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, revision=revision
).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/318 [00:00<?, ?B/s]

configuration_moondream.py:   0%|          | 0.00/3.43k [00:00<?, ?B/s]

moondream.py:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

modeling_phi.py:   0%|          | 0.00/49.4k [00:00<?, ?B/s]

vision_encoder.py:   0%|          | 0.00/3.54k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.72G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Question passed to model.answer_question() for the face check.
# The backslash after the opening quotes and at the end of the text line are
# line continuations, so the string has no leading or trailing newline.
FACE_PROMPT = """\
Is there a human/child face in this image? Answer in ONLY "Yes" or "No"\
"""

# Question passed to model.answer_question() for the child check.
CHILD_PROMPT = """\
Is there a human child in this image? Answer in ONLY "Yes" or "No"\
"""

In [None]:
def _egm():
  """
  Call `torch.cuda.empty_cache()` and `torch.cuda.ipc_collect()` when CUDA is
  available; do nothing otherwise.
  """
  if not torch.cuda.is_available():
    return

  torch.cuda.empty_cache()
  torch.cuda.ipc_collect()

_egm()

In [None]:
def open_pil_image(image_path: str):
  """
  Try to open `image_path` with PIL.

  Returns the object produced by `Image.open`, or None if opening raised
  any exception.
  """
  try:
    pil_image = Image.open(image_path)
  except Exception:
    # best effort: every failure is reported to the caller as None
    return None
  else:
    return pil_image

# VLM Pipeline

This is based on moondream2, a tiny VLM with weights from Phi-1.5 and SigLIP (a vision transformer).

The language model Phi-1.5 is a Transformer with 1.3 billion parameters.

In [None]:
# @title Single image pipeline (VLM)
image_file_path = "/content/charactr_portrait_20240305_161949_0.png" # @param {type:"string"}

def single_image_pipe():
  """
  Ask the VLM whether the image at the module-level `image_file_path` contains
  a face and whether it contains a child, and print both answers.

  Prints a hint and returns when the path is empty. Prints an error and returns
  when the image cannot be opened or when any step raises.
  """
  if image_file_path == "":
    print("\nPlease provide an image file path to work with...")
    return

  try:
    image = open_pil_image(image_file_path)

    # open_pil_image signals failure by returning None; stop here rather than
    # handing None to the model.
    if image is None:
      print(f"\nError: Cannot read file {image_file_path} as image")
      return

    # encode once, reuse the encoding for both questions
    enc_image = model.encode_image(image)
    response = model.answer_question(enc_image, FACE_PROMPT, tokenizer)
    _egm()

    msg = "\nFace detected in the image? " + response
    print(msg)

    response = model.answer_question(enc_image, CHILD_PROMPT, tokenizer)
    _egm()

    msg = "Is there a child in the image? " + response
    print(msg)
  except Exception as e:
    # The handler used to reference an undefined name `fname`, which raised
    # NameError from inside the handler; use the actual path and show the cause.
    print(f"\nError: Cannot process file {image_file_path}: {type(e).__name__}: {e}")
    return


# run
single_image_pipe()


Face detected in the image? Yes
Is there a child in the image? Yes
