# Scrape NYPD officer profile images from 50-a.org

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import os
import json

In [None]:
# Collect the links to every command page listed on the /commands index
def get_links_from_website(url):
    """Return every non-empty ``href`` found on the page at *url*.

    Args:
        url: Address of the HTML page to fetch.

    Returns:
        A list of ``href`` attribute values in document order, or an empty
        list when the request fails (the error is printed).
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled
        # connection, which would hang the whole notebook run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = (tag.get('href') for tag in soup.find_all('a'))
    # Anchors without an href (or with an empty one) are dropped.
    return [href for href in hrefs if href]

# Fetch the commands index and collect every href on it.
url = 'https://www.50-a.org/commands'
links = get_links_from_website(url)
# Keep everything except the first anchor and the last three
# (presumably site navigation rather than command pages - verify).
links_list = links[1:-3]

In [None]:
def download_images(data, base_url):
    """Download the image referenced by each scraped row into ``faces2/``.

    Args:
        data: Iterable of row dicts; each must have a ``'source_link'`` key.
            Rows whose ``'source_link'`` is empty or ``None`` are skipped.
        base_url: URL prefix concatenated with ``'source_link'`` to form the
            address of the image.

    Returns:
        None. Failed downloads are printed and skipped.
    """
    out_dir = 'faces2'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(out_dir, exist_ok=True)

    for entry in data:
        source_link = entry['source_link']
        if not source_link:
            continue

        full_url = base_url + source_link
        # Use only the last path component as the local name so a link that
        # contains directories cannot point outside ``faces2``.
        file_name = source_link.split('/')[-1]
        if not file_name:
            continue

        try:
            response = requests.get(full_url, timeout=30)
            # Without this check an HTTP error page (404, 500, ...) would be
            # written to disk as if it were the image.
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error downloading {full_url}: {e}")
            continue

        with open(os.path.join(out_dir, file_name), 'wb') as handler:
            handler.write(response.content)
        print(f"Downloaded {file_name}")

In [None]:
#scrape basic officer data from link
def scrape_website2(url):
    """Scrape the rows of the first table on the page at *url*.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list with one dict per ``<tr>`` of the first ``<table>``, with keys
        ``'name'``, ``'badge_number'``, ``'allegations'`` and
        ``'source_link'`` (last path segment of the ``src`` of the image in
        the row's ``<td class="photo">``, or ``None`` when there is none).
        An empty list is returned when the request fails or the page has no
        table; in both cases a message is printed.
    """
    try:
        # A timeout keeps one stalled request from hanging the whole scrape.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        # Same type as the success path so callers can always iterate/extend.
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')
    if table is None:
        print(f"No table found at {url}")
        return []

    rows = []
    for tr in table.find_all('tr'):
        # Text of all <td> cells in the row, joined with single spaces.
        row_text = ' '.join(td.text.strip() for td in tr.find_all(['td']))

        photo_td = tr.find(['td'], class_='photo')
        img_tag = photo_td.find('img') if photo_td else None
        src = img_tag.get('src') if img_tag else None
        # Guard on the attribute value rather than the tag: an <img> without
        # a src attribute must not crash the split below.
        source_link = src.split('/')[-1] if src is not None else None

        # The first line of the row text is taken as the name; the first two
        # whitespace-separated tokens of the second line as badge number and
        # allegation count.
        parts = row_text.split('\n')
        name = parts[0].strip()
        rest_parts = parts[1].split() if len(parts) > 1 else []
        badge_number = rest_parts[0] if rest_parts else None
        allegations = rest_parts[1] if len(rest_parts) > 1 else None

        rows.append({
            'name': name,
            'badge_number': badge_number,
            'allegations': allegations,
            'source_link': source_link,
        })

    return rows




In [None]:
# Try the scraper on a single command page before running it on every link.
# "?image=on" is the same query string process_links appends to each command URL.
url = 'https://www.50-a.org/command/1pct?image=on'
scraped_data = scrape_website2(url)

In [None]:
# Display the second scraped row as a sanity check of the parsed fields.
scraped_data[1]

{'name': 'Gut, Frank J.',
 'badge_number': '#5662',
 'allegations': '7',
 'source_link': '47B7-frank-gut.jpg'}

In [None]:
#process all police profile pages
import json
def process_links(urls):
    """Scrape every command page, download its images, and dump all rows.

    Args:
        urls: Iterable of site-relative command paths (e.g. ``'/command/1pct'``).

    Each path is turned into a full URL with ``?image=on`` appended, scraped
    with ``scrape_website2`` and its images fetched with ``download_images``.
    The rows of all pages are written to ``nypd.json`` once the loop is done.
    """
    imgurl = "https://www.50-a.org/images/officer/"
    collected = []

    for path in urls:
        commandurl = "https://www.50-a.org" + path + "?image=on"
        print(f"Processing {commandurl}")
        rows = scrape_website2(commandurl)
        collected += rows
        download_images(rows, imgurl)

    # One JSON file holding the rows of every page.
    with open('nypd.json', 'w') as out:
        json.dump(collected, out, indent=4)


In [None]:
# Run the full scrape over every link kept in links_list.
process_links(links_list)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Downloaded UKPP-christopher-wilson.jpg
Downloaded WE6X-dennis-burgos.jpg
Downloaded GEET-mark-sinatra.jpg
Downloaded EH5U-robert-zajac.jpg
Downloaded YH97-matthew-granahan.jpg
Downloaded DPF3-brennan-smith.jpg
Downloaded KSMM-joseph-tennariello.jpg
Downloaded DTW7-thomas-sosnowski.jpg
Downloaded JY9V-nicholas-castiello.jpg
Downloaded YM3-robert-schierenbeck.jpg
Downloaded RC9Z-john-zielin.jpg
Downloaded 3FS3-christopher-bamfo.jpg
Processing https://www.50-a.org/command/ESS8?image=on
Downloaded XJEM-michael-long.jpg
Downloaded SF5B-ryan-galvin.jpg
Downloaded X5WM-brian-benvenuto.jpg
Downloaded G9X3-daniel-bernstein.jpg
Downloaded BG57-michael-urbanek.jpg
Downloaded 75JT-patrick-malone.jpg
Downloaded TGU3-antonio-castelluccio.jpg
Downloaded UHJM-edmund-kocienda.jpg
Downloaded 3KAV-nolan-lauterborn.jpg
Downloaded 5GQ7-andres-gonzalez.jpg
Downloaded Y4UX-stephen-ruotolo.jpg
Downloaded VPWR-joseph-eliopoulos.jpg
Processing htt

In [None]:
# Display the second scraped row as a sanity check of the parsed fields.
scraped_data[1]

{'name': 'Gut, Frank J.',
 'badge_number': '#5662',
 'allegations': '7',
 'source_link': '/images/officer/384/47B7-frank-gut.avif'}

# Embed Profile Images with InsightFace

In [None]:
# Build the InsightFace analysis pipeline from the 'antelopev2' model pack,
# looked up relative to root='./'. Both the CUDA and the CPU execution
# providers are offered (presumably tried in the listed order - confirm).
app = FaceAnalysis(name='antelopev2', root='./', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
# NOTE(review): ctx_id=0 presumably selects the first GPU and det_size the
# detector input resolution - confirm against the InsightFace documentation.
app.prepare(ctx_id=0, det_size=(128, 128))

def generate_embedding(imgpath):
    """Return the L2-normalised embedding of the first face in an image.

    Args:
        imgpath: Path of an image file that PIL can open.

    Returns:
        A CPU ``torch.float32`` tensor with a leading batch axis of size 1
        holding the unit-length embedding of the first face reported by
        ``app.get``, or ``None`` when no face is found.
    """
    # Force three channels before flipping them: grayscale images have no
    # channel axis at all, and RGBA/palette images would not yield a
    # three-channel array after the [:, :, ::-1] reversal.
    with Image.open(imgpath) as pil_img:
        rgb = np.array(pil_img.convert('RGB'))
    img = rgb[:, :, ::-1]  # reverse the channel order (RGB -> BGR)

    faces = app.get(img)
    if not faces:
        return None

    # Normalise on the CPU: a round-trip through CUDA for a single vector
    # norm would only add a hard GPU requirement.
    id_emb = torch.tensor(faces[0]['embedding'], dtype=torch.float32)[None]
    return id_emb / torch.norm(id_emb, dim=1, keepdim=True)


Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}, 'CUDAExecutionProvider': {'use_tf32': '1', 'prefer_nhwc': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_enable': '0', 'use_ep_level_unified_stream': '0', 'device_id': '0', 'has_user_compute_stream': '0', 'gpu_external_empty_cache': '0', 'cudnn_conv_algo_search': 'EXHAUSTIVE', 'cudnn_conv1d_pad_to_nc1d': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_alloc': '0', 'gpu_external_free': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'enable_cuda_graph': '0', 'user_compute_stream': '0', 'cudnn_conv_use_max_workspace': '1'}}
find model: ./models/antelopev2/1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}, 'CUDAExecutionProvider

In [None]:
def process_images_in_folder(folder_path, checkpoint_dir):
    """Embed every image in *folder_path*, checkpointing progress on the way.

    Args:
        folder_path: Directory whose regular files are all treated as images.
        checkpoint_dir: Directory (created if missing) that receives
            ``embeddings_checkpoint.pkl`` and ``embeddings_final.pkl``.

    Returns:
        ``(image_names, embeddings)``: parallel lists of file names and the
        NumPy embedding computed for each. Files for which
        ``generate_embedding`` returns ``None`` or that cannot be read are
        left out of both lists; since they are not recorded anywhere, they
        are retried on every resume.
    """
    checkpoint_interval = 10
    checkpoint_path = os.path.join(checkpoint_dir, 'embeddings_checkpoint.pkl')
    final_path = os.path.join(checkpoint_dir, 'embeddings_final.pkl')

    os.makedirs(checkpoint_dir, exist_ok=True)

    embeddings = []
    image_names = []

    # Resume from an earlier run if a checkpoint exists.
    if os.path.exists(checkpoint_path):
        # NOTE: unpickling can execute arbitrary code; only resume from
        # checkpoint files written by this notebook.
        with open(checkpoint_path, 'rb') as f:
            image_names, embeddings = pickle.load(f)
        print(f"Checkpoint loaded. Resuming from {len(image_names)} images.")

    def save(path):
        # Write to a sibling temp file and rename it into place so that an
        # interrupted run cannot leave a truncated pickle behind.
        tmp_path = path + '.tmp'
        with open(tmp_path, 'wb') as f:
            pickle.dump((image_names, embeddings), f)
        os.replace(tmp_path, path)

    all_image_files = [
        f for f in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, f))
    ]

    # Skip the files that already have an embedding in the checkpoint.
    processed_images = set(image_names)
    image_files = [f for f in all_image_files if f not in processed_images]

    progress = tqdm(
        image_files,
        desc="Processing images",
        initial=len(processed_images),
        total=len(all_image_files),
    )
    for image_file in progress:
        imgpath = os.path.join(folder_path, image_file)
        try:
            embedding = generate_embedding(imgpath)
        except OSError as e:
            # One unreadable or non-image file must not abort the whole run.
            print(f"Skipping {image_file}: {e}")
            continue
        if embedding is None:
            continue

        embeddings.append(embedding.numpy())
        image_names.append(image_file)

        # Checkpoint only right after a new embedding was added; testing the
        # count on every iteration would rewrite the same checkpoint for each
        # faceless image while the count sits on a multiple of the interval.
        if len(image_names) % checkpoint_interval == 0:
            save(checkpoint_path)
            print(f"Checkpoint saved after {len(image_names)} images")

    # Save final embeddings
    save(final_path)
    print("Final embeddings saved")

    return image_names, embeddings

folder_path = '/content/drive/MyDrive/faces/NYPD_profile_imgs'
checkpoint_dir = '/content/drive/MyDrive/faces/emb_checkpoints'
# Keep the results at notebook scope so that later cells can reuse them.
image_names, embeddings = process_images_in_folder(folder_path, checkpoint_dir)

Processing images:   0%|          | 0/11607 [00:00<?, ?it/s]

  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


Checkpoint saved after 10 images
Checkpoint saved after 20 images
Checkpoint saved after 30 images
Checkpoint saved after 40 images
Checkpoint saved after 50 images
Checkpoint saved after 60 images
Checkpoint saved after 70 images
Checkpoint saved after 80 images
Checkpoint saved after 90 images
Checkpoint saved after 100 images
Checkpoint saved after 110 images
Checkpoint saved after 120 images
Checkpoint saved after 130 images
Checkpoint saved after 140 images
Checkpoint saved after 150 images
Checkpoint saved after 160 images
Checkpoint saved after 170 images
Checkpoint saved after 180 images
Checkpoint saved after 190 images
Checkpoint saved after 200 images
Checkpoint saved after 210 images
Checkpoint saved after 220 images
Checkpoint saved after 230 images
Checkpoint saved after 240 images
Checkpoint saved after 250 images
Checkpoint saved after 260 images
Checkpoint saved after 270 images
Checkpoint saved after 280 images
Checkpoint saved after 290 images
Checkpoint saved after 

In [None]:
# Write the final embeddings file by hand.
# NOTE(review): `image_names` and `embeddings` must already exist at notebook
# scope for this cell to run; the cell does not define them itself.
final_path = os.path.join(checkpoint_dir, 'embeddings_final.pkl')
with open(final_path, 'wb') as f:
    pickle.dump((image_names, embeddings), f)
print("Final embeddings saved")

# Embed Profile Images with ArcFace

In [None]:

# Face-analysis dependencies used in this section.
!pip install deepface
# The face_recognition library is published on PyPI as "face-recognition";
# "face-face_recognition" looks like a typo and would not resolve.
!pip install face-recognition
!pip install mediapipe
!pip install opencv-python


Collecting deepface
  Downloading deepface-0.0.92-py3-none-any.whl.metadata (27 kB)
Collecting mtcnn>=0.1.0 (from deepface)
  Downloading mtcnn-0.1.1-py3-none-any.whl.metadata (5.8 kB)
Collecting retina-face>=0.0.1 (from deepface)
  Downloading retina_face-0.0.17-py3-none-any.whl.metadata (10 kB)
Collecting fire>=0.4.0 (from deepface)
  Downloading fire-0.6.0.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.4/88.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gunicorn>=20.1.0 (from deepface)
  Downloading gunicorn-22.0.0-py3-none-any.whl.metadata (4.4 kB)
Downloading deepface-0.0.92-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.5/105.5 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gunicorn-22.0.0-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.4/84.4 kB[0m [31m7.4 MB/s[0m eta 

In [None]:
import cv2
import os
import numpy as np
import mediapipe as mp
import matplotlib.pyplot as plt
import math
import shutil

from mediapipe.tasks import python
from mediapipe.tasks.python import vision

from typing import Tuple, Union
import math

from PIL import Image


In [None]:
from huggingface_hub import hf_hub_download

# Fetch the Arc2Face config and weight files (model and encoder) into ./models.
for _arc2face_file in (
    "arc2face/config.json",
    "arc2face/diffusion_pytorch_model.safetensors",
    "encoder/config.json",
    "encoder/pytorch_model.bin",
):
    hf_hub_download(repo_id="FoivosPar/Arc2Face", filename=_arc2face_file, local_dir="./models")

Downloading arc2face/config.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

Downloading (…)ch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Downloading encoder/config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/492M [00:00<?, ?B/s]

'./models/encoder/pytorch_model.bin'

In [None]:
# Clone the Arc2Face code into the current working directory.
!git clone https://github.com/foivospar/Arc2Face.git

# Install the repo's pinned requirements.
# NOTE(review): the absolute path assumes the clone above ran from /content - confirm.
!pip install -r /content/Arc2Face/requirements.txt

Cloning into 'Arc2Face'...
remote: Enumerating objects: 120, done.[K
remote: Counting objects: 100% (120/120), done.[K
remote: Compressing objects: 100% (101/101), done.[K
remote: Total 120 (delta 53), reused 59 (delta 16), pack-reused 0[K
Receiving objects: 100% (120/120), 29.07 MiB | 14.46 MiB/s, done.
Resolving deltas: 100% (53/53), done.
Collecting numpy<1.24.0 (from -r /content/Arc2Face/requirements.txt (line 1))
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting torch==2.0.1 (from -r /content/Arc2Face/requirements.txt (line 2))
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchvision==0.15.2 (from -r /content/Arc2Face/requirements.txt (line 3))
  Downloading torchvision-0.15.2-cp310-cp310-manylinux1_x86_64.whl.metadata (11 kB)
Collecting diffusers==0.23.0 (from -r /content/Arc2Face/requirements.txt (line 4))
  Downloading diffusers-0.23.0-py3-none-any.whl.metadata (17 kB

In [None]:
!mkdir -p ./models/antelopev2
!cp -r /content/drive/MyDrive/models/antelopev2/* /content/models/antelopev2

from huggingface_hub import hf_hub_download
hf_hub_download(repo_id="FoivosPar/Arc2Face", filename="arcface.onnx", local_dir="./models/antelopev2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


arcface.onnx:   0%|          | 0.00/261M [00:00<?, ?B/s]

'models/antelopev2/arcface.onnx'

In [None]:
# Reinstall transformers and huggingface_hub from scratch.
# -y answers pip's "Proceed (Y/n)?" prompt so the cell does not block
# waiting for input when the notebook is run non-interactively.
!pip uninstall -y transformers huggingface_hub
!pip install transformers huggingface_hub


Found existing installation: transformers 4.44.0
Uninstalling transformers-4.44.0:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.10/dist-packages/transformers-4.44.0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/transformers/*
Proceed (Y/n)? Y
  Successfully uninstalled transformers-4.44.0
Found existing installation: huggingface-hub 0.24.5
Uninstalling huggingface-hub-0.24.5:
  Would remove:
    /usr/local/bin/huggingface-cli
    /usr/local/lib/python3.10/dist-packages/huggingface_hub-0.24.5.dist-info/*
    /usr/local/lib/python3.10/dist-packages/huggingface_hub/*
Proceed (Y/n)? Y
  Successfully uninstalled huggingface-hub-0.24.5
Collecting transformers
  Using cached transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
Collecting huggingface_hub
  Using cached huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)
Using cached transformers-4.44.0-py3-none-any.whl (9.5 MB)
Using cached huggingface_hub-0.24.5-py3-none-any.whl (417 kB)
Insta

In [None]:
# Pin transformers to 4.33.0.
# NOTE(review): presumably required for compatibility with the Arc2Face code - confirm.
!pip install transformers==4.33.0


Collecting transformers==4.33.0
  Downloading transformers-4.33.0-py3-none-any.whl.metadata (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.33.0-py3-none-any.whl (7.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m108.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.28.0
    Uninstalling transformers-4.28.0:
      Successfully uninstalled transformers-4.28.0
Successfully installed transformers-4.33.0


In [None]:
import sys
# Put the Arc2Face checkout on the import path so that `arc2face` can be imported.
sys.path.append('/content/Arc2Face')

In [None]:
import insightface
from insightface.app import FaceAnalysis
from arc2face import CLIPTextModelWrapper, project_face_embs
import torch
from PIL import Image
import numpy as np
import os
import cv2
from insightface.app.face_analysis import Face

import json
import logging
from tqdm.notebook import tqdm
import pickle

In [None]:
# Build the InsightFace analysis pipeline from the 'antelopev2' model pack,
# looked up relative to root='./'. Both the CUDA and the CPU execution
# providers are offered (presumably tried in the listed order - confirm).
app = FaceAnalysis(name='antelopev2', root='./', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
# NOTE(review): ctx_id=0 presumably selects the first GPU and det_size the
# detector input resolution - confirm against the InsightFace documentation.
app.prepare(ctx_id=0, det_size=(128, 128))

def generate_embedding(imgpath):
    """Return the L2-normalised embedding of the first face in an image.

    Args:
        imgpath: Path of an image file that PIL can open.

    Returns:
        A CPU ``torch.float32`` tensor with a leading batch axis of size 1
        holding the unit-length embedding of the first face reported by
        ``app.get``, or ``None`` when no face is found.
    """
    # Force three channels before flipping them: grayscale images have no
    # channel axis at all, and RGBA/palette images would not yield a
    # three-channel array after the [:, :, ::-1] reversal.
    with Image.open(imgpath) as pil_img:
        rgb = np.array(pil_img.convert('RGB'))
    img = rgb[:, :, ::-1]  # reverse the channel order (RGB -> BGR)

    faces = app.get(img)
    if not faces:
        return None

    # Normalise on the CPU: a round-trip through CUDA for a single vector
    # norm would only add a hard GPU requirement.
    id_emb = torch.tensor(faces[0]['embedding'], dtype=torch.float32)[None]
    return id_emb / torch.norm(id_emb, dim=1, keepdim=True)


Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}, 'CUDAExecutionProvider': {'use_tf32': '1', 'prefer_nhwc': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_enable': '0', 'use_ep_level_unified_stream': '0', 'device_id': '0', 'has_user_compute_stream': '0', 'gpu_external_empty_cache': '0', 'cudnn_conv_algo_search': 'EXHAUSTIVE', 'cudnn_conv1d_pad_to_nc1d': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_alloc': '0', 'gpu_external_free': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'enable_cuda_graph': '0', 'user_compute_stream': '0', 'cudnn_conv_use_max_workspace': '1'}}
find model: ./models/antelopev2/1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}, 'CUDAExecutionProvider

In [None]:
def process_images_in_folder(folder_path, checkpoint_dir):
    """Embed every image in *folder_path*, checkpointing progress on the way.

    Args:
        folder_path: Directory whose regular files are all treated as images.
        checkpoint_dir: Directory (created if missing) that receives
            ``embeddings_checkpoint.pkl`` and ``embeddings_final.pkl``.

    Returns:
        ``(image_names, embeddings)``: parallel lists of file names and the
        NumPy embedding computed for each. Files for which
        ``generate_embedding`` returns ``None`` or that cannot be read are
        left out of both lists; since they are not recorded anywhere, they
        are retried on every resume.
    """
    checkpoint_interval = 10
    checkpoint_path = os.path.join(checkpoint_dir, 'embeddings_checkpoint.pkl')
    final_path = os.path.join(checkpoint_dir, 'embeddings_final.pkl')

    os.makedirs(checkpoint_dir, exist_ok=True)

    embeddings = []
    image_names = []

    # Resume from an earlier run if a checkpoint exists.
    if os.path.exists(checkpoint_path):
        # NOTE: unpickling can execute arbitrary code; only resume from
        # checkpoint files written by this notebook.
        with open(checkpoint_path, 'rb') as f:
            image_names, embeddings = pickle.load(f)
        print(f"Checkpoint loaded. Resuming from {len(image_names)} images.")

    def save(path):
        # Write to a sibling temp file and rename it into place so that an
        # interrupted run cannot leave a truncated pickle behind.
        tmp_path = path + '.tmp'
        with open(tmp_path, 'wb') as f:
            pickle.dump((image_names, embeddings), f)
        os.replace(tmp_path, path)

    all_image_files = [
        f for f in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, f))
    ]

    # Skip the files that already have an embedding in the checkpoint.
    processed_images = set(image_names)
    image_files = [f for f in all_image_files if f not in processed_images]

    progress = tqdm(
        image_files,
        desc="Processing images",
        initial=len(processed_images),
        total=len(all_image_files),
    )
    for image_file in progress:
        imgpath = os.path.join(folder_path, image_file)
        try:
            embedding = generate_embedding(imgpath)
        except OSError as e:
            # One unreadable or non-image file must not abort the whole run.
            print(f"Skipping {image_file}: {e}")
            continue
        if embedding is None:
            continue

        embeddings.append(embedding.numpy())
        image_names.append(image_file)

        # Checkpoint only right after a new embedding was added; testing the
        # count on every iteration would rewrite the same checkpoint for each
        # faceless image while the count sits on a multiple of the interval.
        if len(image_names) % checkpoint_interval == 0:
            save(checkpoint_path)
            print(f"Checkpoint saved after {len(image_names)} images")

    # Save final embeddings
    save(final_path)
    print("Final embeddings saved")

    return image_names, embeddings

folder_path = '/content/drive/MyDrive/faces/NYPD_profile_imgs'
checkpoint_dir = '/content/drive/MyDrive/faces/emb_checkpoints'
# Keep the results at notebook scope so that later cells can reuse them.
image_names, embeddings = process_images_in_folder(folder_path, checkpoint_dir)

Processing images:   0%|          | 0/11699 [00:00<?, ?it/s]

  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


Checkpoint saved after 10 images
Checkpoint saved after 20 images
Checkpoint saved after 30 images
Checkpoint saved after 40 images
Checkpoint saved after 50 images
Checkpoint saved after 60 images
Checkpoint saved after 70 images
Checkpoint saved after 80 images
Checkpoint saved after 90 images
Checkpoint saved after 100 images
Checkpoint saved after 110 images
Checkpoint saved after 120 images
Checkpoint saved after 130 images
Checkpoint saved after 140 images
Checkpoint saved after 150 images
Checkpoint saved after 160 images
Checkpoint saved after 170 images
Checkpoint saved after 180 images
Checkpoint saved after 190 images
Checkpoint saved after 200 images
Checkpoint saved after 210 images
Checkpoint saved after 220 images
Checkpoint saved after 230 images
Checkpoint saved after 240 images
Checkpoint saved after 250 images
Checkpoint saved after 260 images
Checkpoint saved after 270 images
Checkpoint saved after 280 images
Checkpoint saved after 290 images
Checkpoint saved after 