In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
import pickle
import os
from imutils import build_montages
from PIL import Image
import io
import gc

import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# News outlets covered by this analysis; each name is also used as a key
# when the embeddings are grouped per outlet later in the notebook.
outlets = ['gatewaypundit', 'huffpost', 'nyt']

In [3]:
# Face-embedding model names; the loaded `data` dict is indexed by these
# names elsewhere in the notebook (e.g. data['VGG-Face'], data[models[0]]).
models = ["VGG-Face", "Facenet", "Facenet512"]

# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so only point this at an embeddings file you produced or otherwise trust.
with open('embeddings_alignment_2k_2017', 'rb') as embeddings_file:
    data = pickle.load(embeddings_file)

In [4]:
# Number of entries stored for the VGG-Face model. The rest of the notebook
# treats each key as an image name mapped to an iterable of face embeddings,
# so this counts images, not individual faces.
len(data['VGG-Face'])

11168

In [5]:
def dbscan(data, model, eps, min_samples):
    """Cluster face embeddings with DBSCAN and group image names by cluster.

    Parameters
    ----------
    data : dict
        Maps an image name to an iterable of embedding vectors (one per face
        found in that image). An image with several embeddings contributes
        one point per embedding, so its name can appear several times in a
        cluster or in more than one cluster.
    model : str
        Name of the embedding model. Only used for the progress printout.
    eps : float
        DBSCAN neighbourhood radius (Euclidean distance).
    min_samples : int
        DBSCAN minimum number of points required to form a dense region.

    Returns
    -------
    dict[int, list]
        Maps each cluster label (0, 1, ...) to the image names of the
        embeddings assigned to it, in input order. Points labelled as noise
        (-1) are left out. Returns an empty dict when `data` holds no
        embeddings.
    """
    print(f"eps: {eps}, min_samples: {min_samples}")
    print("model: ", model)

    # Flatten {image: [embedding, ...]} into two parallel lists so that
    # position j of `images` names the image embedding j came from.
    embeddings = []
    images = []
    for img, ebs in data.items():
        for eb in ebs:
            embeddings.append(eb)
            images.append(img)

    # Fitting on an empty list raises inside scikit-learn; with nothing to
    # cluster there are simply no clusters.
    if not embeddings:
        print("unique faces: 0")
        return {}

    clt = DBSCAN(metric="euclidean", eps=eps, min_samples=min_samples)
    clt.fit(embeddings)
    labels = clt.labels_

    # Keep only real clusters; -1 is the label DBSCAN gives to noise points.
    # Selecting by value (instead of skipping the first unique label) keeps
    # cluster 0 even when no point was labelled as noise.
    cluster_ids = [int(label) for label in np.unique(labels) if label > -1]
    print(f"unique faces: {len(cluster_ids)}")

    # Single pass over the points instead of one full scan per cluster.
    clusters = {cluster_id: [] for cluster_id in cluster_ids}
    for image, label in zip(images, labels):
        if label > -1:
            clusters[int(label)].append(image)
    return clusters

In [6]:
# Cluster the full embedding set once per model; eps / min_samples are set
# separately for each model.
clusters1 = dbscan(data=data[models[0]], model=models[0], eps=1, min_samples=3)
clusters2 = dbscan(data=data[models[1]], model=models[1], eps=8.9, min_samples=2)
clusters3 = dbscan(data=data[models[2]], model=models[2], eps=17.7, min_samples=4)

eps: 1, min_samples: 3
model:  VGG-Face
unique faces: 94
eps: 8.9, min_samples: 2
model:  Facenet
unique faces: 407
eps: 17.7, min_samples: 4
model:  Facenet512
unique faces: 136


In [None]:
# Size of every cluster in `clusters1`, keyed by the cluster's position.
n_c1 = {position: len(members) for position, members in enumerate(clusters1.values())}
# Largest clusters first; dicts preserve insertion order, so this is the
# order in which the sizes are displayed.
n_c1_sorted = dict(sorted(n_c1.items(), key=lambda item: item[1], reverse=True))
n_c1_sorted

In [None]:
# Size of every cluster in `clusters2`, keyed by the cluster's position.
n_c2 = {position: len(members) for position, members in enumerate(clusters2.values())}
# Largest clusters first; dicts preserve insertion order, so this is the
# order in which the sizes are displayed.
n_c2_sorted = dict(sorted(n_c2.items(), key=lambda item: item[1], reverse=True))
n_c2_sorted

In [None]:
# Size of every cluster in `clusters3`, keyed by the cluster's position.
n_c3 = {position: len(members) for position, members in enumerate(clusters3.values())}
# Largest clusters first; dicts preserve insertion order, so this is the
# order in which the sizes are displayed.
n_c3_sorted = dict(sorted(n_c3.items(), key=lambda item: item[1], reverse=True))
n_c3_sorted

In [10]:
# Split the VGG-Face embeddings into one {image name: embeddings} dict per outlet.
outlet_data = {name: {} for name in outlets}
for image_name, face_embeddings in data['VGG-Face'].items():
    # The outlet is the first "_"-separated token of the image name with its
    # last four characters removed (presumably a year suffix - TODO confirm).
    # An image whose derived outlet is not listed in `outlets` raises KeyError.
    source_token = image_name.split("_")[0]
    outlet_data[source_token[:-4]][image_name] = face_embeddings

In [11]:
# Cluster each outlet's VGG-Face embeddings separately, using the same
# eps / min_samples as the all-outlet VGG-Face run.
gatewaypundit_clustering = dbscan(data=outlet_data['gatewaypundit'], model=models[0], eps=1, min_samples=3)
huffpost_clustering = dbscan(data=outlet_data['huffpost'], model=models[0], eps=1, min_samples=3)
nyt_clustering = dbscan(data=outlet_data['nyt'], model=models[0], eps=1, min_samples=3)

eps: 1, min_samples: 3
model:  VGG-Face
unique faces: 91
eps: 1, min_samples: 3
model:  VGG-Face
unique faces: 41
eps: 1, min_samples: 3
model:  VGG-Face
unique faces: 29


In [None]:
# Size of every gatewaypundit cluster, keyed by the cluster's position.
gatewaypundit_n = {
    position: len(members)
    for position, members in enumerate(gatewaypundit_clustering.values())
}
# Largest clusters first; dicts preserve insertion order, so this is the
# order in which the sizes are displayed.
gatewaypundit_n_sorted = dict(sorted(gatewaypundit_n.items(), key=lambda item: item[1], reverse=True))
gatewaypundit_n_sorted

In [None]:
# Size of every huffpost cluster, keyed by the cluster's position.
huffpost_n = {
    position: len(members)
    for position, members in enumerate(huffpost_clustering.values())
}
# Largest clusters first; dicts preserve insertion order, so this is the
# order in which the sizes are displayed.
huffpost_n_sorted = dict(sorted(huffpost_n.items(), key=lambda item: item[1], reverse=True))
huffpost_n_sorted

In [None]:
# Size of every nyt cluster, keyed by the cluster's position.
nyt_n = {
    position: len(members)
    for position, members in enumerate(nyt_clustering.values())
}
# Largest clusters first; dicts preserve insertion order, so this is the
# order in which the sizes are displayed.
nyt_n_sorted = dict(sorted(nyt_n.items(), key=lambda item: item[1], reverse=True))
nyt_n_sorted

In [15]:
from google.cloud import storage
from google.oauth2 import service_account
from google.cloud import storage
# Build a Google Cloud Storage client from a service-account key file.
# The key path is relative to the notebook's working directory.
# NOTE(review): the key file name is hard-coded; consider reading the path
# from an environment variable and make sure the key is never committed.
credentials = service_account.Credentials.from_service_account_file(
    "./sylvan-airship-XXX.json")
client = storage.Client(credentials=credentials)
# Handle to the bucket that the image-preview cell downloads from.
bucket = client.get_bucket('full_images_2024')
# NOTE(review): `blobs` is not read by the preview cell that follows;
# confirm whether it is still needed.
blobs = bucket.list_blobs()

In [None]:
# Preview up to the first 100 images of cluster 0 from `clusters1` by
# downloading each one from the bucket and rendering it inline.
for img in clusters1[0][:100]:
    # The first "_"-separated token of the file name is the sub-folder; the
    # same token without its last four characters is the parent folder.
    outlet = img.split("_")[0]
    img_path = f"alignment/{outlet[:-4]}/{outlet}/{img}"
    print(img_path)
    try:
        blob = bucket.blob(img_path)
        image_bytes = blob.download_as_bytes()
        image = Image.open(io.BytesIO(image_bytes))
        plt.imshow(image)
        plt.axis('off')  # Turn off axis numbering
        plt.show()
    except Exception as exc:
        # Best effort: skip images that cannot be fetched or decoded, but
        # report the real cause. Catching Exception (not a bare except) lets
        # KeyboardInterrupt through, so "Interrupt Kernel" can stop the loop.
        print(f"!!!!!!!!!!!!!!not found!!!!!!!!!!!!!! ({type(exc).__name__}: {exc})")