# Dimensionality Estimation

DeepMDS

## 0. Setup

In [None]:
import os
import numpy as np
import cv2
from insightface.app import FaceAnalysis
from tqdm import tqdm
import pandas as pd
from joblib import Parallel, delayed

from dimestimation_pipeline import estimate_intrinsic_dimension
import matplotlib.pyplot as plt
from IPython.display import HTML, display
from scipy.stats import gaussian_kde
from sklearn.neighbors import NearestNeighbors
from scipy.stats import pearsonr, spearmanr

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' #use both GPU

## 1. Load FRGC dataset

In [3]:
FRGC_root = '/d1/jferna27/FRGC-nd1/all'
csv_path = '../frgc cluster/frgc_face_embeddings.csv'

if os.path.exists(csv_path): # CSV found, load from it
    print(f"Found {csv_path}, person ids loaded from csv")
    df = pd.read_csv(csv_path) #df.columns: 512d,image_file,, person_id
    image_files = df['image_file'].tolist()
    print(f"Total images: {len(image_files)}")
    person_ids = df['person_id'].tolist()  #person_id column from CSV
    unique_persons = sorted(list(set(person_ids)))
    print(f"Identities: {len(unique_persons)}")
    csv_pids = set(df['person_id'].tolist())

else: # CSV not found, use FRGC_root directory scan
    image_files = [f for f in os.listdir(FRGC_root) if f.endswith('.JPG') or f.endswith('.jpg')]
    print(f"Total images: {len(image_files)}")
    person_ids = [img[:5] for img in image_files] #person id (1st 5 characters)
    unique_persons = sorted(list(set(person_ids)))
    print(f"Identities: {len(unique_persons)}")
    

Found ../frgc cluster/frgc_face_embeddings.csv, person ids loaded from csv
Total images: 39327
Identities: 568


## 2. Initialize Face Detection + Embeddings
## 3. Extract embeddings (parallel processing on both GPUs)

In [4]:
if os.path.exists(csv_path): #use csv if exists
    print("Found frgc_face_embeddings.csv")
    df = pd.read_csv(csv_path)
    drop_cols = ['image_file', 'person_id']
    emb_cols = [c for c in df.columns if c not in drop_cols]
    embeddings = df[emb_cols].values
    valid_person_ids = df['person_id'].tolist()  # Use person_id directly
    valid_image_files = df['image_file'].tolist()

else: #csv does not exist, process images to extract embeddings
    print(f"Processing with GPU 0 and 1")
    def process_batch_worker(gpu_id, batch_data):
        app = FaceAnalysis(name="buffalo_l")
        app.prepare(ctx_id=gpu_id, det_size=(160, 160))
        embs, pids, files = [], [], []
        for img_file, pid in tqdm(batch_data, desc=f"GPU {gpu_id}"):
            img = cv2.imread(os.path.join(FRGC_root, img_file))
            if img is not None:
                faces = app.get(img)
                if len(faces) > 0:
                    embs.append(faces[0].embedding)
                    pids.append(pid)
                    files.append(img_file)
        return {'embs': embs, 'pids': pids, 'files': files}
    
    person_ids_list = [img[:5] for img in image_files]
    mid = len(image_files) // 2
    results = Parallel(n_jobs=2, backend='loky')(
        delayed(process_batch_worker)(gpu_id, list(zip(image_files[start:end], person_ids_list[start:end])))
        for gpu_id, (start, end) in [(0, (0, mid)), (1, (mid, len(image_files)))]
    )
    embeddings = np.vstack([np.array(r['embs']) for r in results])
    valid_person_ids = sum([r['pids'] for r in results], [])
    valid_image_files = sum([r['files'] for r in results], [])

print(f"Embeddings: {embeddings.shape}")
print(f"Valid images: {len(valid_image_files)}")
print(f"Unique person_ids: {len(set(valid_person_ids))}")

Found frgc_face_embeddings.csv
Embeddings: (39327, 512)
Valid images: 39327
Unique person_ids: 568


In [5]:
if os.path.exists(csv_path):
    print(f"{csv_path} already exists.")
else: #save csv
    embeddings_df = pd.DataFrame(embeddings)
    embeddings_df['person_id'] = valid_person_ids
    embeddings_df['image_file'] = valid_image_files
    embeddings_df.to_csv('frgc_face_embeddings2.csv', index=False)
    print(f"Embeddings and person_ids saved to 'frgc_face_embeddings2.csv'")

../frgc cluster/frgc_face_embeddings.csv already exists.


## 4. Cluster by identity

In [6]:
labels_arr = np.array(valid_person_ids) #array of person_ids
unique_ids = np.unique(labels_arr) #unique person ids
identity_indices = {pid: np.where(labels_arr == pid)[0] for pid in unique_ids} #map person_id to indices
identity_clusters = {pid: embeddings[idxs] for pid, idxs in identity_indices.items()} #map person_id to embeddings

In [7]:
cluster_stats = {}
for pid, cluster in identity_clusters.items():
    centroid = cluster.mean(axis=0)
    distances = np.linalg.norm(cluster - centroid, axis=1)
    cluster_stats[pid] = {
        "centroid": centroid,
        "radius": float(distances.max()),
        "dispersion": float(distances.std()),
        "num_images": int(len(cluster))
    }

In [8]:
print(cluster_stats)

{2463: {'centroid': array([ 1.36079760e+00, -5.53617906e-01,  8.19623687e-01, -5.97872120e-01,
        7.04318322e-01,  6.38033939e-01, -2.29981774e-02,  7.24419654e-01,
        1.26011928e+00,  6.89536185e-01, -4.32604623e-01,  4.37730998e-01,
       -9.29212914e-01, -1.88015310e-01,  2.98649382e-01,  1.77353284e-02,
        2.62412907e-02,  1.93048870e-02, -2.61366673e-01,  6.01749572e-01,
       -4.30036073e-02, -1.51317886e+00,  1.53724335e+00, -1.48630439e-01,
        2.90227277e-01, -4.13044099e-01,  9.34751462e-01,  1.05606430e+00,
       -1.50293793e+00, -1.18616631e+00, -4.88096725e-01,  5.07334496e-02,
        1.39240670e-01,  6.09696958e-01, -9.55037789e-02, -9.25457512e-01,
       -1.76883396e+00, -1.88725922e-01,  5.79524863e-01,  1.51770896e+00,
       -1.15441726e+00,  1.01814885e+00,  7.52622860e-03, -8.39216265e-01,
        2.34194277e-01,  1.56428035e-01, -1.26693369e-01,  7.43600427e-01,
        6.68994034e-01, -1.41671365e+00, -6.81525315e-01, -3.96701126e-01,
     

## 5. Intrinsic dimensionality

1. Intrinsic dimensionality of the manifold = m
2. Learn a function that map from R^d -> R^m usig DeepMDS

### 5.1 Intrinsic Dimension Estimation (Sec 3.1. Estimating Intrinsic Dimension)

#### 5.1.1. Global ID

In [9]:
# Global Intrinsic Dimension
print("="*70)
print("Global Intrinsic Dimension Estimation")
print("="*70)

global_id, global_info = estimate_intrinsic_dimension(embeddings, k=15)
print(f"\n Global Intrinsic Dimensionality: {global_id}")
print(f"   Ambient Space: {embeddings.shape[1]}D")
print(f"   Compression factor: {embeddings.shape[1]/global_id}x")
print(f"   r_max: {global_info['r_max']}")
print(f"   sigma: {global_info['sigma']}")

Global Intrinsic Dimension Estimation



 Global Intrinsic Dimensionality: 11.136551061856972
   Ambient Space: 512D
   Compression factor: 45.97473644723057x
   r_max: 11.642777747056423
   sigma: 0.28076911050888786


#### 5.1.2. ID per Identity

In [10]:
# Intrinsic Dimension per Identity
print("="*70)
print("Intrinsic Dimension per Identity")
print("="*70)

id_results = {}
for pid, cluster_emb in identity_clusters.items():
    n = int(cluster_emb.shape[0])

    m_id, info = estimate_intrinsic_dimension(cluster_emb, k=10)  # k más bajo para clusters pequeños
    id_results[pid] = {
        "id": float(m_id) if m_id is not None else np.nan,
        "num_images": n,
        "r_max": info.get("r_max", np.nan),
        "sigma": info.get("sigma", np.nan)
    }

valid_items = [(pid, v) for pid, v in id_results.items() if np.isfinite(v["id"])]
valid_ids = [v["id"] for _, v in valid_items]
nan_count = sum(1 for v in id_results.values() if not np.isfinite(v["id"]))
sorted_ids = sorted(
    valid_items,
    key=lambda x: x[1]["id"]
)
print(f"\nAnalized identities: {len(id_results)}")
print(f"Valid IDs: {len(valid_ids)} | NaN IDs: {nan_count}")
if len(valid_ids) > 0:
    print(f"Average ID: {np.mean(valid_ids):.2f}")
    print(f"Std ID: {np.std(valid_ids):.2f}")
else:
    print("Average ID: n/a")
    print("Std ID: n/a")

print("\n--- Top 10 identities with min ID (compacted) ---")
for pid, info in sorted_ids[:10]:
    print(f"{pid}: ID={info['id']:.2f}, number of images={info['num_images']}")
print("\n--- Top 10 identities with max ID (variation) ---")
for pid, info in sorted_ids[-10:]:
    print(f"{pid}: ID={info['id']:.2f}, number of images={info['num_images']}")

Intrinsic Dimension per Identity

Analized identities: 568
Valid IDs: 565 | NaN IDs: 3
Average ID: 8.90
Std ID: 11.54

--- Top 10 identities with min ID (compacted) ---
4704: ID=1.00, number of images=18
4591: ID=1.44, number of images=6
4538: ID=1.53, number of images=6
4911: ID=1.59, number of images=48
4800: ID=1.63, number of images=6
4710: ID=1.71, number of images=6
4844: ID=1.73, number of images=6
4807: ID=1.74, number of images=6
4912: ID=1.74, number of images=6
4752: ID=1.75, number of images=6

--- Top 10 identities with max ID (variation) ---
4873: ID=57.04, number of images=18
4229: ID=57.30, number of images=84
4789: ID=61.18, number of images=12
4652: ID=63.14, number of images=42
4889: ID=68.54, number of images=18
4580: ID=76.94, number of images=200
4397: ID=78.93, number of images=194
4654: ID=80.00, number of images=18
4604: ID=87.59, number of images=18
4546: ID=99.86, number of images=18


### 5.2. Histograms

In [11]:
id_df = pd.DataFrame(
    [
        {"pid": pid, "id": v["id"], "num_images": v["num_images"]}
        for pid, v in id_results.items()
    ]
).dropna()

##### 5.2.1. ID vs pid - ordered by ID

Exact ID

In [None]:
# ID per pid ordered by ID 
id_df_sorted = id_df.sort_values("id")
identities = id_df_sorted["pid"].astype(str).tolist()
dimensions = id_df_sorted["id"].tolist()
n = len(identities)
bar_width = 0.8
width_per_bar = 0.10
fig_width = max(18, n * width_per_bar)
fig_height = 8

fig, ax = plt.subplots(figsize=(fig_width, fig_height))
ax.bar(range(n), dimensions, color="blue", alpha=0.8, width=bar_width)
ax.set_xticks(range(n))
ax.set_xticklabels(identities, rotation=90, fontsize=6)
ax.set_xlabel("Person ID", fontsize=12)
# Y-axis ticks every 5 images
if dimensions:
    y_max = max(dimensions)
    ax.set_yticks(np.arange(0, y_max + 5, 5))
ax.set_ylabel("Intrinsic Dimension", fontsize=12)
ax.set_title("Intrinsic Dimension by Identity - ordered by ID", fontsize=14)
ax.grid(True, alpha=0.1, axis="y")
plt.tight_layout()

output_path = "intrinsic_all_label.png"
fig.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close(fig)

display(HTML(
    f"<div style='width:100%; overflow:auto; border:1px solid #ddd; padding:6px'>"
    f"<img src='{output_path}' style='height:auto;'>"
    f"</div>"
))

ID rounded to integer

In [72]:
# ID per pid ordered by ID (rounded to integer)
id_df_sorted = id_df.copy()
id_df_sorted["id_rounded"] = id_df_sorted["id"].round().astype(int)
id_df_sorted = id_df_sorted.sort_values("id_rounded")

identities = id_df_sorted["pid"].astype(str).tolist()
dimensions = id_df_sorted["id_rounded"].tolist()
n = len(identities)
bar_width = 0.8
width_per_bar = 0.10
fig_width = max(18, n * width_per_bar)
fig_height = 8

fig, ax = plt.subplots(figsize=(fig_width, fig_height))
ax.bar(range(n), dimensions, color="blue", alpha=0.8, width=bar_width)
ax.set_xticks(range(n))
ax.set_xticklabels(identities, rotation=90, fontsize=6)
ax.set_xlabel("Person ID", fontsize=12)
if dimensions:
    y_max = max(dimensions)
    ax.set_yticks(np.arange(0, y_max + 5, 5))
ax.set_ylabel("Intrinsic Dimension (rounded to integer)", fontsize=12)
ax.set_title("Intrinsic Dimension by Identity (rounded to integer) - ordered by ID", fontsize=14)
ax.grid(True, alpha=0.1, axis="y")
plt.tight_layout()

output_path = "intrinsic_all_label_rounded_integer.png"
fig.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close(fig)

display(HTML(
    f"<div style='width:100%; overflow:auto; border:1px solid #ddd; padding:6px'>"
    f"<img src='{output_path}' style='height:auto;'>"
    f"</div>"
))

##### 5.2.2. Number of imgs vs pid - ordered by ID

Exact ID

In [71]:
# Number of images per subject ordered by ID
id_df_sorted = id_df.sort_values("id")
identities = id_df_sorted["pid"].astype(str).tolist()
num_images = id_df_sorted["num_images"].tolist()
n = len(identities)
bar_width = 0.8
width_per_bar = 0.10
fig_width = max(18, n * width_per_bar)
fig_height = 8

fig, ax = plt.subplots(figsize=(fig_width, fig_height))
ax.bar(range(n), num_images, color="blue", alpha=0.8, width=bar_width)
ax.set_xticks(range(n))
ax.set_xticklabels(identities, rotation=90, fontsize=6)
ax.set_xlabel("Person ID (pid)", fontsize=12)
# Y-axis ticks every 5 images
if num_images:
    y_max = max(num_images)
    ax.set_yticks(np.arange(0, y_max + 5, 5))
ax.set_ylabel("Number of images", fontsize=12)
ax.set_title("Number of images per subject (pid) - ordered by ID", fontsize=14)
ax.grid(True, alpha=0.1, axis="y")
plt.tight_layout()

output_path = "num_images_all_label_ordered_by_id.png"
fig.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close(fig)

display(HTML(
    f"<div style='width:100%; overflow:auto; border:1px solid #ddd; padding:6px'>"
    f"<img src='{output_path}' style='height:auto;'>"
    f"</div>"
))

ID rounded to integer

In [70]:
# Number of images per subject ordered by ID (rounded to integer)
id_df_sorted = id_df.copy()
id_df_sorted["id_rounded"] = id_df_sorted["id"].round().astype(int)
id_df_sorted = id_df_sorted.sort_values("id_rounded")

identities = id_df_sorted["pid"].astype(str).tolist()
num_images = id_df_sorted["num_images"].tolist()
n = len(identities)
bar_width = 0.8
width_per_bar = 0.10
fig_width = max(18, n * width_per_bar)
fig_height = 8

fig, ax = plt.subplots(figsize=(fig_width, fig_height))
ax.bar(range(n), num_images, color="blue", alpha=0.8, width=bar_width)
ax.set_xticks(range(n))
ax.set_xticklabels(identities, rotation=90, fontsize=6)
ax.set_xlabel("Person ID (pid)", fontsize=12)
if num_images:
    y_max = max(num_images)
    ax.set_yticks(np.arange(0, y_max + 5, 5))
ax.set_ylabel("Number of images", fontsize=12)
ax.set_title("Number of images per subject (pid) - ordered by rounded integer ID", fontsize=14)
ax.grid(True, alpha=0.1, axis="y")
plt.tight_layout()

output_path = "num_images_all_label_ordered_by_id_integer.png"
fig.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close(fig)

display(HTML(
    f"<div style='width:100%; overflow:auto; border:1px solid #ddd; padding:6px'>"
    f"<img src='{output_path}' style='height:auto;'>"
    f"</div>"
))

##### 5.2.3. Number of imgs vs pid - ordered by Number of imgs

In [69]:
# Number of images per subject ordered by number of images
id_df_sorted = id_df.sort_values("num_images")
identities = id_df_sorted["pid"].astype(str).tolist()
num_images = id_df_sorted["num_images"].tolist()
n = len(identities)
bar_width = 0.8
width_per_bar = 0.10
fig_width = max(18, n * width_per_bar)
fig_height = 8

fig, ax = plt.subplots(figsize=(fig_width, fig_height))
ax.bar(range(n), num_images, color="blue", alpha=0.8, width=bar_width)
ax.set_xticks(range(n))
ax.set_xticklabels(identities, rotation=90, fontsize=6)
ax.set_xlabel("Person ID (pid)", fontsize=12)
if num_images:
    y_max = max(num_images)
    ax.set_yticks(np.arange(0, y_max + 5, 5))
ax.set_ylabel("Number of images", fontsize=12)
ax.set_title("Number of images per subject (pid) - ordered by number of images", fontsize=14)
ax.grid(True, alpha=0.1, axis="y")
plt.tight_layout()

output_path = "num_images_all_label_ordered_by_num_images.png"
fig.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close(fig)

display(HTML(
    f"<div style='width:100%; overflow:auto; border:1px solid #ddd; padding:6px'>"
    f"<img src='{output_path}' style='height:auto;'>"
    f"</div>"
))

##### 5.2.4. ID vs Number of imgs - ordered by ID

Exact ID

In [68]:
# Number of images vs ID ordered by ID
id_df_sorted = id_df.sort_values("id")
ids = id_df_sorted["id"].tolist()
num_images = id_df_sorted["num_images"].tolist()
n = len(ids)
bar_width = 0.8
width_per_bar = 0.10
fig_width = max(18, n * width_per_bar)
fig_height = 8

fig, ax = plt.subplots(figsize=(fig_width, fig_height))
ax.bar(range(n), num_images, color="blue", alpha=0.8, width=bar_width)
ax.set_xticks(range(n))
ax.set_xticklabels([str(v) for v in ids], rotation=90, fontsize=6)
ax.set_xlabel("Intrinsic Dimension", fontsize=12)
if num_images:
    y_max = max(num_images)
    ax.set_yticks(np.arange(0, y_max + 5, 5))
ax.set_ylabel("Number of images", fontsize=12)
ax.set_title("Number of images by Intrinsic Dimension - ordered by ID", fontsize=14)
ax.grid(True, alpha=0.1, axis="y")
plt.tight_layout()

output_path = "num_images_by_ids.png"
fig.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close(fig)

display(HTML(
    f"<div style='width:100%; overflow:auto; border:1px solid #ddd; padding:6px'>"
    f"<img src='{output_path}' style='height:auto;'>"
    f"</div>"
))

Rounded to integer - ID

In [67]:
# Number of images vs ID rounded to integer
id_df_sorted = id_df.copy()
id_df_sorted["id_rounded"] = id_df_sorted["id"].round().astype(int)
id_df_sorted = id_df_sorted.sort_values("id_rounded")

ids = id_df_sorted["id_rounded"].tolist()
num_images = id_df_sorted["num_images"].tolist()
n = len(ids)
bar_width = 0.8
width_per_bar = 0.10
fig_width = max(18, n * width_per_bar)
fig_height = 8

fig, ax = plt.subplots(figsize=(fig_width, fig_height))
ax.bar(range(n), num_images, color="blue", alpha=0.8, width=bar_width)
ax.set_xticks(range(n))
ax.set_xticklabels([str(v) for v in ids], rotation=90, fontsize=6)
if num_images:
    y_max = max(num_images)
    ax.set_yticks(np.arange(0, y_max + 5, 5))
ax.set_xlabel("Intrinsic Dimension (rounded to integer)", fontsize=12)
ax.set_ylabel("Number of images", fontsize=12)
ax.set_title("Number of images by Intrinsic Dimension - ordered by ID (rounded to integer)", fontsize=14)
ax.grid(True, alpha=0.1, axis="y")
plt.tight_layout()

output_path = "num_images_by_id_integers.png"
fig.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close(fig)

display(HTML(
    f"<div style='width:100%; overflow:auto; border:1px solid #ddd; padding:6px'>"
    f"<img src='{output_path}' style='height:auto;'>"
    f"</div>"
))

##### 5.2.5. Frequency of ID values

Exact ID

In [66]:
# Frequency of exact ID values
id_counts = id_df["id"].value_counts().sort_index()
ids = id_counts.index.astype(str).tolist()
freqs = id_counts.values.tolist()
n = len(ids)
bar_width = 0.8
width_per_bar = 0.10
fig_width = max(18, n * width_per_bar)
fig_height = 6

fig, ax = plt.subplots(figsize=(fig_width, fig_height))
ax.bar(range(n), freqs, color="blue", alpha=0.8, width=bar_width)
ax.set_xticks(range(n))
ax.set_xticklabels(ids, rotation=90, fontsize=6)
ax.set_xlabel("Intrinsic Dimension", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
ax.set_title("Exact Frequency of Estimated Intrinsic Dimensions", fontsize=14)
ax.grid(True, alpha=0.1, axis="y")
plt.tight_layout()

output_path = "id_frequency_all_label.png"
fig.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close(fig)

display(HTML(
    f"<div style='width:100%; overflow:auto; border:1px solid #ddd; padding:6px'>"
    f"<img src='{output_path}' style='height:auto;'>"
    f"</div>"
))

Rounded to integer - ID

In [60]:
# Frequency of ID values rounded to integer 
id_rounded = id_df["id"].round().astype(int)
id_counts = id_rounded.value_counts().sort_index()
ids = id_counts.index.astype(str).tolist()
freqs = id_counts.values.tolist()
n = len(ids)
bar_width = 0.8
width_per_bar = 0.10
fig_width = max(18, n * width_per_bar)
fig_height = 6

fig, ax = plt.subplots(figsize=(fig_width, fig_height))
ax.bar(range(n), freqs, color="blue", alpha=0.8, width=bar_width)
ax.set_xticks(range(n))
ax.set_xticklabels(ids, rotation=90, fontsize=6)
ax.set_xlabel("Intrinsic Dimension (rounded to integer)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
ax.set_title("Frequency of Estimated Intrinsic Dimensions (rounded to integer)", fontsize=14)
ax.grid(True, alpha=0.1, axis="y")
plt.tight_layout()

output_path = "id_frequency_integers.png"
fig.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close(fig)

display(HTML(
    f"<div style='width:100%; overflow:auto; border:1px solid #ddd; padding:6px'>"
    f"<img src='{output_path}' style='height:auto;'>"
    f"</div>"
))

Rounded to 1 decimal - ID

In [58]:
# Frequency of ID values rounded to 1 decimal
id_rounded = id_df["id"].round(1)
id_counts = id_rounded.value_counts().sort_index()
ids = id_counts.index.astype(str).tolist()
freqs = id_counts.values.tolist()
n = len(ids)
bar_width = 0.8
width_per_bar = 0.10
fig_width = max(18, n * width_per_bar)
fig_height = 6

fig, ax = plt.subplots(figsize=(fig_width, fig_height))
ax.bar(range(n), freqs, color="blue", alpha=0.8, width=bar_width)
ax.set_xticks(range(n))
ax.set_xticklabels(ids, rotation=90, fontsize=6)
ax.set_xlabel("Intrinsic Dimension (rounded to 1 decimal)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
ax.set_title("Frequency of Estimated Intrinsic Dimensions (rounded to 1 decimal)", fontsize=14)
ax.grid(True, alpha=0.1, axis="y")
plt.tight_layout()

output_path = "id_frequency_1_decimal.png"
fig.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close(fig)

display(HTML(
    f"<div style='width:100%; overflow:auto; border:1px solid #ddd; padding:6px'>"
    f"<img src='{output_path}' style='height:auto;'>"
    f"</div>"
))

Rounded to 2 decimals - ID

In [57]:
# Frequency of ID values rounded to 2 decimals 
id_rounded = id_df["id"].round(2)
id_counts = id_rounded.value_counts().sort_index()
ids = id_counts.index.astype(str).tolist()
freqs = id_counts.values.tolist()
n = len(ids)
bar_width = 0.8
width_per_bar = 0.10
fig_width = max(18, n * width_per_bar)
fig_height = 6

fig, ax = plt.subplots(figsize=(fig_width, fig_height))
ax.bar(range(n), freqs, color="blue", alpha=0.8, width=bar_width)
ax.set_xticks(range(n))
ax.set_xticklabels(ids, rotation=90, fontsize=6)
ax.set_xlabel("Intrinsic Dimension (rounded to 2 decimals)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
ax.set_title("Frequency of Estimated Intrinsic Dimensions (rounded to 2 decimals)", fontsize=14)
ax.grid(True, alpha=0.1, axis="y")
plt.tight_layout()

output_path = "id_frequency_2_decimal.png"
fig.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close(fig)

display(HTML(
    f"<div style='width:100%; overflow:auto; border:1px solid #ddd; padding:6px'>"
    f"<img src='{output_path}' style='height:auto;'>"
    f"</div>"
))

#### 5.2.6. ID vs Smooth Density

it show a Kernel Density Estimate smooth curve in red and stats

to see how concentrate is the data en different values, for the results we can see:
- is unimodal = 1peak (in ID 4.5 approx)
- the paper say that if the curve is smooth, the estimation method is satble

In [77]:
fig, ax = plt.subplots(figsize=(12, 6))

#ID values
ids_array = id_df['id'].values
ax.hist(ids_array, bins=30, density=True, alpha=0.7, color='blue', edgecolor='black', label='Histogram')

#smoothed density curve
kde = gaussian_kde(ids_array)
x_range = np.linspace(ids_array.min(), ids_array.max(), 200)
density = kde(x_range)
ax.plot(x_range, density, 'r-', linewidth=2.5, label='Kernel Density Estimate')

ax.set_xlabel('Intrinsic Dimension (ID)', fontsize=12)
ax.set_ylabel('Density', fontsize=12)
ax.set_title('Distribution of Intrinsic Dimensions (ID) - Histogram with Smoothed Density', fontsize=14)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()

output_path = 'id_distribution_density.png'
fig.savefig(output_path, dpi=150, bbox_inches='tight')
plt.close(fig)

display(HTML(
    f"<div style='width:100%; overflow:auto; border:1px solid #ddd; padding:6px'>"
    f"<img src='{output_path}' style='height:auto;'>"
    f"</div>"
))

print(f"ID Statistics:")
print(f"  Mean: {np.mean(ids_array):.3f}")
print(f"  Median: {np.median(ids_array):.3f}")
print(f"  Std Dev: {np.std(ids_array):.3f}")
print(f"  Min: {np.min(ids_array):.3f}")
print(f"  Max: {np.max(ids_array):.3f}")

ID Statistics:
  Mean: 8.902
  Median: 5.260
  Std Dev: 11.542
  Min: 1.000
  Max: 99.858


- THe average, the face embeddings need around 8.9 dimensions to capture all the information
- Std Dev high, a lot of variation between identities

- Mean > Median = 8.9 > 5.3, it means there are big outliers so the media is bigger (as the outlier 99.9).
- The Std Dev is high probably by:
    - estimation method inestable in some cases
    - some clusters have few points (less than k=10 neighbors)
    - posibles outliers
- Dataset FRGC have a lot of variation to have 1-100D, maybe they are
    - very heterogeneuos
    - the estimation method is sensible to the quantity of imgs per identity

#### 5.2.7. Number of neighbors vs ID (local density)

- local neigborhood density for each identity (k=10), if it is high is more close, if it lows more disperse data per idenity
- bubble size is the number imgs per identity
- axis X: ID // axis Y: local neighborhood density (inverse of average distance)

In [None]:
#count neighbors within a distance threshold for each embedding
k_neighbors = 10 
nbrs = NearestNeighbors(n_neighbors=k_neighbors + 1).fit(embeddings)  # +1 to include self
distances, indices = nbrs.kneighbors(embeddings)

distances = distances[:, 1:]  #exclude distance to self (0)

#for each identity, average neighborhood density (inverse of average distance)
neighbor_analysis = {}
for pid in id_results.keys():
    cluster_indices = identity_indices[pid]
    cluster_distances = distances[cluster_indices]  # distances from each point in cluster for the neighbors 
    avg_neighbor_distance = np.mean(cluster_distances)  # average distance to neighbors for this identity
    neighborhood_density = 1.0 / (avg_neighbor_distance + 1e-6)  # avoid division by zero, transfom to density
    
    neighbor_analysis[pid] = {
        'avg_neighbor_distance': avg_neighbor_distance,
        'neighborhood_density': neighborhood_density,
        'id': id_results[pid]['id'],
        'num_images': id_results[pid]['num_images']
    }

neighbor_df = pd.DataFrame(neighbor_analysis).T
neighbor_df = neighbor_df.dropna()

#plot
fig, ax = plt.subplots(figsize=(12, 7))
scatter = ax.scatter(neighbor_df['id'], neighbor_df['neighborhood_density'], 
                    s=neighbor_df['num_images']*5,  # Size by number of images
                    alpha=0.6, c=neighbor_df['id'], cmap='viridis', edgecolors='black', linewidth=0.5)

ax.set_xlabel('Intrinsic Dimension (ID)', fontsize=12)
ax.set_ylabel('Local Neighborhood Density (1/avg_distance)', fontsize=12)
ax.set_title(f'Neighborhood Density vs ID (k={k_neighbors} neighbors)\nBubble size = num images per identity', fontsize=14)
ax.grid(True, alpha=0.3)
cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label('ID Value', fontsize=11)

#trend line
z = np.polyfit(neighbor_df['id'].dropna(), neighbor_df['neighborhood_density'].dropna(), 1)
p = np.poly1d(z)
x_trend = np.linspace(neighbor_df['id'].min(), neighbor_df['id'].max(), 100)
ax.plot(x_trend, p(x_trend), 'r--', linewidth=2, alpha=0.8, label=f'Trend line')
ax.legend(fontsize=10)
plt.tight_layout()

output_path = 'neighbors_vs_id.png'
fig.savefig(output_path, dpi=150, bbox_inches='tight')
plt.close(fig)
display(HTML(
    f"<div style='width:100%; overflow:auto; border:1px solid #ddd; padding:6px'>"
    f"<img src='{output_path}' style='height:auto;'>"
    f"</div>"
))

#correlation
corr_pearson, p_pearson = pearsonr(neighbor_df['id'], neighbor_df['neighborhood_density'])
corr_spearman, p_spearman = spearmanr(neighbor_df['id'], neighbor_df['neighborhood_density'])
print(f"\nNeighborhood Density Analysis:")
print(f"  Pearson correlation (ID vs density): {corr_pearson:.4f} (p={p_pearson:.4e})")
print(f"  Spearman correlation (ID vs density): {corr_spearman:.4f} (p={p_spearman:.4e})")
print(f"  Mean neighborhood density: {neighbor_df['neighborhood_density'].mean():.4f}")
print(f"  Std neighborhood density: {neighbor_df['neighborhood_density'].std():.4f}")


Neighborhood Density Analysis:
  Pearson correlation (ID vs density): 0.0383 (p=3.6372e-01)
  Spearman correlation (ID vs density): 0.0769 (p=6.7892e-02)
  Mean neighborhood density: 0.0882
  Std neighborhood density: 0.0160


Results:
- ID increases -> Density increases a little bit (complex identities = tight)
- pearson correlation (sensitive with outliers, +1 perfect positive corr - 0 no linear relation - -1 perfect negative corr)
    - here is closer to 0, weak relationship
    - p value= 0.364 >0.05, not statiscally  signifcant = could be random,not reliable this corr
- spearman corr (robuster to outliers than pearson)
    - still close to 0, weak relationship
    - p value= 0.364 >0.05, not statiscally  signifcant but is marginally signifcnat near the 0.05
    - ID and local density are almost independent
    - Higher ID → slightly higher density
- p value (< 0.05 corr significant, >0.5 random, not reliable)
- low variation in std = most identities have similar local density

#### 5.2.8. Outlier Analysis - Extreme ID Values

In [None]:
print("="*70)
print("OUTLIER ANALYSIS: Extreme ID Values")
print("="*70)

# quartiles for outlier detection
Q1 = id_df['id'].quantile(0.25)
Q3 = id_df['id'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(f"\nQuartile Analysis:")
print(f"  Q1 (25%): {Q1:.3f}")
print(f"  Q3 (75%): {Q3:.3f}")
print(f"  IQR: {IQR:.3f}")
print(f"  Outlier bounds: [{lower_bound:.3f}, {upper_bound:.3f}]")

#finding outliers
low_outliers = id_df[id_df['id'] < lower_bound].sort_values('id')
high_outliers = id_df[id_df['id'] > upper_bound].sort_values('id', ascending=False)
print(f"\n--- LOW OUTLIERS (ID < {lower_bound:.3f}) ---")
print(f"Count: {len(low_outliers)}")
if len(low_outliers) > 0:
    for idx, row in low_outliers.iterrows():
        print(f"  PID: {row['pid']}, ID: {row['id']:.3f}, Num Images: {row['num_images']}")
print(f"\n--- HIGH OUTLIERS (ID > {upper_bound:.3f}) ---")
print(f"Count: {len(high_outliers)}")
if len(high_outliers) > 0:
    for idx, row in high_outliers.iterrows():
        print(f"  PID: {row['pid']}, ID: {row['id']:.3f}, Num Images: {row['num_images']}")

# Top 10
print(f"\n--- TOP 10 HIGHEST ID VALUES ---")
top_10_high = id_df.nlargest(10, 'id')
for idx, row in top_10_high.iterrows():
    print(f"  PID: {row['pid']}, ID: {row['id']:.3f}, Num Images: {row['num_images']}")
print(f"\n--- TOP 10 LOWEST ID VALUES ---")
top_10_low = id_df.nsmallest(10, 'id')
for idx, row in top_10_low.iterrows():
    print(f"  PID: {row['pid']}, ID: {row['id']:.3f}, Num Images: {row['num_images']}")

#correlation between ID and number of images
corr_id_imgs = id_df['id'].corr(id_df['num_images'])
print(f"\nCorrelation between ID and Number of Images: {corr_id_imgs:.4f}")
if abs(corr_id_imgs) > 0.3:
    print(f"  → Moderate/Strong correlation detected")
else:
    print(f"  → Weak/No correlation")


OUTLIER ANALYSIS: Extreme ID Values

Quartile Analysis:
  Q1 (25%): 3.751
  Q3 (75%): 8.513
  IQR: 4.762
  Outlier bounds: [-3.392, 15.657]

--- LOW OUTLIERS (ID < -3.392) ---
Count: 0

--- HIGH OUTLIERS (ID > 15.657) ---
Count: 74
  PID: 4546.0, ID: 99.858, Num Images: 18.0
  PID: 4604.0, ID: 87.591, Num Images: 18.0
  PID: 4654.0, ID: 79.999, Num Images: 18.0
  PID: 4397.0, ID: 78.934, Num Images: 194.0
  PID: 4580.0, ID: 76.937, Num Images: 200.0
  PID: 4889.0, ID: 68.536, Num Images: 18.0
  PID: 4652.0, ID: 63.141, Num Images: 42.0
  PID: 4789.0, ID: 61.176, Num Images: 12.0
  PID: 4229.0, ID: 57.298, Num Images: 84.0
  PID: 4873.0, ID: 57.040, Num Images: 18.0
  PID: 4440.0, ID: 57.005, Num Images: 54.0
  PID: 4669.0, ID: 56.948, Num Images: 12.0
  PID: 4374.0, ID: 53.418, Num Images: 90.0
  PID: 4783.0, ID: 50.821, Num Images: 24.0
  PID: 4802.0, ID: 41.403, Num Images: 18.0
  PID: 4261.0, ID: 37.390, Num Images: 116.0
  PID: 4612.0, ID: 34.694, Num Images: 24.0
  PID: 4482.0, ID