In [2]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from local_utils import init_embeddings
from scipy.spatial.distance import euclidean

# Root directory that holds every embedding artifact used by this notebook.
# Set the DOPPELVER_EMBEDDINGS_ROOT environment variable to run on another
# machine; when it is unset the original location is used unchanged.
EMBEDDINGS_ROOT = os.environ.get(
    "DOPPELVER_EMBEDDINGS_ROOT",
    "/home/nmichelotti/Desktop/Embeddings/embeddings_for_n8",
)

# Constants for file paths (all derived from EMBEDDINGS_ROOT so they cannot drift apart)
EMBEDDINGS_PATH = os.path.join(EMBEDDINGS_ROOT, "model_240000_DoppelVer_All_112x112_outputs.pth")
EMBEDDINGS_IMAGE_PATH = os.path.join(EMBEDDINGS_ROOT, "model_240000_DoppelVer_All_112x112_image_paths.txt")
IMAGE_DIR = os.path.join(EMBEDDINGS_ROOT, "DoppelVer_All_112x112")
BASE_PATH = os.path.join(EMBEDDINGS_ROOT, "results")
CLASS_NUM_NAME_PATH = os.path.join(EMBEDDINGS_ROOT, "class_num_name.csv")

# init_embeddings is a project-local helper. Later cells use the result as a
# pandas DataFrame (``.columns``, column indexing).
# NOTE(review): the functions below assume its last three columns are
# 'class', 'path' and 'class_num' — confirm against local_utils.
embeddings = init_embeddings(EMBEDDINGS_PATH, EMBEDDINGS_IMAGE_PATH, IMAGE_DIR)

In [2]:
def perform_clustering_per_person(embeddings, base_path, n_clusters=1):
    """Run KMeans independently on every leading column of ``embeddings``.

    Each column except the last three is reshaped to a single-feature
    dataset, standardized, and clustered on its own. The cluster label of
    every row is collected into one long table that is written to
    ``KMeans_Clustering_Per_Person.csv`` inside ``base_path``.

    NOTE(review): each column is labelled as a "person", but the columns look
    like embedding dimensions (the progress output reports 512 of them) —
    confirm that clustering column-by-column is the intended analysis.

    Args:
        embeddings: DataFrame whose last three columns are skipped (the
            original comment names them 'class', 'path' and 'class_num').
        base_path: Output directory; created if it does not exist.
        n_clusters: Number of KMeans clusters fitted per column.

    Returns:
        DataFrame with one row per (column, row position) pair and the
        columns 'person_index', 'data_point_index' and 'label'.
    """
    os.makedirs(base_path, exist_ok=True)
    csv_path = os.path.join(base_path, "KMeans_Clustering_Per_Person.csv")

    rows = []
    # Positional slice: everything but the trailing three metadata columns.
    for column in tqdm(embeddings.columns[:-3], desc="Clustering Each Person"):
        # KMeans expects a 2-D array, so the column becomes an (n, 1) matrix.
        column_values = embeddings[column].values.reshape(-1, 1)
        standardized = StandardScaler().fit_transform(column_values)

        # Fixed random_state keeps the assignment reproducible between runs.
        model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        assignments = model.fit_predict(standardized)

        rows.extend(
            {
                'person_index': column,
                'data_point_index': position,
                'label': cluster_id
            }
            for position, cluster_id in enumerate(assignments)
        )

    # Build and write the table once rather than appending per column.
    clustering_df = pd.DataFrame(rows)
    clustering_df.to_csv(csv_path, index=False)

    return clustering_df


In [3]:
def find_midpoint_and_furthest_distance_per_person(embeddings, base_path, class_num_name_df):
    """Compute the mean and the largest absolute deviation from it per column.

    Every column of ``embeddings`` except the last three is treated as one
    "person". The column label is used as a ``class_num`` and looked up in
    ``class_num_name_df``; columns without a matching row are skipped. The
    result is written to ``Midpoints_Per_Person_With_Class_Names.csv`` inside
    ``base_path``.

    NOTE(review): the columns look like embedding dimensions rather than
    people (the progress output reports 512 of them) — confirm that
    per-column statistics are what is intended.

    Args:
        embeddings: DataFrame whose last three columns are skipped.
        base_path: Output directory; created if it does not exist.
        class_num_name_df: DataFrame with 'class_num' and 'class' columns.
            When a class_num appears more than once, the first row wins.

    Returns:
        DataFrame with the columns 'person_index', 'class_num', 'class',
        'midpoint' and 'max_distance'. The columns are present even when no
        column matched, so the CSV always has a header and can be re-read.
    """
    os.makedirs(base_path, exist_ok=True)
    filename = os.path.join(base_path, "Midpoints_Per_Person_With_Class_Names.csv")
    result_columns = ['person_index', 'class_num', 'class', 'midpoint', 'max_distance']

    # Build the class_num -> class lookup once instead of scanning the whole
    # frame with a boolean mask on every iteration. keep='first' reproduces
    # the previous "take the first matching row" behavior.
    unique_classes = class_num_name_df.drop_duplicates(subset='class_num', keep='first')
    class_name_by_num = dict(
        zip(unique_classes['class_num'].tolist(), unique_classes['class'].tolist())
    )

    midpoints = []
    for person_index in tqdm(embeddings.columns[:-3], desc="Calculating Midpoints for Each Person"):
        # The column label itself serves as the class number.
        class_num = person_index
        if class_num not in class_name_by_num:
            continue

        person_data = embeddings[person_index].values

        # Midpoint is the mean of the column; the furthest point is the
        # largest absolute deviation from that mean.
        midpoint = np.mean(person_data)
        max_distance = np.max(np.abs(person_data - midpoint))

        midpoints.append({
            'person_index': person_index,
            'class_num': class_num,
            'class': class_name_by_num[class_num],
            'midpoint': midpoint,
            'max_distance': max_distance
        })

    # Explicit columns guarantee a header row even when nothing matched;
    # a header-less file would make a later pd.read_csv fail.
    midpoints_df = pd.DataFrame(midpoints, columns=result_columns)
    midpoints_df.to_csv(filename, index=False)

    return midpoints_df

In [4]:
def calculate_difference(midpoints_df, output_path):
    """Add a 'difference' column and save the frame as a CSV.

    'difference' is ``max_distance - midpoint``, computed row by row. The
    column is added to ``midpoints_df`` in place (callers that inspect the
    frame afterwards will see it) and the whole frame is written to
    ``Midpoints_With_Difference.csv`` inside ``output_path``.

    Args:
        midpoints_df: DataFrame with 'max_distance' and 'midpoint' columns.
        output_path: Output directory; created if it does not exist, matching
            the other writer functions in this notebook.

    Returns:
        The same ``midpoints_df`` object, now carrying the 'difference' column.
    """
    # Without this a missing directory makes to_csv fail.
    os.makedirs(output_path, exist_ok=True)

    # Calculate the difference
    midpoints_df['difference'] = midpoints_df['max_distance'] - midpoints_df['midpoint']

    # Save to CSV
    output_file = os.path.join(output_path, "Midpoints_With_Difference.csv")
    midpoints_df.to_csv(output_file, index=False)

    return midpoints_df

In [5]:
def create_hyperspheres_and_calculate_overlap(embeddings, base_path, class_num_name_df):
    """Describe each column as a (midpoint, radius) interval and score pairwise overlap.

    For every column of ``embeddings`` except the last three, the midpoint is
    the column mean and the radius is the largest absolute deviation from it.
    The column label is used as a ``class_num`` and looked up in
    ``class_num_name_df``; columns without a match are skipped.

    Two entries overlap when the distance between their midpoints is smaller
    than the sum of their radii. The score is
    ``(1 - distance / (radius_1 + radius_2)) * 100``. Only overlapping pairs
    are recorded, each pair once.

    NOTE(review): midpoints are scalars, so these "hyperspheres" are
    one-dimensional intervals computed per column. The columns look like
    embedding dimensions rather than people — confirm this is intended.

    Args:
        embeddings: DataFrame whose last three columns are skipped.
        base_path: Output directory; created if it does not exist.
        class_num_name_df: DataFrame with 'class_num' and 'class' columns.
            When a class_num appears more than once, the first row wins.

    Returns:
        Tuple ``(hyperspheres_df, overlap_df)``. Both are also written to
        ``Hyperspheres_Per_Person.csv`` and ``Hypersphere_Overlap_Per_Person.csv``
        in ``base_path``. Both always carry their column headers, so the CSVs
        can be re-read even when they contain no rows.
    """
    os.makedirs(base_path, exist_ok=True)

    filename = os.path.join(base_path, "Hyperspheres_Per_Person.csv")
    overlap_filename = os.path.join(base_path, "Hypersphere_Overlap_Per_Person.csv")
    hypersphere_columns = ['person_index', 'class_num', 'class', 'midpoint', 'radius']
    overlap_columns = ['person_index_1', 'class_1', 'person_index_2', 'class_2', 'overlap_percentage']

    # Build the class_num -> class lookup once instead of scanning the whole
    # frame on every iteration. keep='first' reproduces the previous
    # "take the first matching row" behavior.
    unique_classes = class_num_name_df.drop_duplicates(subset='class_num', keep='first')
    class_name_by_num = dict(
        zip(unique_classes['class_num'].tolist(), unique_classes['class'].tolist())
    )

    hyperspheres = []
    for person_index in tqdm(embeddings.columns[:-3], desc="Calculating Hyperspheres for Each Person"):
        # The column label itself serves as the class number.
        class_num = person_index
        if class_num not in class_name_by_num:
            continue

        person_data = embeddings[person_index].values

        # Center is the column mean; radius reaches the furthest point.
        midpoint = np.mean(person_data)
        max_distance = np.max(np.abs(person_data - midpoint))

        hyperspheres.append({
            'person_index': person_index,
            'class_num': class_num,
            'class': class_name_by_num[class_num],
            'midpoint': midpoint,
            'radius': max_distance  # Radius is the same as max_distance
        })

    # Compare every unordered pair exactly once.
    overlap_results = []
    for i in range(len(hyperspheres)):
        hypersphere_1 = hyperspheres[i]
        for j in range(i + 1, len(hyperspheres)):
            hypersphere_2 = hyperspheres[j]

            # Midpoints are scalars, so the center distance is an absolute difference.
            distance_between_centers = abs(hypersphere_1['midpoint'] - hypersphere_2['midpoint'])
            radius_sum = hypersphere_1['radius'] + hypersphere_2['radius']

            # distance >= 0, so passing this test implies radius_sum > 0 and
            # the division below is safe.
            if distance_between_centers < radius_sum:
                overlap_percentage = (1 - distance_between_centers / radius_sum) * 100
                overlap_results.append({
                    'person_index_1': hypersphere_1['person_index'],
                    'class_1': hypersphere_1['class'],
                    'person_index_2': hypersphere_2['person_index'],
                    'class_2': hypersphere_2['class'],
                    'overlap_percentage': overlap_percentage
                })

    # Explicit columns guarantee a header row even for empty results; a
    # header-less CSV makes a later pd.read_csv raise EmptyDataError.
    hyperspheres_df = pd.DataFrame(hyperspheres, columns=hypersphere_columns)
    hyperspheres_df.to_csv(filename, index=False)

    overlap_df = pd.DataFrame(overlap_results, columns=overlap_columns)
    overlap_df.to_csv(overlap_filename, index=False)

    return hyperspheres_df, overlap_df


In [9]:
# Step 1: per-column KMeans; the result is written to disk and the returned
# frame is intentionally not kept.
perform_clustering_per_person(embeddings, base_path=BASE_PATH)

# Step 2: per-column midpoint / furthest distance, labelled with class names.
class_num_name_df = pd.read_csv(CLASS_NUM_NAME_PATH)
midpoints = find_midpoint_and_furthest_distance_per_person(
    embeddings=embeddings,
    base_path=BASE_PATH,
    class_num_name_df=class_num_name_df,
)

# Step 3: adds a 'difference' column to `midpoints` in place and writes
# Midpoints_With_Difference.csv.
calculate_difference(midpoints, output_path=BASE_PATH)

# Reload the midpoint table from disk. This is the file written in step 2,
# so it does not contain the 'difference' column added in step 3.
csv_path = os.path.join(BASE_PATH, "Midpoints_Per_Person_With_Class_Names.csv")
midpoints_df = pd.read_csv(csv_path)


Clustering Each Person: 100%|██████████| 512/512 [00:12<00:00, 39.62it/s]
Calculating Midpoints for Each Person: 100%|██████████| 512/512 [00:00<00:00, 1359.02it/s]


In [6]:

# Class-number -> class-name table used to label each hypersphere.
class_num_name_df = pd.read_csv(CLASS_NUM_NAME_PATH)

# Computes the per-column hyperspheres and their pairwise overlap, and writes
# both tables as CSV files under BASE_PATH.
hyperspheres, overlaps = create_hyperspheres_and_calculate_overlap(
    embeddings=embeddings,
    base_path=BASE_PATH,
    class_num_name_df=class_num_name_df,
)

# Locations of the two CSV files the call above just wrote.
hypersphere_file, overlap_file = (
    os.path.join(BASE_PATH, csv_name)
    for csv_name in ("Hyperspheres_Per_Person.csv", "Hypersphere_Overlap_Per_Person.csv")
)

# Reload both tables from disk, as later cells work from the saved files.
hyperspheres_df = pd.read_csv(hypersphere_file)
overlaps_df = pd.read_csv(overlap_file)


Calculating Hyperspheres for Each Person: 100%|██████████| 512/512 [00:00<00:00, 1366.01it/s]


In [7]:
# Define the file paths. They are derived from BASE_PATH (config cell) instead
# of repeating the absolute directory, so they cannot drift from the location
# the overlap CSV was written to. pandas is already imported in the first cell.
input_csv_path = os.path.join(BASE_PATH, 'Hypersphere_Overlap_Per_Person.csv')
output_csv_path = os.path.join(BASE_PATH, 'Hypersphere_Overlap_ordered.csv')

# Read the pairwise-overlap table
df = pd.read_csv(input_csv_path)

# Sort the DataFrame by the overlap_percentage column in ascending order
sorted_df = df.sort_values(by='overlap_percentage')

# Save the sorted DataFrame to a new CSV file
sorted_df.to_csv(output_csv_path, index=False)

print(f"Sorted CSV saved to {output_csv_path}")


Sorted CSV saved to /home/nmichelotti/Desktop/Embeddings/embeddings_for_n8/results/Hypersphere_Overlap_ordered.csv


In [2]:
# Define the file paths, derived from BASE_PATH (config cell) so they follow
# the directory the overlap CSV was written to.
# Note: this reads the unsorted per-person overlap file, not the "ordered" one.
input_csv_path = os.path.join(BASE_PATH, 'Hypersphere_Overlap_Per_Person.csv')
output_graphs_path = os.path.join(BASE_PATH, 'Graphs')

os.makedirs(output_graphs_path, exist_ok=True)

# Read the pairwise-overlap table
df = pd.read_csv(input_csv_path)

# Get unique classes (order of first appearance in the file)
unique_classes = df['class_1'].unique()

# Comparisons below this overlap percentage are highlighted in red.
low_overlap_threshold = 80

# Plot a graph for each class
for class_name in unique_classes:
    # Filter data for the current class. Resetting the index makes the row
    # labels equal to the 0..n-1 comparison positions used on the x axis.
    # Previously the highlights were placed at the row labels of the full
    # table, so they did not line up with the plotted line.
    class_df = df[df['class_1'] == class_name].reset_index(drop=True)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(class_df.index, class_df['overlap_percentage'],
            marker='o', linestyle='-', color='b', markersize=5)

    # Highlight points with significantly lower overlap; zorder keeps the red
    # markers on top of the line's own markers.
    low_overlap_points = class_df[class_df['overlap_percentage'] < low_overlap_threshold]
    ax.scatter(low_overlap_points.index, low_overlap_points['overlap_percentage'],
               color='red', zorder=3, label=f'Low Overlap (< {low_overlap_threshold}%)')

    ax.set_title(f'Overlap Percentage Changes for {class_name}', fontsize=16)
    ax.set_xlabel('Comparison Index', fontsize=14)
    ax.set_ylabel('Overlap Percentage', fontsize=14)
    ax.set_ylim(0, 100)
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.tick_params(axis='both', labelsize=12)
    ax.legend()

    # Save the plot, then close the figure so memory does not grow per class.
    graph_path = os.path.join(output_graphs_path, f'{class_name}_overlap_changes.png')
    fig.savefig(graph_path, dpi=300)
    plt.close(fig)

In [2]:
# Define the file paths, derived from BASE_PATH (config cell) so they follow
# the directory the overlap CSV was written to.
# Note: this reads the unsorted per-person overlap file, not the "ordered" one.
input_csv_path = os.path.join(BASE_PATH, 'Hypersphere_Overlap_Per_Person.csv')
output_inspection_path = os.path.join(BASE_PATH, 'Inspection')

os.makedirs(output_inspection_path, exist_ok=True)

# Read the pairwise-overlap table
df = pd.read_csv(input_csv_path)

# Identify comparisons with high and low overlap percentages.
# Rows between the two thresholds are written to neither file.
high_overlap_threshold = 99
low_overlap_threshold = 80

high_overlap_df = df[df['overlap_percentage'] >= high_overlap_threshold]
low_overlap_df = df[df['overlap_percentage'] < low_overlap_threshold]

# Save high and low overlap comparisons for further inspection
high_overlap_file = os.path.join(output_inspection_path, 'high_overlap_comparisons.csv')
low_overlap_file = os.path.join(output_inspection_path, 'low_overlap_comparisons.csv')

high_overlap_df.to_csv(high_overlap_file, index=False)
low_overlap_df.to_csv(low_overlap_file, index=False)



High overlap comparisons (>= 99%):
       person_index_1          class_1  person_index_2           class_2  \
2                   0  Abigail_Spencer               3       Alex_Newell   
9                   0  Abigail_Spencer              10       Allen_Leech   
21                  0  Abigail_Spencer              22  Annabelle_Wallis   
47                  0  Abigail_Spencer              48     Bridget_Regan   
57                  0  Abigail_Spencer              58    Cara_Delevigne   
...               ...              ...             ...               ...   
75829             382    William_Dafoe             385    Zachary_Quinto   
75832             382    William_Dafoe             388   Zooey_Deschanel   
75842             384       Zach_Braff             387       Zoey_Deutch   
75845             385   Zachary_Quinto             386       Zoe_Saldana   
75847             385   Zachary_Quinto             388   Zooey_Deschanel   

       overlap_percentage  
2               99.49311