In [1]:
import os
import pandas as pd
import numpy as np
import torch
import ast
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.spatial.distance import cityblock, chebyshev, minkowski, cosine, mahalanobis, euclidean
from tqdm import tqdm
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from local_utils import init_embeddings
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import directed_hausdorff
from scipy.special import betainc
from scipy.integrate import quad
from scipy.optimize import minimize_scalar

# Constants for file paths
EMBEDDINGS_PATH = "/home/nmichelotti/Desktop/Embeddings/embeddings_for_n8/model_240000_DoppelVer_All_112x112_outputs.pth"
EMBEDDINGS_IMAGE_PATH = "/home/nmichelotti/Desktop/Embeddings/embeddings_for_n8/model_240000_DoppelVer_All_112x112_image_paths.txt"
IMAGE_DIR = "/home/nmichelotti/Desktop/Embeddings/embeddings_for_n8/DoppelVer_All_112x112"
BASE_PATH = "/home/nmichelotti/Desktop/Embeddings/embeddings_for_n8/00hypersphere_Comparison/results"
CLASS_NUM_NAME_PATH = "/home/nmichelotti/Desktop/Embeddings/embeddings_for_n8/class_num_name.csv"

embeddings = init_embeddings(EMBEDDINGS_PATH, EMBEDDINGS_IMAGE_PATH, IMAGE_DIR)

In [2]:
def perform_clustering_per_person(embeddings, base_path, n_clusters=1):
    # Ensure the base directory exists
    os.makedirs(base_path, exist_ok=True)
    
    results = []
    filename = os.path.join(base_path, "KMeans_Clustering_Per_Person.csv")

    # Exclude the last 3 columns: 'class', 'path', and 'class_num'
    embedding_columns = embeddings.columns[:-3]

    for person_index in tqdm(embedding_columns, desc="Clustering Each Person"):
        # Extract and scale data for the current person (column)
        person_data = embeddings[person_index].values.reshape(-1, 1)
        person_data_scaled = StandardScaler().fit_transform(person_data)
        
        # Apply KMeans clustering
        clusterer = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels = clusterer.fit_predict(person_data_scaled)
        
        # Collect results
        for idx, label in enumerate(labels):
            results.append({
                'person_index': person_index,
                'data_point_index': idx,
                'label': label
            })

    # Save all results at once to minimize file I/O operations
    results_df = pd.DataFrame(results)
    results_df.to_csv(filename, index=False)
    
    return results_df


In [3]:
def chebyshev_find_midpoint_and_furthest_distance_per_person(embeddings, base_path, class_num_name_df):
    base_path = os.path.join(base_path, "Chebyshev")
    os.makedirs(base_path, exist_ok=True)
    
    midpoints = []
    filename = os.path.join(base_path, "Chebyshev_Midpoints_Per_Person_With_Class_Names.csv")

    for person_index in tqdm(embeddings['class_num'].unique(), desc="Calculating Midpoints for Each Person"):
        # Extract data for the current person (class_num)
        person_data = embeddings[embeddings['class_num'] == person_index].iloc[:, :-3].values

        # Calculate the midpoint (mean) of the cluster
        midpoint = np.mean(person_data, axis=0)
        
        # Calculate the Chebyshev distance from the midpoint to each point in the cluster
        distances = np.array([chebyshev(midpoint, point) for point in person_data])
        
        # Find the maximum distance (furthest point)
        max_distance = np.max(distances)
        
        # Get the corresponding class_num and class_name
        class_name_row = class_num_name_df[class_num_name_df['class_num'] == person_index]

        if class_name_row.empty:
            continue

        class_name = class_name_row['class'].values[0]

        # Format the midpoint array as a string with commas
        midpoint_str = ", ".join(map(str, midpoint))
        
        # Collect results
        midpoints.append({
            'person_index': person_index,
            'class_num': person_index,
            'class': class_name,
            'midpoint': midpoint_str,
            'max_distance': max_distance
        })

    # Convert midpoints to DataFrame
    midpoints_df = pd.DataFrame(midpoints)
    
    # Save all results at once to minimize file I/O operations
    midpoints_df.to_csv(filename, index=False)
    
    return midpoints_df


In [4]:
def minkowski_find_midpoint_and_furthest_distance_per_person(embeddings, base_path, class_num_name_df, p=2):
    base_path = os.path.join(base_path, "Minkowski")
    os.makedirs(base_path, exist_ok=True)
    
    midpoints = []
    filename = os.path.join(base_path, "Minkowski_Midpoints_Per_Person_With_Class_Names.csv")

    for person_index in tqdm(embeddings['class_num'].unique(), desc="Calculating Midpoints for Each Person"):
        # Extract data for the current person (class_num)
        person_data = embeddings[embeddings['class_num'] == person_index].iloc[:, :-3].values
        
        # Calculate the midpoint (mean) of the cluster
        midpoint = np.mean(person_data, axis=0)
        
        # Calculate the Minkowski distance from the midpoint to each point in the cluster
        distances = np.array([minkowski(midpoint, point, p) for point in person_data])
        
        # Find the maximum distance (furthest point)
        max_distance = np.max(distances)
        
        # Get the corresponding class_num and class_name
        class_name_row = class_num_name_df[class_num_name_df['class_num'] == person_index]

        if class_name_row.empty:
            continue

        class_name = class_name_row['class'].values[0]

        # Format the midpoint array as a string with commas
        midpoint_str = ", ".join(map(str, midpoint))
        
        # Collect results
        midpoints.append({
            'person_index': person_index,
            'class_num': person_index,
            'class': class_name,
            'midpoint': midpoint_str,
            'max_distance': max_distance
        })

    # Convert midpoints to DataFrame
    midpoints_df = pd.DataFrame(midpoints)
    
    # Save all results at once to minimize file I/O operations
    midpoints_df.to_csv(filename, index=False)
    
    return midpoints_df


In [5]:
def cosine_find_midpoint_and_furthest_distance_per_person(embeddings, base_path, class_num_name_df):
    base_path = os.path.join(base_path, "Cosine")
    os.makedirs(base_path, exist_ok=True)
    
    midpoints = []
    filename = os.path.join(base_path, "Cosine_Midpoints_Per_Person_With_Class_Names.csv")

    for person_index in tqdm(embeddings['class_num'].unique(), desc="Calculating Midpoints for Each Person"):
        # Extract data for the current person (class_num)
        person_data = embeddings[embeddings['class_num'] == person_index].iloc[:, :-3].values
        
        # Calculate the midpoint (mean) of the cluster
        midpoint = np.mean(person_data, axis=0)
        
        # Calculate the Cosine distance from the midpoint to each point in the cluster
        distances = np.array([cosine(midpoint, point) for point in person_data])
        
        # Find the maximum distance (furthest point)
        max_distance = np.max(distances)
        
        # Get the corresponding class_num and class_name
        class_name_row = class_num_name_df[class_num_name_df['class_num'] == person_index]

        if class_name_row.empty:
            continue

        class_name = class_name_row['class'].values[0]

        # Format the midpoint array as a string with commas
        midpoint_str = ", ".join(map(str, midpoint))
        
        # Collect results
        midpoints.append({
            'person_index': person_index,
            'class_num': person_index,
            'class': class_name,
            'midpoint': midpoint_str,
            'max_distance': max_distance
        })

    # Convert midpoints to DataFrame
    midpoints_df = pd.DataFrame(midpoints)
    
    # Save all results at once to minimize file I/O operations
    midpoints_df.to_csv(filename, index=False)
    
    return midpoints_df

In [14]:
def create_directory(base_path):
    os.makedirs(base_path, exist_ok=True)

# Read midpoints and max distances from CSV
def read_csv(midpoints_csv_path):
    return pd.read_csv(midpoints_csv_path)

# Function to create hyperspheres and calculate overlap
def create_hyperspheres_and_calculate_overlap_from_csv(midpoints_csv_path, overlap_output_base_path):
    create_directory(overlap_output_base_path)
    
    # Read midpoints and max distances from CSV
    midpoints_df = read_csv(midpoints_csv_path)
    
    hyperspheres = []
    overlap_results = []
    overlap_filename = os.path.join(overlap_output_base_path, "Hypersphere_Overlap_Per_Person.csv")

    # Loop over each row in the midpoints DataFrame
    for index, row in tqdm(midpoints_df.iterrows(), total=midpoints_df.shape[0], desc="Calculating Hyperspheres for Each Person"):
        # Extract data from the row
        person_index = row['person_index']
        class_num = row['class_num']
        class_name = row['class']
        midpoint_str = row['midpoint']
        midpoint = np.array([float(x.strip()) for x in midpoint_str.split(',')])
        max_distance = row['max_distance']
        
        # Collect results
        hyperspheres.append({
            'person_index': person_index,
            'class_num': class_num,
            'class': class_name,
            'midpoint': midpoint,
            'radius': max_distance  # Radius is the same as max_distance
        })

    # Calculate overlap with other hyperspheres
    for i in range(len(hyperspheres)):
        for j in range(i + 1, len(hyperspheres)):
            hypersphere_1 = hyperspheres[i]
            hypersphere_2 = hyperspheres[j]
            
            # Calculate distance between midpoints using Euclidean distance
            distance_between_centers = euclidean(hypersphere_1['midpoint'], hypersphere_2['midpoint'])
            
            # Check for overlap
            if distance_between_centers < (hypersphere_1['radius'] + hypersphere_2['radius']):
                overlap_percentage = (1 - distance_between_centers / (hypersphere_1['radius'] + hypersphere_2['radius'])) * 100
                overlap_results.append({
                    'person_index_1': hypersphere_1['person_index'],
                    'class_1': hypersphere_1['class'],
                    'person_index_2': hypersphere_2['person_index'],
                    'class_2': hypersphere_2['class'],
                    'overlap_percentage': overlap_percentage
                })

    # Convert overlap results to DataFrame
    overlap_df = pd.DataFrame(overlap_results)
    
    # Save overlap results to CSV
    overlap_df.to_csv(overlap_filename, index=False)
    
    return overlap_df


In [1]:
def sort_overlap_csv(input_csv_path, output_csv_path):
    # Read the input CSV file
    df = pd.read_csv(input_csv_path)
    
    # Sort the DataFrame by 'overlap_percentage' column
    sorted_df = df.sort_values(by='overlap_percentage', ascending=True)
    
    # Save the sorted DataFrame to the output CSV file
    sorted_df.to_csv(output_csv_path, index=False)
    
    print(f"Sorted data saved to {output_csv_path}")


In [8]:
# Load class_num_name_df
class_num_name_df = pd.read_csv(CLASS_NUM_NAME_PATH)

# Run the Cosine function to find midpoints and furthest distances
midpoints_df = cosine_find_midpoint_and_furthest_distance_per_person(embeddings, BASE_PATH, class_num_name_df)

# Define paths for midpoints CSV and overlap output directory
midpoints_csv_path = os.path.join(BASE_PATH, "Cosine/Cosine_Midpoints_Per_Person_With_Class_Names.csv")
overlap_output_base_path = os.path.join(BASE_PATH, "Cosine")

Calculating Midpoints for Each Person: 100%|██████████| 390/390 [00:00<00:00, 663.72it/s]


In [3]:
# Paths for the input CSV and output CSV
input_csv_path = "/home/nmichelotti/Desktop/Embeddings/embeddings_for_n8/00hypersphere_Comparison/results/Cosine/Hypersphere_Overlap_Per_Person.csv"
output_csv_path = "/home/nmichelotti/Desktop/Embeddings/embeddings_for_n8/00hypersphere_Comparison/results/Cosine/Sorted_Hypersphere_Overlap_Per_Person.csv"

# Sort the CSV by overlap percentage and save to a new file
sort_overlap_csv(input_csv_path, output_csv_path)

# Print the first few rows of the sorted result to verify
sorted_df = pd.read_csv(output_csv_path)

NameError: name 'create_hyperspheres_and_calculate_overlap_from_csv' is not defined