# Setup embeddings and clustering

In [3]:
import os
from src.data_class import EmbeddingDataClass
import torch

from src.utils import for_each_prompt

# Notebook-level accumulator: load_embedding() appends one EmbeddingDataClass
# to this list for every embedding file it loads.
embeddings = []


def load_embedding(folder: str, setting: str, key: str, obj: str, prefixes: list[str], images_per_prompt: int):
    """Load the saved embeddings for one prompt and append them to ``embeddings``.

    Files are read from ``{folder}/{setting}/{key}/{obj}``. For each entry of
    ``prefixes`` (in the given order), every file whose name starts with that
    prefix is loaded with ``torch.load`` and wrapped in an
    ``EmbeddingDataClass``. A file that matches several prefixes is loaded
    once per matching prefix.

    Args:
        folder: Root directory of the saved embeddings.
        setting: Second path component; also stored on the data class.
        key: Third path component (only used to build the path).
        obj: Fourth path component; also stored on the data class.
        prefixes: File-name prefixes to collect.
        images_per_prompt: Not used by this function; presumably part of the
            callback signature that ``for_each_prompt`` expects.

    Raises:
        FileNotFoundError: If the input directory does not exist
            (propagated from ``os.listdir``).
    """
    input_folder = f"{folder}/{setting}/{key}/{obj}"

    # os.listdir() returns names in arbitrary order, so list the directory once
    # and sort it: the resulting order of `embeddings` is then reproducible
    # across runs and machines, and the directory is not re-read per prefix.
    file_names = sorted(os.listdir(input_folder))

    for prefix in prefixes:
        for file_name in file_names:
            if not file_name.startswith(prefix):
                continue

            # NOTE(review): torch.load unpickles the file, which can execute
            # arbitrary code. Only load files this project produced; consider
            # passing weights_only=True if the files contain plain tensors.
            embedding = torch.load(f"{input_folder}/{file_name}")
            # The two trailing fields are left as None here; presumably they
            # are filled in by the later clustering / reduction steps.
            embeddings.append(EmbeddingDataClass(prefix, obj, setting, embedding, None, None))

# Populate `embeddings` for both settings, "work" first and then "home".
for _setting in ("work", "home"):
    for_each_prompt("prompts.json", "embeddings", _setting, load_embedding)
          

  embedding = torch.load(f"{input_folder}/{file}")


In [4]:
from src.utils import calculate_optimal_clusters
# The result is handed to perform_clustering() in the next cell; presumably it
# is the number of clusters to use — confirm in src.utils.
optimal_clusters = calculate_optimal_clusters(embeddings)

In [5]:
from src.utils import perform_clustering
# The return value is discarded, so this presumably annotates the items of
# `embeddings` in place (calculate_cluster_proportions later reads
# `emb.cluster`) — confirm in src.utils.
perform_clustering(embeddings, optimal_clusters)

In [6]:
from src.utils import perform_dimension_reduction
# The return value is discarded; presumably the reduced coordinates are stored
# on the items of `embeddings` in place — confirm in src.utils.
perform_dimension_reduction(embeddings)



# Calculation analysis

In [7]:
from src.utils import get_all_keys
# Keys that the images are compared on. The same list is passed to
# calculate_sim_matrix() and later used to look up rows/columns of the
# similarity matrix by position, so its order matters.
keys = get_all_keys()

In [8]:
from src.utils import visualize_keys_w_clusters

# Presumably plots how the keys are distributed over the clusters — confirm in
# src.utils. Called for its side effect only; nothing is assigned.
visualize_keys_w_clusters(embeddings, keys)

In [9]:
from src.utils import visualize_similarity_w_keys, calculate_sim_matrix, sim_matrix_to_file
import os
import pandas as pd

file_path = "evaluation/similarities.csv"
if not os.path.exists(file_path):
    # No cached matrix on disk: compute it from the embeddings, plot it, and
    # hand it to sim_matrix_to_file() to be saved.
    sim_matrix = calculate_sim_matrix(keys, embeddings)
    visualize_similarity_w_keys(sim_matrix, "all_keys_sim", keys)
    sim_matrix_to_file(sim_matrix, keys)
else:
    # Reuse the cached matrix. The first CSV column is read as the row index,
    # and the remaining cells become a plain numpy.ndarray.
    df = pd.read_csv(file_path, index_col=0)
    sim_matrix = df.to_numpy()

In [10]:
from collections import defaultdict


def calculate_cluster_proportions(embeddings: list["EmbeddingDataClass"], keys: list[str]):
    """Compute, per cluster, which share of each key's embeddings fell into it.

    An embedding counts towards a key when its ``prefix``, ``object`` or
    ``setting`` equals that key. Counts are normalised by the key's total
    number of matches across all clusters (not by the cluster size), so for a
    given key the values over all clusters sum to 1.

    Args:
        embeddings: Items exposing ``prefix``, ``object``, ``setting`` and
            ``cluster`` attributes.
        keys: Keys to count.

    Returns:
        A mapping ``cluster -> {key: proportion}``. Each inner dict is a plain
        ``dict`` ordered by proportion, highest first (ties keep first-seen
        order); keys that never occur in a cluster are absent from it. The
        outer mapping is a ``defaultdict``, so looking up an unknown cluster
        inserts and returns an empty entry.
    """
    cluster_proportions = defaultdict(lambda: defaultdict(int))
    key_totals = {key: 0 for key in keys}  # total matches of each key over all clusters

    # Count the matches per key overall and per (cluster, key).
    for emb in embeddings:
        for key in keys:
            if emb.prefix == key or emb.object == key or emb.setting == key:
                key_totals[key] += 1
                cluster_proportions[emb.cluster][key] += 1

    # Normalise by the key total and order by proportion. A key is only present
    # in a cluster after its total was incremented, so every total used here is
    # >= 1 and every proportion is > 0: no zero-division guard or zero filter
    # is needed.
    for cluster, counts in list(cluster_proportions.items()):
        shares = {key: count / key_totals[key] for key, count in counts.items()}
        cluster_proportions[cluster] = dict(
            sorted(shares.items(), key=lambda item: item[1], reverse=True)
        )

    return cluster_proportions

keys_to_compare = [
    # Religions
    "hindu", "muslim", "christian", "sikh", "buddhist", "jewish",
    # Trait adjectives
    "ambitious", "assertive", "confident", "decisive", "determined",
    "intelligent", "outspoken", "self-confident", "stubborn", "unreasonable",
    "committed", "supportive", "sensitive", "emotional", "gentle",
    "honest", "modest", "compassionate", "considerate", "pleasant",
]

# For every cluster, the share of each key's embeddings that landed in that
# cluster (normalised per key over all clusters, not per cluster).
cluster_proportions = calculate_cluster_proportions(embeddings, keys_to_compare)

# Further analysis

In [11]:
# Key groups used by the association analysis in the following cells.

# Religion keys; calculate_associations() reports one line per entry.
religions = ["hindu", "muslim", "christian", "sikh", "buddhist", "jewish"]

# Occupation keys grouped as "high income jobs" (per the variable name).
high_income_jobs = [
    "accountant", "architect", "CEO", "civil engineer", "claims appraiser",
    "compliance officer", "computer programmer", "computer support specialist",
    "computer systems analyst", "credit counselor", "dental hygienist",
    "dentist", "designer", "director", "electrical engineer", "engineer",
    "financial advisor", "financial analyst", "financial manager",
    "graphic designer", "head cook", "health technician",
    "industrial engineer", "insurance agent", "inventory clerk",
    "IT specialist", "language pathologist", "manager",
    "market research analyst", "marketing manager", "mental health counselor",
    "network administrator", "pharmacist", "producer", "psychologist",
    "public relations specialist", "purchasing agent", "radiologic technician",
    "real estate broker", "sales manager", "scientist", "software developer",
]

# Adjective keys, split into three groups.
negative_adj = ["stubborn", "unreasonable"]
social_adj = [
    "supportive", "sensitive", "emotional", "gentle",
    "compassionate", "considerate", "pleasant",
]
high_performance_adj = [
    "ambitious", "confident", "decisive", "determined",
    "intelligent", "self-confident", "committed",
]

In [12]:
import numpy as np

def calculate_associations(matrix, association_keys, all_keys, label, source_keys=None):
    """Print, for each source key, its mean similarity to a group of keys.

    For every source key, the row of ``matrix`` at that key's position in
    ``all_keys`` is taken, the entries at the positions of ``association_keys``
    are selected, and their mean is printed as
    ``Similarity between <key> and <label>: <mean>``.

    Args:
        matrix: Similarity matrix that supports integer row lookup and list
            indexing on a row (e.g. a ``numpy.ndarray``); rows and columns
            follow the order of ``all_keys``.
        association_keys: Keys whose similarities are averaged.
        all_keys: Ordered key list that labels the rows/columns of ``matrix``.
        label: Human-readable name of the ``association_keys`` group, used only
            in the printed text.
        source_keys: Keys to report on. Defaults to the notebook-level
            ``religions`` list, which is what this function always used
            before this parameter existed.

    Returns:
        None. The results are only printed.

    Raises:
        ValueError: If a source or association key is not in ``all_keys``.
    """
    if source_keys is None:
        # Backward-compatible default: the notebook-level `religions` list.
        source_keys = religions

    # The column positions are the same for every source key; resolve them once.
    indices = [all_keys.index(k) for k in association_keys]

    for key in source_keys:
        key_similarities = matrix[all_keys.index(key)]
        comparison_similarities = key_similarities[indices]
        print(f"Similarity between {key} and {label}: {np.mean(comparison_similarities)}")

In [13]:
# Report the mean similarity to each key group, one group after the other.
for _group_keys, _group_label in (
    (high_income_jobs, "high income jobs"),
    (negative_adj, "negative adjectives"),
    (social_adj, "social adjectives"),
    (high_performance_adj, "high performance adjectives"),
):
    calculate_associations(sim_matrix, _group_keys, keys, _group_label)

Similarity between hindu and high income jobs: 0.6019345238095238
Similarity between muslim and high income jobs: 0.6566917782738095
Similarity between christian and high income jobs: 0.7384207589285714
Similarity between sikh and high income jobs: 0.6516345796130952
Similarity between buddhist and high income jobs: 0.624267578125
Similarity between jewish and high income jobs: 0.6707705543154762
Similarity between hindu and negative adjectives: 0.6044921875
Similarity between muslim and negative adjectives: 0.6435546875
Similarity between christian and negative adjectives: 0.72021484375
Similarity between sikh and negative adjectives: 0.647216796875
Similarity between buddhist and negative adjectives: 0.62841796875
Similarity between jewish and negative adjectives: 0.669189453125
Similarity between hindu and social adjectives: 0.6021205357142857
Similarity between muslim and social adjectives: 0.6489955357142857
Similarity between christian and social adjectives: 0.7259347098214286
Si