# Setup embeddings and clustering

In [2]:
import os
from src.data_class import EmbeddingDataClass
import torch

from src.utils import for_each_prompt

# Notebook-level accumulator: load_embedding() appends one EmbeddingDataClass
# record per loaded file to this list, and later cells read it.
embeddings = []


def load_embedding(folder: str, setting: str, key: str, obj: str, prefixes: list[str], images_per_prompt: int):
    """Load the stored embeddings of one prompt into the global ``embeddings`` list.

    Every file in ``{folder}/{setting}/{key}/{obj}`` whose name starts with one
    of ``prefixes`` is read with ``torch.load`` and appended to ``embeddings``
    as ``EmbeddingDataClass(prefix, obj, setting, embedding, None, None)``.
    A file whose name matches several prefixes is loaded once per matching
    prefix.

    Args:
        folder: Root directory of the stored embeddings.
        setting: Sub-directory below ``folder``; also stored on each record.
        key: Sub-directory below ``setting``.
        obj: Sub-directory below ``key``; also stored on each record.
        prefixes: File-name prefixes to select; the matching prefix is stored
            on each record.
        images_per_prompt: Not used by this function. Presumably part of the
            callback signature that ``for_each_prompt`` expects — TODO confirm.

    Raises:
        FileNotFoundError: If the input directory does not exist (raised by
            ``os.listdir``) and ``prefixes`` is non-empty.
    """
    if not prefixes:
        # Nothing to select; like the nested-loop form, do not touch the disk.
        return

    input_folder = f"{folder}/{setting}/{key}/{obj}"
    # List the directory once instead of once per prefix, and sort the names:
    # os.listdir returns entries in arbitrary order, which would make the
    # order of `embeddings` differ between machines and runs.
    files = sorted(os.listdir(input_folder))

    for prefix in prefixes:
        for file in files:
            if not file.startswith(prefix):
                continue

            # NOTE(review): torch.load unpickles the file, so only load files
            # you trust. If the files hold plain tensors, consider passing
            # weights_only=True — confirm what was saved first.
            embedding = torch.load(f"{input_folder}/{file}")
            embeddings.append(EmbeddingDataClass(prefix, obj, setting, embedding, None, None))

# Load the stored embeddings for both settings, in the same order as before
# ("work" first, then "home"); load_embedding is passed as the callback.
for setting_name in ("work", "home"):
    for_each_prompt("prompts.json", "embeddings", setting_name, load_embedding)
          

  embedding = torch.load(f"{input_folder}/{file}")


In [5]:
from src.utils import perform_dimension_reduction
# Run the project's dimension-reduction helper over every loaded embedding.
# The return value is discarded, so this presumably updates the records in
# `embeddings` in place — TODO confirm in src.utils.
perform_dimension_reduction(embeddings)



# Calculation analysis

In [6]:
from src.utils import get_all_keys
# Keys that the images are compared on. Later cells pass `keys` to
# calculate_associations(), which uses keys.index(...) to locate rows and
# columns of the similarity matrix, so `keys` must support .index() and is
# presumably in the same order as the matrix rows — verify against
# calculate_sim_matrix.
keys = get_all_keys()

In [8]:
from src.utils import visualize_similarity_w_keys, calculate_sim_matrix, sim_matrix_to_file
import os
import pandas as pd

# Location of the cached similarity matrix.
file_path = "evaluation/similarities.csv"
if not os.path.exists(file_path):
    # No cached file yet: compute the matrix from the embeddings, plot it and
    # hand it to sim_matrix_to_file (presumably writing the CSV above — confirm).
    sim_matrix = calculate_sim_matrix(keys, embeddings)
    visualize_similarity_w_keys(sim_matrix, "all_keys_sim", keys)
    sim_matrix_to_file(sim_matrix, keys)
else:
    # Cached file found: read it back, using the first CSV column as the index.
    df = pd.read_csv(file_path, index_col=0)

    # Keep only the values, as a numpy.ndarray.
    sim_matrix = df.to_numpy()

# Further analysis

In [13]:
# Religion keys: the subjects whose similarity-matrix rows
# calculate_associations() evaluates. Each entry must be present in the key
# list passed as `all_keys`, because rows are located via all_keys.index().
religions = [
    "hindu",
    "muslim",
    "christian",
    "sikh",
    "buddhist",
    "jewish",
]
# Occupation keys used as the "high income jobs" association set.
# Some entries also appear in other sets (e.g. "civil engineer" and "engineer"
# are in mechanical_jobs, "designer" and "graphic designer" in creative_jobs).
high_income_jobs = [
    "accountant",
    "architect",
    "CEO",
    "civil engineer",
    "claims appraiser",
    "compliance officer",
    "computer programmer",
    "computer support specialist",
    "computer systems analyst",
    "credit counselor",
    "dental hygienist",
    "dentist",
    "designer",
    "director",
    "electrical engineer",
    "engineer",
    "financial advisor",
    "financial analyst",
    "financial manager",
    "graphic designer",
    "head cook",
    "health technician",
    "industrial engineer",
    "insurance agent",
    "inventory clerk",
    "IT specialist",
    "language pathologist",
    "manager",
    "market research analyst",
    "marketing manager",
    "mental health counselor",
    "network administrator",
    "pharmacist",
    "producer",
    "psychologist",
    "public relations specialist",
    "purchasing agent",
    "radiologic technician",
    "real estate broker",
    "sales manager",
    "scientist",
    "software developer",
]
# Occupation keys used as the "mechanical jobs" association set.
# Overlaps with other sets: the engineer entries also appear in
# high_income_jobs, and "groundskeeper" also appears in low_income_jobs.
mechanical_jobs = [
    "aerospace engineer",
    "air conditioning installer",
    "career counselor",
    "carpet installer",
    "civil engineer",
    "drywall installer",
    "electrical engineer",
    "engineer",
    "facilities manager",
    "farmer",
    "groundskeeper",
    "industrial engineer",
    "jailer",
    "machinery mechanic",
    "machinist",
    "mechanic",
    "mechanical engineer",
    "metal worker",
    "plane mechanic",
    "plumber",
    "printing press operator",
    "repair worker",
    "roofer",
    "sheet metal worker",
    "tractor operator",
    "welder",
]
# Occupation keys used as the "low income jobs" association set.
# "groundskeeper" also appears in mechanical_jobs.
low_income_jobs = [
    "aide",
    "baker",
    "bartender",
    "cashier",
    "childcare worker",
    "cleaner",
    "cook",
    "courier",
    "fast food worker",
    "groundskeeper",
    "hairdresser",
    "janitor",
    "manicurist",
    "mover",
    "postal worker",
    "security guard",
    "social worker",
    "taxi driver",
]
# Occupation keys used as the "creative jobs" association set.
# "designer" and "graphic designer" also appear in high_income_jobs.
creative_jobs = [
    "author",
    "community manager",
    "designer",
    "event planner",
    "graphic designer",
    "interior designer",
    "interviewer",
    "painter",
    "underwriter",
    "writer"
]
# Adjective keys used as the "negative adjectives" association set.
# Only two entries, so each one contributes half of the per-religion mean.
negative_adj = [
    "stubborn",
    "unreasonable",
]
# Adjective keys used as the "social adjectives" association set.
social_adj = [
    "supportive",
    "sensitive",
    "emotional",
    "gentle",
    "compassionate",
    "considerate",
    "pleasant"
]
# Adjective keys used as the "high performance adjectives" association set.
high_performance_adj = [
    "ambitious",
    "confident",
    "decisive",
    "determined",
    "intelligent",
    "self-confident",
    "committed",
]

In [23]:
import numpy as np

def calculate_associations(matrix, association_keys, all_keys, label, subject_keys=None):
    """Print how strongly each subject key is associated with a set of keys.

    For every subject key, the similarities between that subject and all
    ``association_keys`` are read from the subject's row of ``matrix`` and
    averaged. The subjects are then printed in descending order of that
    average, each as its deviation from the mean over all subjects, followed
    by the overall mean and the standard deviation (``np.std`` default, i.e.
    population standard deviation) of the per-subject averages.

    Args:
        matrix: Similarity matrix. ``matrix[i]`` must yield a row that can be
            indexed with a list of column positions (e.g. a ``numpy.ndarray``).
            Row and column ``i`` are taken to belong to ``all_keys[i]``.
        association_keys: Keys whose columns are averaged for each subject.
        all_keys: Sequence of every key in matrix order; positions are looked
            up with ``all_keys.index``.
        label: Heading printed above the results.
        subject_keys: Keys whose rows are evaluated. Defaults to the
            notebook-level ``religions`` list, which keeps existing calls
            unchanged.

    Raises:
        ValueError: If a subject or association key is not in ``all_keys``.
    """
    if subject_keys is None:
        # Fall back to the notebook-level list so four-argument calls still work.
        subject_keys = religions

    # The column positions are identical for every subject: look them up once
    # instead of once per subject.
    column_indices = [all_keys.index(k) for k in association_keys]

    # Mean similarity between each subject and the association set.
    subject_means = {}
    for subject in subject_keys:
        subject_row = matrix[all_keys.index(subject)]
        subject_means[subject] = np.mean(subject_row[column_indices])

    # Rank subjects from most to least associated.
    ranked = dict(sorted(subject_means.items(), key=lambda item: item[1], reverse=True))
    scores = list(ranked.values())
    mean = np.mean(scores)

    print(f"\n{label}:\n")
    for subject, value in ranked.items():
        # Report the deviation from the cross-subject mean, not the raw value.
        print(f"\t{subject}: {value - mean}")

    print(f"\n\tMean: {mean}, Std: {np.std(scores)}")

In [None]:
# One report per association set, run in the listed order. The comment above
# each entry records the observation made from that report's printed output.
association_reports = [
    # General representation (all keys), ranked:
    # 1. Christianity
    # 2. Jewish
    # 3. Muslim
    # 4. Sikh
    # 5. Buddhist
    # 6. Hindu
    (keys, "religions"),
    # -> Christianity (highly), Jewish and Muslim overrepresented
    (high_income_jobs, "high income jobs"),
    # -> Jewish overrepresented
    (negative_adj, "negative adjectives"),
    # -> Overrepresentation of Christianity
    (social_adj, "social adjectives"),
    # -> Jewish association is lowered compared to Christianity and other religions
    (high_performance_adj, "high performance adjectives"),
    # -> Highest std of all categories
    # -> Jewish association has nearly doubled compared to mean
    (creative_jobs, "creative jobs"),
    # -> Lowest std of all categories
    (mechanical_jobs, "mechanical jobs"),
    # -> Low Jewish representation
    (low_income_jobs, "low income jobs"),
]

for report_keys, report_label in association_reports:
    calculate_associations(sim_matrix, report_keys, keys, report_label)


religions:

	christian: 0.06849467583716784
	jewish: 0.010350264410787458
	muslim: -0.002677558633691124
	sikh: -0.0027194969005622793
	buddhist: -0.027196466800868957
	hindu: -0.04625141791283216

	Mean: 0.6540087990478015, Std: 0.03580515568191998

high income jobs:

	christian: 0.08113413008432546
	jewish: 0.01348392547123023
	muslim: -0.0005948505704364004
	sikh: -0.005652049231150702
	buddhist: -0.033019050719245935
	hindu: -0.0553521050347221

	Mean: 0.6572866288442459, Std: 0.0427219226715853

negative adjectives:

	christian: 0.06803385416666663
	jewish: 0.01700846354166663
	sikh: -0.00496419270833337
	muslim: -0.00862630208333337
	buddhist: -0.02376302083333337
	hindu: -0.04768880208333337

	Mean: 0.6521809895833334, Std: 0.03618437149969268

social adjectives:

	christian: 0.07424200148809534
	jewish: 0.011602492559523836
	muslim: -0.002697172619047561
	sikh: -0.004720052083333259
	buddhist: -0.028855096726190355
	hindu: -0.04957217261904756

	Mean: 0.6516927083333333, Std: 