In [2]:
from test_fair_clustering import main
import argparse
import os
import csv
import numpy as np

In [7]:
# Folder (joined onto the current working directory by get_args) and file name
# of the CSV that accumulates one row per completed run.
OUTPUT_FOLDER = "outputs"
CSV_NAME = "results.csv"

# Lambda passed to each run, keyed by dataset and then by clustering option.
# Only the pairs listed here are executed and reported by the cells below;
# insertion order is the order in which they are processed.
lambdas = {
    "Synthetic": {"kmedian": 10, "kmeans": 10, "ncut": 10},
    "Synthetic-unequal": {"kmedian": 10, "kmeans": 10, "ncut": 10},
    # Switched off for Adult: "kmedian": 9000, "kmeans": 9000
    "Adult": {"ncut": 10},
    # Switched off for Bank: "kmedian": 9000, "kmeans": 6000
    "Bank": {"ncut": 40},
    # Switched off for CensusII: "kmedian": 500000, "kmeans": 500000
    "CensusII": {"ncut": 100},
}

# Number of CSV rows wanted per (dataset, clustering option, lambda) setting;
# the run cell only executes as many new runs as are still missing.
n_runs = {
    "Synthetic": 30,
    "Synthetic-unequal": 30,
    "Adult": 3,
    "Bank": 3,
    "CensusII": 3,
}

In [8]:
def get_args(seed=1, dataset="Synthetic-unequal", cluster_option="ncut", lmbda=10):
    """Assemble the argument namespace for a single call to ``main``.

    Args:
        seed: Stored as ``args.seed``.
        dataset: Dataset name, stored as ``args.dataset``.
        cluster_option: Clustering option, stored as ``args.cluster_option``.
        lmbda: Lambda value, stored as ``args.lmbda``.

    Returns:
        An ``argparse.Namespace`` holding the four values above, the plotting
        flags (only ``plot_option_clusters_vs_lambda`` is enabled),
        ``lmbda_tune=False``, and ``data_dir`` / ``output_path`` located under
        the current working directory.
    """
    cwd = os.getcwd()
    return argparse.Namespace(
        plot_option_clusters_vs_lambda=True,
        plot_option_fairness_vs_clusterE=False,
        plot_option_balance_vs_clusterE=False,
        plot_option_convergence=False,
        lmbda_tune=False,
        seed=seed,
        dataset=dataset,
        cluster_option=cluster_option,
        lmbda=lmbda,
        data_dir=os.path.join(cwd, "data"),
        output_path=os.path.join(cwd, OUTPUT_FOLDER),
    )

def make_csv(dir_path, csv_path, fieldnames):
    """Make sure ``csv_path`` exists and begins with a header row.

    ``dir_path`` is created if needed. If ``csv_path`` already exists and
    contains at least one row, it is left untouched; otherwise it is
    (re)written so that it holds only the header built from ``fieldnames``.

    Args:
        dir_path: Directory to create (no error if it already exists).
        csv_path: Path of the CSV file to initialise.
        fieldnames: Iterable of column names for the header.
    """
    os.makedirs(dir_path, exist_ok=True)

    if os.path.isfile(csv_path):
        # newline='' is how the csv module expects its input files to be opened.
        with open(csv_path, "r", newline='') as f:
            # One row is enough to know the file is not empty; stopping there
            # avoids loading the whole results file on every call.
            if next(csv.reader(f), None) is not None:
                return

    with open(csv_path, "w", newline='') as f:
        # list(...) lets callers pass dict views or one-shot iterables.
        csv.DictWriter(f, list(fieldnames)).writeheader()

def run_main(args, csv_name=CSV_NAME):
    """Execute one run via ``main`` and append its metrics to the results CSV.

    Args:
        args: Namespace as produced by ``get_args``; it is passed to ``main``
            and its ``dataset``, ``lmbda``, ``cluster_option``, ``seed``,
            ``lmbda_tune`` and ``output_path`` attributes are read here.
        csv_name: File name of the CSV inside ``args.output_path``.
    """
    outcome = main(args, logging=False, seedable=True)

    # Insertion order below is the column order of the CSV.
    record = {}
    record["dataset"] = args.dataset
    record["N"] = outcome['N']
    record["J"] = outcome['J']
    record["lmbda"] = args.lmbda
    record["Objective"] = outcome["clustering energy (Objective)"]
    record["fairness error"] = outcome["fairness error"]
    record["balance"] = outcome["balance"]
    record["cluster_option"] = args.cluster_option
    record["time"] = outcome["time"]
    record["seed"] = args.seed
    record["lmbda_tune"] = args.lmbda_tune
    record["K"] = outcome['K']

    target = os.path.join(args.output_path, csv_name)
    columns = record.keys()

    # Creates the folder and header on first use, then the row is appended.
    make_csv(args.output_path, target, columns)
    with open(target, "a", newline='') as f:
        csv.DictWriter(f, columns).writerow(record)
    

In [9]:
def compare_entry(args, row):
    """Tell whether a CSV row was produced with the same settings as ``args``.

    The settings compared are ``dataset``, ``lmbda``, ``cluster_option`` and
    ``lmbda_tune``. Each attribute of ``args`` is converted with ``str`` and
    compared to the row's value for the same key.

    Args:
        args: Object exposing the four settings as attributes.
        row: Mapping from column name to string value.

    Returns:
        True when all four settings match, False otherwise.
    """
    compared = ("dataset", "lmbda", "cluster_option", "lmbda_tune")
    return all(str(getattr(args, field)) == row[field] for field in compared)

def find_same_options(csv_name, args):
    """Collect the stored rows that were produced with the settings in ``args``.

    Args:
        csv_name: File name of the results CSV inside ``args.output_path``.
        args: Namespace providing ``output_path`` and the settings checked by
            ``compare_entry``.

    Returns:
        A list of row dicts (as yielded by ``csv.DictReader``) for which
        ``compare_entry`` is true. The list is empty when the CSV does not
        exist yet.
    """
    csv_path = os.path.join(args.output_path, csv_name)

    # The CSV is only created by the first completed run, so on a fresh
    # checkout there is nothing to read yet: report "no entries" rather than
    # raising FileNotFoundError.
    if not os.path.isfile(csv_path):
        return []

    # newline='' is how the csv module expects its input files to be opened.
    with open(csv_path, "r", newline='') as f:
        return [row for row in csv.DictReader(f) if compare_entry(args, row)]

In [12]:
# Top up every configured (dataset, clustering option) pair until the CSV
# holds n_runs[dataset] rows for it, continuing after the highest seed used.
for dataset, option_lambdas in lambdas.items():
    for cluster_option, lmbda in option_lambdas.items():
        args = get_args(dataset=dataset, cluster_option=cluster_option, lmbda=lmbda)
        existing_entries = find_same_options(CSV_NAME, args)

        missing_runs = n_runs[dataset] - len(existing_entries)
        if missing_runs < 1:
            print("enough results for these settings")
            continue

        # The extra 0 makes the first seed 1 when no rows exist yet.
        used_seeds = [int(entry["seed"]) for entry in existing_entries] + [0]
        next_seed = 1 + max(used_seeds)

        for seed in range(next_seed, next_seed + missing_runs):
            args.seed = seed
            run_main(args, CSV_NAME)

enough results for these settings
enough results for these settings
enough results for these settings
enough results for these settings
enough results for these settings
enough results for these settings
enough results for these settings
Seed: 3
Cluster number for dataset Bank is 10
Balance of the dataset 0.18501283697047496
Number of points in the dataset 41108
Demographic-probabilites: [0.11219227400992507, 0.2814050793032986, 0.6064026466867763]
Demographic-numbers per group: [4612, 11568, 24928]
Generating initial seeds
Inside Lambda  40
Inside Bound Update . . .
[DONE]               

 Elapsed Time in bound_update 225.39225559999977
fairness_error = 0.5969
compute energy
fair clustering energy = 370.8274530339004
clustering energy = 0.5871415387034595
Inside ncut update
Inside Bound Update . . .
[DONE]               

 Elapsed Time in bound_update 235.13479619999998
fairness_error = 0.5980
compute energy
fair clustering energy = 370.8210300731908
clustering energy = 0.550901427123

In [13]:
# Fetch results: for every configured dataset and clustering option, print the
# mean and standard deviation of each reported metric over the stored runs.
for dataset in lambdas:
    print(f"\n\n{dataset}")
    for cluster_option, lmbda in lambdas[dataset].items():
        print("\n" + cluster_option.upper())

        args = get_args(dataset=dataset, cluster_option=cluster_option, lmbda=lmbda)
        existing_entries = find_same_options(CSV_NAME, args)

        if not existing_entries:
            print("no data yet")
            continue

        for key in ("Objective", "fairness error", "balance"):
            # CSV values are strings; convert before aggregating.
            data = [float(entry[key]) for entry in existing_entries]
            mean = np.mean(data)
            # np.std is called with its default ddof, i.e. the population SD.
            std = np.std(data)

            # "<20" left-aligns the metric name in a 20-character column.
            print(f"{key:<20} M = {mean:.2f}     SD = {std:.2f}")



Synthetic

KMEDIAN
Objective            M = 289.08     SD = 2.03
fairness error       M = 0.82     SD = 1.05
balance              M = 0.34     SD = 0.21

KMEANS
Objective            M = 203.66     SD = 2.55
fairness error       M = 2.43     SD = 1.47
balance              M = 0.27     SD = 0.44

NCUT
Objective            M = 0.20     SD = 0.10
fairness error       M = 0.00     SD = 0.00
balance              M = 0.99     SD = 0.01


Synthetic-unequal

KMEDIAN
Objective            M = 174.82     SD = 0.00
fairness error       M = 0.00     SD = 0.00
balance              M = 0.33     SD = 0.00

KMEANS
Objective            M = 169.15     SD = 28.20
fairness error       M = 0.25     SD = 0.74
balance              M = 0.30     SD = 0.10

NCUT
Objective            M = 0.02     SD = 0.03
fairness error       M = 0.00     SD = 0.00
balance              M = 0.32     SD = 0.01


Adult

NCUT
Objective            M = 0.70     SD = 0.01
fairness error       M = 0.22     SD = 0.01
balance            

In [None]:
# run_main(get_args())