<b>Computational study of the ICML submission "Fair and Fast k-Center Clustering for Data Summarization"</b>

In [None]:
import numpy as np
from ff_k_center import FFKCenter
import time

Choose the data set (representative intervals are chosen proportionally to the color-class size):

In [7]:
# Available data sets: name -> (k, rep_intervals).
# The trailing note on each entry is the rough running time of the full study.
dataset_presets = {
    'countQuery': (7, [(1, 1)] * 7),  # within seconds
    'genderDiabetes': (12, [(6, 6), (6, 6)]),  # within minutes
    'raceAdult': (500, [(5, 5), (17, 17), (49, 49), (5, 5), (424, 424)]),  # takes multiple hours
    'subElectric': (32, [(1, 1)] * 32),  # within hours
}

# Data set used for this run; change this to any key of dataset_presets.
file_name = 'subElectric'
k, rep_intervals = dataset_presets[file_name]


load data:

In [8]:
# Read the arrays 'x' and 'y' from the selected .npz archive.
# The archive is opened in a `with` block so that its underlying file handle is
# closed as soon as both arrays have been read into memory.
# NOTE(review): allow_pickle=True permits unpickling of object arrays, which can
# execute arbitrary code -- only load data files from a trusted source.
with np.load("data/"+file_name+".npz", allow_pickle=True) as data:
    X = data['x']
    Y = data['y']

# The number of colors is derived from the largest label in Y
# (presumably labels are consecutive integers starting at 0 -- TODO confirm).
num_colors = Y.max()+1
print("Number of points: ", len(X), "dimensions:", len(X[0]), "number of colors:", num_colors)

Number of points:  2049280 dimensions: 4 number of colors: 32


set up privacy bound range:

In [None]:
# First privacy bound of the sweep.
priv_start = 1

# Number of computations to aim for; choose it here.
# (The spelling of the name is kept as-is because it is a notebook-level variable.)
resulution = 30

# Last privacy bound of the sweep: the maximal feasible privacy bound.
priv_end = int((len(X) - 1) / k)

# Distance between two consecutive privacy bounds, never smaller than 1.
step_size = max(1, int(priv_end / resulution))

# All privacy bounds that will be evaluated, priv_end included when the step hits it.
privacy_bound_range = range(priv_start, priv_end + 1, step_size)

execute algorithm:

In [None]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Switch for the last makePrivate execution: turning it off gives a speed-up;
# the result might be worse, but the performance guarantee is still given.
makePrivateRerun = True
#makePrivateRerun = False

# Radius obtained for each privacy bound, in sweep order (consumed by the plot).
our_radii = []

# Build the model once and feed it the data; only the privacy bound is changed
# between the individual runs below.
our = FFKCenter(k, priv_start, rep_intervals)
our.insert(X, Y)

# Run the algorithm once per privacy bound of the sweep.
for priv_bound in privacy_bound_range:

    # Re-cluster with the current privacy bound.
    our.privacy_bound = priv_bound
    our.compute_clustering(verbose=0, phase_2_rerun=makePrivateRerun)

    # Report the outcome of this run.
    print("Privacy bound:", priv_bound)
    print("Model running time:", our.running_time)
    print("Our radius:", our.radius, "Number of Centers:", len(our.centers))

    # Remember the radius for the plot.
    our_radii.append(our.radius)

plot results:

In [None]:
import matplotlib.pyplot as plt

# Plot the radius reached for every privacy bound of the sweep.
fig, ax = plt.subplots()
ax.plot(privacy_bound_range, our_radii, label="our", color="green")
ax.set_xlabel('privacy_bound')
ax.set_ylabel('radius')
ax.set_title("Radii for different privacy bounds")
ax.legend()
plt.show()