In [1]:
import numpy as np
import pandas as pd
import math

import src.hdmm.workload as workload
import src.census_workloads as census
from src.workload_selection import workload_selection
import online_workloads as online_workloads

# Database

In [2]:
data_path = "migration_tworace.csv"
x_data = pd.read_csv(data_path, header=None).to_numpy().T[1]
n = x_data.shape[0]
n

86

# Workloads

In [3]:
W_name = ['identity', 'total', 'race1', 'race2', 'race3', 'custom', 'prefix_sum']
W_lst = [online_workloads.identity(n), online_workloads.total(n), online_workloads.race1(), online_workloads.race2(), online_workloads.race3(), online_workloads.custom(n), online_workloads.prefix_sum(n)]

def print_workload(workload_name, workload):
    print(f'---Workload: {workload_name}---')
    print(f'Shape: {workload.shape}')
    print(f'Workload: \n{workload}\n')
    
def print_workloads():
    for i in range(7):
        print_workload(W_name[i], W_lst[i])
        
print_workloads()


---Workload: identity---
Shape: (86, 86)
Workload: 
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]

---Workload: total---
Shape: (1, 86)
Workload: 
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]

---Workload: race1---
Shape: (7, 64)
Workload: 
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.

In [4]:
len(online_workloads.identity(n))

86

In [5]:
x_data

array([412, 333, 285, 231, 202, 174, 160, 142, 146, 149, 145, 181, 174,
       190, 213, 287, 372, 499, 619, 715, 785, 821, 822, 816, 799, 742,
       717, 697, 658, 593, 564, 519, 447, 403, 388, 365, 336, 306, 311,
       289, 261, 231, 213, 196, 194, 170, 175, 168, 149, 142, 131, 119,
       112, 118, 114, 116, 112, 114, 106, 111, 109, 112, 113, 109, 104,
       108, 108,  94,  91,  81,  81,  72,  68,  63,  56,  46,  41,  38,
        34,  28,  23,  22,  18,  18,  16,  41])

It seems to update too much during the update steps (step 0-5). Using the example database of [1, 2, 3, 4, 5] and a workload of two stacked 5x5 identity matrices, during the first 5 queries consisting of the first identity matrix, the database updates the first few vectors way too much. 

In [131]:
# pmw_naive
# write version of PMW where analysts can run out of privacy budget if they use too much of others' budgets
# this version everyone shares the same number of update steps 
def new_pmw_naive(workload, x, analyst_labels, T, eps=0.01, total_k=None, 
         show_messages=False, to_return='pd', show_plot=False, show_failure_step=False, eta = None):
    """
    Implement Private Multiplicative Weights Mechanism (PMW) on a workload of
    linear queries where analysts can run out of privacy budget if they use too much of others'. 
    
    In other words, 

    Algorithm Parameters: 
    - workload = workload of queries (M x k numpy array)
    - x = true database (M x 1 numpy array)
    - T = update threshold
    - eps = privacy budget
    - total_k = total number of update steps alloted for the entire group
    - analyst_labels = list of analyst names corresponding to each query in the workload
    
    Output Controls: 
    - show_messages argument determines whether the function will print information such as 
    error scale, threshold, update steps used, etc.
    - to_return argument determines what the function will return. 
        - if 'pd', pmw() returns pandas df with test data for each 
        query in the workload(showing query, d_t_hat, updated, algo_ans, real_ans, 
        abs_error, rel_error). 
        - if 'update_count', pmw() returns the update count for the total
        amount of queries.
    - show_plot - T/F whether the function will display a plot
    - show_failure_step - T/F whether function prints what step failure mode is reached
    """ 
    
    # initialize constants
    m = x.size  # database len
    n = x.sum()
    if(eta == None):
        eta = (math.log(m, np.e) / ((math.sqrt(n))) )
    #print(eta)
    delta = 1 / (n * math.log(n, np.e))
    x_norm = x / np.sum(x)
    
    # initialize synthetic databases at time 0 (prior to any queries)
    x_t = np.ones(m) / m
    y_t = np.ones(m) / m

    # initialize tracker lists to construct pandas dataframe at the end 
    x_list = [x_t] # create a list of x_t synthetic database at every time step
    update_list = []
    update_count = 0
    pmw_answers = []
    update_times = [] # record times that database is updated
    d_t_hat_list = []
    
    # initialize total_k, the total number of update steps if not default
    if total_k == None:
        total_k = round(n * math.log(math.sqrt(m))/770)
        print(f'{total_k=}')
    
    def lazy_round():
        """
        "Lazy Round" of querying using the stored synthetic database, x_t, in list x_list.
        
        We call this the lazy round because it is contrasted with the updated step where we update the 
        sythetic database and answer the query using the real database.
        """
        update_list.append('no')
        answer = np.dot(query, x_list[time])
        if answer < 0:
            pmw_answers.append(0)
        else: 
            pmw_answers.append(answer)
        x_list.append(x_list[time].round(3))
    
    # inititate first instance of SVT with half the budget and k updates; will be reset in the main loop
    SVTtrigger = False 
    SVTepsilon1 = ((eps/2)/2)
    SVTepsilon2 = ((eps/2)/2)
    rho = np.random.laplace(loc=0, scale=(1/SVTepsilon1), size=1)[0]
    #print(rho + T)
    
    
    for time, query in enumerate(workload):
        
        analyst = analyst_labels[time]
        
        # Do one round of sparse vector technique; compute noisy answer by adding Laplacian noise
        A_t = np.random.laplace(loc=0, scale=(total_k/SVTepsilon2), size=1)[0]
        a_t_hat = (np.dot(query, x_norm)*n ) + A_t
        d_t_hat = a_t_hat - (n*np.dot(query, x_list[time]))
        
        # LAZY ROUND: QUERY USING THE SYNTHETIC DATABASE
        if (abs(d_t_hat) <= T + rho):
            d_t_hat_list.append(d_t_hat)
            lazy_round()

        # UPDATE ROUND: UPDATE SYNTHETIC DATABASE AND RETURN NOISY ANSWER, A_T-HAT
        else:
            # noise
            A_t = np.random.laplace(loc=0, scale=(2*total_k/eps), size=1)[0]
            
            # noisy answer
            a_t_hat = (np.dot(query, x_norm)*n ) + A_t
            d_t_hat = a_t_hat - (n*np.dot(query, x_list[time]))
            d_t_hat_list.append(d_t_hat)
            update_times.append(time)
            
            # step a
            if d_t_hat < 0:
                r_t = query
            else:
                r_t = np.ones(m) - query
            for i in range(len(y_t)):
                y_t[i] = x_list[time][i] * math.exp(-( eta * r_t[i]))# eta is the learning rate
            
            # step b
            x_t = y_t / np.sum(y_t)
            update_count = update_list.count('yes')
            
            # if threshold for num updates is reached, just do a lazy round (synthetic database) answer
            if total_k == 0: 
                if show_failure_step:
                    print(f'Failure mode reached at query number {time}: {query}')
                lazy_round()
                
            # if there are still update steps that the analyst can use, 
            # 1. update the synthetic database
            # 2. answer the query using the noisy answer from the database itself 
            else: 
                x_list.append(x_t.round(3))
                update_list.append('yes') # increment number of updates counter
                answer = a_t_hat / np.sum(x)
                
                if answer < 0:
                    pmw_answers.append(0)
                else: 
                    pmw_answers.append(answer)
                
                total_k -= 1 # use one of the total update steps
        
        #print(f'{x_list[time] - x_list[time - 1]=}')
        
        
    update_count = update_list.count('yes')      

    # calculate error
    real_ans = np.matmul(workload, x_norm)
    abs_error = np.abs(pmw_answers - real_ans)
    rel_error = np.abs(abs_error / np.where(real_ans == 0, 0.000001,
                                                real_ans))
    
    if show_messages:
        np.set_printoptions(suppress=True)
        """Print inputes/outputs to analyze each query"""
        print(f'Original database: {x}\n')
        print(f'Normalized database: {x_norm}\n')
        print(f'Synthetic Database (before) = {x_list[0]}\n')
        print(f'Synthetic Database (after) = {x_list[len(x_list) - 1]}\n')
        print(f'Difference btw. Final Synthetic and true database = {x_list[len(x_list) - 1] - x_norm}\n')
        print(f'Update Count = {update_count}\n')
        print(f'{T=}\n')
        print(f'Error Scale Query Answer= {2*((2*total_k/eps)**2)}\n')
        print(f'Error Scale SVT= {2*((2*total_k/SVTepsilon2)**2)}\n')
        print(f'Update Parameter Scale = {eta}\n')
        print(f'{delta=}\n')
        
    if show_plot: 
        plt.title('Error across queries:')
        rel_line, = plt.plot(rel_error, label='Relative Error')
        abs_line, = plt.plot(abs_error, label='Absolute Error')
        for xc in update_times:
            plt.axvline(x=xc, color='red', label='Update Times', linestyle='dashed')
        plt.legend(handles=[abs_line,rel_line])
        plt.xticks(range(0, len(workload), round(len(workload)/5)))
    
    if to_return == "pd":
        # hacky fix: remove the first synthetic database to keep length of lists consistent with the
        # other lists that comprise of the pandas dataframe
        x_list.pop(0).tolist() 
        d = {
            'algo_ans': pmw_answers,
            'real_ans': real_ans.tolist(),
            'queries': workload.tolist(), 
            'updated': update_list,
            'abs_error': abs_error,               
            'rel_error': rel_error,
            'synthetic database': x_list,
            'analyst': analyst_labels,
            'd_t_hat': d_t_hat_list, 

             }
        test_data = pd.DataFrame(data=d)
        #test_data = test_data.round(3)
        return test_data
    
    if to_return == "error":
        d = {'analyst': analyst_labels,
             'abs_error': abs_error,               
             'rel_error': rel_error,}
        data = pd.DataFrame(data=d)
        data = data.round(3)
        
        analyst_error = {}
        for analyst in list(sorted(analyst_labels)):
            analyst_error[analyst] = data[data.analyst==analyst]['abs_error'].sum()
        return analyst_error

In [114]:
tenq = np.vstack((online_workloads.identity(5), online_workloads.identity(5)))
fiftyq = np.vstack((tenq, 
                   tenq, 
                   tenq, 
                   tenq, 
                   tenq))

In [8]:
# initialize databases

import pandas as pd
data_path = "migration_tworace.csv"
x_race = pd.read_csv(data_path, header=None).iloc[:, 1].to_numpy()
n = x_race.shape[0]
x_race

x_example = np.array([1000, 2000, 3000, 4000, 5000])
new_x = np.array([.1, .15, .2, .25, .3]) * 1500

In [9]:
x_example

array([1000, 2000, 3000, 4000, 5000])

In [10]:
new_x

array([150., 225., 300., 375., 450.])

In [132]:
ten_identity_q = np.vstack((online_workloads.identity(10), 
                            online_workloads.identity(10),
                            online_workloads.identity(10),
                            online_workloads.identity(10),
                            online_workloads.identity(10),
                            online_workloads.identity(10),
                            online_workloads.identity(10),
                            online_workloads.identity(10),
                            online_workloads.identity(10),
                            online_workloads.identity(10)))

hundred_identity_q = np.vstack((ten_identity_q, 
                               ten_identity_q, 
                               ten_identity_q, 
                               ten_identity_q, 
                               ten_identity_q, 
                               ten_identity_q, 
                               ten_identity_q, 
                               ten_identity_q, 
                               ten_identity_q, 
                               ten_identity_q))

print(f'{ten_identity_q.shape=}')
print(f'{hundred_identity_q.shape=}')

ten_identity_q.shape=(100, 10)
hundred_identity_q.shape=(1000, 10)


In [133]:
x_race_first_ten = x_race[:10]

# try identity on the race database
new_pmw_naive(hundred_identity_q,
     x_race_first_ten, ['A'] * 1000, eps=1, T=40, total_k = 100, show_messages=True)

Original database: [412 333 285 231 202 174 160 142 146 149]

Normalized database: [0.18442256 0.14905998 0.12757386 0.10340197 0.09042077 0.0778872
 0.07162041 0.06356312 0.06535363 0.06669651]

Synthetic Database (before) = [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]

Synthetic Database (after) = [0.161 0.137 0.12  0.102 0.08  0.088 0.086 0.08  0.074 0.072]

Difference btw. Final Synthetic and true database = [-0.02342256 -0.01205998 -0.00757386 -0.00140197 -0.01042077  0.0101128
  0.01437959  0.01643688  0.00864637  0.00530349]

Update Count = 100

T=40

Error Scale Query Answer= 0.0

Error Scale SVT= 0.0

Update Parameter Scale = 0.048716278470739886

delta=5.8046389258630686e-05



Unnamed: 0,algo_ans,real_ans,queries,updated,abs_error,rel_error,synthetic database,analyst,d_t_hat
0,0.430417,0.184423,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",yes,0.245995,1.333864,"[0.104, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1...",A,738.152078
1,0.150512,0.149060,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",yes,0.001452,0.009741,"[0.103, 0.104, 0.099, 0.099, 0.099, 0.099, 0.0...",A,112.843859
2,0.124722,0.127574,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",yes,0.002852,0.022354,"[0.103, 0.104, 0.104, 0.099, 0.099, 0.099, 0.0...",A,57.462981
3,0.013771,0.103402,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",yes,0.089631,0.866818,"[0.103, 0.104, 0.104, 0.094, 0.099, 0.099, 0.0...",A,-190.400847
4,0.042522,0.090421,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",yes,0.047899,0.529730,"[0.104, 0.105, 0.105, 0.095, 0.095, 0.1, 0.1, ...",A,-126.171442
...,...,...,...,...,...,...,...,...,...
995,0.088000,0.077887,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",no,0.010113,0.129839,"[0.161, 0.137, 0.12, 0.102, 0.08, 0.088, 0.086...",A,-22.592000
996,0.086000,0.071620,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",no,0.014380,0.200775,"[0.161, 0.137, 0.12, 0.102, 0.08, 0.088, 0.086...",A,-32.124000
997,0.080000,0.063563,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...",no,0.016437,0.258592,"[0.161, 0.137, 0.12, 0.102, 0.08, 0.088, 0.086...",A,-36.720000
998,0.074000,0.065354,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",no,0.008646,0.132301,"[0.161, 0.137, 0.12, 0.102, 0.08, 0.088, 0.086...",A,-19.316000


In [134]:
new_pmw_naive(fiftyq, x_example, ['A'] * 50, eps=1, T=40)# , total_k = 25)

total_k=16


Unnamed: 0,algo_ans,real_ans,queries,updated,abs_error,rel_error,synthetic database,analyst,d_t_hat
0,0.070675,0.066667,"[1.0, 0.0, 0.0, 0.0, 0.0]",yes,0.004008,0.060118,"[0.198, 0.201, 0.201, 0.201, 0.201]",A,-1939.881724
1,0.133047,0.133333,"[0.0, 1.0, 0.0, 0.0, 0.0]",yes,0.000287,0.00215,"[0.198, 0.198, 0.201, 0.201, 0.201]",A,-1019.300574
2,0.201,0.2,"[0.0, 0.0, 1.0, 0.0, 0.0]",no,0.001,0.005,"[0.198, 0.198, 0.201, 0.201, 0.201]",A,4.892521
3,0.265353,0.266667,"[0.0, 0.0, 0.0, 1.0, 0.0]",yes,0.001314,0.004928,"[0.198, 0.198, 0.201, 0.203, 0.201]",A,965.289622
4,0.329103,0.333333,"[0.0, 0.0, 0.0, 0.0, 1.0]",yes,0.004231,0.012692,"[0.197, 0.197, 0.2, 0.202, 0.203]",A,1921.537759
5,0.066944,0.066667,"[1.0, 0.0, 0.0, 0.0, 0.0]",yes,0.000277,0.004156,"[0.195, 0.198, 0.201, 0.203, 0.204]",A,-1950.844284
6,0.133754,0.133333,"[0.0, 1.0, 0.0, 0.0, 0.0]",yes,0.000421,0.003158,"[0.195, 0.196, 0.201, 0.203, 0.204]",A,-963.68398
7,0.201,0.2,"[0.0, 0.0, 1.0, 0.0, 0.0]",no,0.001,0.005,"[0.195, 0.196, 0.201, 0.203, 0.204]",A,-40.657764
8,0.267618,0.266667,"[0.0, 0.0, 0.0, 1.0, 0.0]",yes,0.000951,0.003567,"[0.195, 0.196, 0.201, 0.205, 0.204]",A,969.268082
9,0.334455,0.333333,"[0.0, 0.0, 0.0, 0.0, 1.0]",yes,0.001122,0.003365,"[0.194, 0.195, 0.2, 0.204, 0.206]",A,1956.827141


In [13]:
# pmw_split
def pmw2(workload, x, analyst_labels, T, eps=0.01, k=None, 
         show_messages=False, to_return='pd', show_plot=False, show_failure_step=False):
    """
    Implement Private Multiplicative Weights Mechanism (PMW) on a workload of
    linear queries. 

    Algorithm Parameters: 
    - workload = workload of queries (M x k numpy array)
    - x = true database (M x 1 numpy array)
    - T = update threshold
    - eps = privacy budget
    - k = number of update steps PER ANALYST
    - analyst_labels = list of analyst names corresponding to each query in the workload
    
    Output Controls: 
    - show_messages argument determines whether the function will print information such as 
    error scale, threshold, update steps used, etc.
    - to_return argument determines what the function will return. 
        - if 'pd', pmw() returns pandas df with test data for each 
        query in the workload(showing query, d_t_hat, updated, algo_ans, real_ans, 
        abs_error, rel_error). 
        - if 'update_count', pmw() returns the update count for the total
        amount of queries.
    - show_plot - T/F whether the function will display a plot
    - show_failure_step - T/F whether function prints what step failure mode is reached
    """ 
    
    update_steps = {}
    
    # initialize constants
    m = x.size  # database len
    n = x.sum()
    eta = (math.log(m, np.e) ** (1 / 4)) / (math.sqrt(n))
    delta = 1 / (n * math.log(n, np.e))
    x_norm = x / np.sum(x)
    
    if k == None: # if no k is specified, use the default k value split into the number of analysts
        k = round(n * math.log(math.sqrt(m))/770/len(set(analyst_labels)))
        
    for analyst in list(set(analyst_labels)): 
        update_steps[analyst] = k # each analyst starts with k update steps
    
    # initialize synthetic databases at time 0 (prior to any queries)
    x_t = np.ones(m) / m
    y_t = np.ones(m) / m

    # initialize tracker lists to construct pandas dataframe at the end 
    x_list = [x_t] # create a list of x_t synthetic database at every time step
    update_list = []
    update_count = 0
    pmw_answers = []
    update_times = [] # record times that database is updated
    d_t_hat_list = []
    
    def lazy_round():
        """
        "Lazy Round" of querying using the stored synthetic database, x_t, in list x_list.
        
        We call this the lazy round because it is contrasted with the updated step where we update the 
        sythetic database and answer the query using the real database.
        """
        update_list.append('no')
        answer = np.dot(query, x_list[time])
        if answer < 0:
            pmw_answers.append(0)
        else: 
            pmw_answers.append(answer)
        x_list.append(x_list[time].round(3))
    
    # inititate first instance of SVT with half the budget and k updates; will be reset in the main loop
    SVTtrigger = False 
    SVTepsilon1 = ((eps/2)/2)
    SVTepsilon2 = ((eps/2)/2)
    rho = np.random.laplace(loc=0, scale=(1/SVTepsilon1), size=1)[0]
    
    for time, query in enumerate(workload):
        
        analyst = analyst_labels[time]
        
        # Do one round of sparse vector technique; compute noisy answer by adding Laplacian noise
        A_t = np.random.laplace(loc=0, scale=(k/SVTepsilon2), size=1)[0]
        a_t_hat = (np.dot(query, x_norm)*n ) + A_t
        d_t_hat = a_t_hat - (n*np.dot(query, x_list[time]))
        
        # LAZY ROUND: QUERY USING THE SYNTHETIC DATABASE
        if (abs(d_t_hat) <= T + rho):
            d_t_hat_list.append(d_t_hat)
            lazy_round()

        # UPDATE ROUND: UPDATE SYNTHETIC DATABASE AND RETURN NOISY ANSWER, A_T-HAT
        else:
            # noise
            A_t = np.random.laplace(loc=0, scale=(2*k/eps), size=1)[0]
            
            # noisy answer
            a_t_hat = (np.dot(query, x_norm)*n ) + A_t
            d_t_hat = a_t_hat - (n*np.dot(query, x_list[time]))
            d_t_hat_list.append(d_t_hat)
            update_times.append(time)
            
            # step a
            if d_t_hat < 0:
                r_t = query
            else:
                r_t = np.ones(m) - query
            for i in range(len(y_t)):
                y_t[i] = x_list[time][i] * math.exp((d_t_hat/(2*n)) * query[i]) * 20 # 20 is the learning rate
            
            # step b
            x_t = y_t / np.sum(y_t)
            update_count = update_list.count('yes')
            
            # if threshold for num updates is reached, just do a lazy round (synthetic database) answer
            if update_steps[analyst] == 0: 
                if show_failure_step:
                    print(f'Failure mode reached at query number {time}: {query}')
                lazy_round()
                
            # if there are still update steps that the analyst can use, 
            # 1. update the synthetic database
            # 2. answer the query using the noisy answer from the database itself 
            else: 
                x_list.append(x_t.round(3))
                update_list.append('yes') # increment number of updates counter
                answer = a_t_hat / np.sum(x)
                
                if answer < 0:
                    pmw_answers.append(0)
                else: 
                    pmw_answers.append(answer)
                
                update_steps[analyst] -= 1 # use one of analyst's update steps

    update_count = update_list.count('yes')      

    # calculate error
    real_ans = np.matmul(workload, x_norm)
    abs_error = np.abs(pmw_answers - real_ans)
    rel_error = np.abs(abs_error / np.where(real_ans == 0, 0.000001,
                                                real_ans))
    
    if show_messages:
        np.set_printoptions(suppress=True)
        """Print inputes/outputs to analyze each query"""
        print(f'Original database: {x}\n')
        print(f'Normalized database: {x_norm}\n')
        print(f'Updated Database = {x_t}\n')
        print(f'Update Count = {update_count}\n')
        print(f'{T=}\n')
        print(f'Error Scale Query Answer= {2*((2*k/eps)**2)}\n')
        print(f'Error Scale SVT= {2*((2*k/SVTepsilon2)**2)}\n')
        print(f'Update Parameter Scale = {eta}\n')
        print(f'{delta=}\n')
        
    if show_plot: 
        plt.title('Error across queries:')
        rel_line, = plt.plot(rel_error, label='Relative Error')
        abs_line, = plt.plot(abs_error, label='Absolute Error')
        for xc in update_times:
            plt.axvline(x=xc, color='red', label='Update Times', linestyle='dashed')
        plt.legend(handles=[abs_line,rel_line])
        plt.xticks(range(0, len(workload), round(len(workload)/5)))
    
    if to_return == "pd":
        # hacky fix: remove the first synthetic database to keep length of lists consistent with the
        # other lists that comprise of the pandas dataframe
        x_list.pop(0).tolist() 
        d = {
            'algo_ans': pmw_answers,
            'real_ans': real_ans.tolist(),
            'queries': workload.tolist(), 
            'updated': update_list,
            'abs_error': abs_error,               
            'rel_error': rel_error,
            'synthetic database': x_list,
            'analyst': analyst_labels,
            'd_t_hat': d_t_hat_list, 

             }
        test_data = pd.DataFrame(data=d)
        test_data = test_data.round(3)
        return test_data
    
    if to_return == "error":
        d = {'analyst': analyst_labels,
             'abs_error': abs_error,               
             'rel_error': rel_error,}
        data = pd.DataFrame(data=d)
        data = data.round(3)
        
        analyst_error = {}
        for analyst in list(sorted(analyst_labels)):
            analyst_error[analyst] = data[data.analyst==analyst]['abs_error'].sum()
        return analyst_error
    
pmw2(np.vstack((online_workloads.identity(5), online_workloads.identity(5))), 
     x_example, ['A'] * 5 + ['B'] * 5, eps=1, T=40)# , to_return='error')


Unnamed: 0,algo_ans,real_ans,queries,updated,abs_error,rel_error,synthetic database,analyst,d_t_hat
0,0.067,0.067,"[1.0, 0.0, 0.0, 0.0, 0.0]",yes,0.0,0.0,"[0.19, 0.203, 0.203, 0.203, 0.203]",A,-2000.299
1,0.134,0.133,"[0.0, 1.0, 0.0, 0.0, 0.0]",yes,0.001,0.005,"[0.191, 0.197, 0.204, 0.204, 0.204]",A,-1035.881
2,0.204,0.2,"[0.0, 0.0, 1.0, 0.0, 0.0]",no,0.004,0.02,"[0.191, 0.197, 0.204, 0.204, 0.204]",A,-12.823
3,0.267,0.267,"[0.0, 0.0, 0.0, 1.0, 0.0]",yes,0.0,0.0,"[0.19, 0.196, 0.203, 0.209, 0.203]",A,940.655
4,0.333,0.333,"[0.0, 0.0, 0.0, 0.0, 1.0]",yes,0.0,0.0,"[0.187, 0.193, 0.2, 0.206, 0.214]",A,1953.78
5,0.066,0.067,"[1.0, 0.0, 0.0, 0.0, 0.0]",yes,0.0,0.006,"[0.178, 0.195, 0.202, 0.208, 0.216]",B,-1811.141
6,0.132,0.133,"[0.0, 1.0, 0.0, 0.0, 0.0]",yes,0.002,0.012,"[0.179, 0.19, 0.203, 0.209, 0.218]",B,-949.792
7,0.199,0.2,"[0.0, 0.0, 1.0, 0.0, 0.0]",yes,0.001,0.004,"[0.179, 0.19, 0.203, 0.209, 0.218]",B,-55.546
8,0.265,0.267,"[0.0, 0.0, 0.0, 1.0, 0.0]",yes,0.001,0.006,"[0.178, 0.189, 0.202, 0.214, 0.217]",B,842.533
9,0.333,0.333,"[0.0, 0.0, 0.0, 0.0, 1.0]",yes,0.001,0.002,"[0.176, 0.187, 0.199, 0.211, 0.227]",B,1733.063


In [14]:
# pmw_independent: write pmw for one person. 
# create wrapper function called pmw_independent() that takes in the workloads and workload labels. Run PMW for each analyst, separate their workloads based on analysts. 

def pmw_independent(w, input_x, analyst_labels, input_T, input_eps=0.01, input_k=5):
    """
    Wrapper function that calls pmw2() to simulate PMW for each independent person. 
    
    Takes a stream of workloads and analyst labels and separates them into distinct workloads for each analyst. Runs
    pmw2() on that particular workload for each analyst. Returns a dictionary o 
    """
    indices = {} # k: analyst, v: row indices of queries in the workloads
    for i, analyst in enumerate(analyst_labels):
        if analyst not in indices.keys(): 
            indices[analyst] = []
        indices[analyst].append(i)

    workloads = {} # k: analyst, v: the analyst's workload
    for analyst in indices.keys():
        workloads[analyst] = w[indices[analyst], :]
    print(workloads)

    all_analyst_error_dic = {}
    
    for analyst in workloads.keys():
        single_analyst_error = pmw2(workload=workloads[analyst], 
                                    x=input_x, 
                                    T=input_T, 
                                    k = input_k,
                                    analyst_labels=[analyst]*len(workloads[analyst]), 
                                    to_return="error",
                                    show_messages=False)
        all_analyst_error_dic.update(single_analyst_error)
    return all_analyst_error_dic
             
        
pmw_independent(np.vstack((online_workloads.identity(5), 
                           online_workloads.identity(5))), 
                input_x=x_example, 
                input_T=40, 
                input_eps=1, 
                analyst_labels=['A'] * 2 + ['B'] * 6 + ['A'] * 2, 
                )
    

{'A': array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]]), 'B': array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.]])}


{'A': 0.18900000000000003, 'B': 0.395}

In [128]:
# pmw_naive
# write version of PMW where analysts can run out of privacy budget if they use too much of others' budgets
# this version everyone shares the same number of update steps 
def pmw_naive(workload, x, analyst_labels, T, eps=0.01, total_k=None, 
         show_messages=False, to_return='pd', show_plot=False, show_failure_step=False, eta = 1):
    """
    Implement Private Multiplicative Weights Mechanism (PMW) on a workload of
    linear queries where analysts can run out of privacy budget if they use too much of others'. 
    
    In other words, 

    Algorithm Parameters: 
    - workload = workload of queries (M x k numpy array)
    - x = true database (M x 1 numpy array)
    - T = update threshold
    - eps = privacy budget
    - total_k = total number of update steps alloted for the entire group
    - analyst_labels = list of analyst names corresponding to each query in the workload
    
    Output Controls: 
    - show_messages argument determines whether the function will print information such as 
    error scale, threshold, update steps used, etc.
    - to_return argument determines what the function will return. 
        - if 'pd', pmw() returns pandas df with test data for each 
        query in the workload(showing query, d_t_hat, updated, algo_ans, real_ans, 
        abs_error, rel_error). 
        - if 'update_count', pmw() returns the update count for the total
        amount of queries.
    - show_plot - T/F whether the function will display a plot
    - show_failure_step - T/F whether function prints what step failure mode is reached
    """ 
    
    # initialize constants
    m = x.size  # database len
    n = x.sum()
    eta = (math.log(m, np.e) ** (1 / 4)) / (math.sqrt(n))
    delta = 1 / (n * math.log(n, np.e))
    x_norm = x / np.sum(x)
    
    # initialize synthetic databases at time 0 (prior to any queries)
    x_t = np.ones(m) / m
    y_t = np.ones(m) / m

    # initialize tracker lists to construct pandas dataframe at the end 
    x_list = [x_t] # create a list of x_t synthetic database at every time step
    update_list = []
    update_count = 0
    pmw_answers = []
    update_times = [] # record times that database is updated
    d_t_hat_list = []
    
    # initialize total_k, the total number of update steps if not default
    if total_k == None:
        total_k = round(n * math.log(math.sqrt(m))/770)
        print(f'{total_k=}')
    
    def lazy_round():
        """
        "Lazy Round" of querying using the stored synthetic database, x_t, in list x_list.
        
        We call this the lazy round because it is contrasted with the updated step where we update the 
        sythetic database and answer the query using the real database.
        """
        update_list.append('no')
        answer = np.dot(query, x_list[time])
        if answer < 0:
            pmw_answers.append(0)
        else: 
            pmw_answers.append(answer)
        x_list.append(x_list[time].round(3))
    
    # inititate first instance of SVT with half the budget and k updates; will be reset in the main loop
    SVTtrigger = False 
    SVTepsilon1 = ((eps/2)/2)
    SVTepsilon2 = ((eps/2)/2)
    rho = np.random.laplace(loc=0, scale=(1/SVTepsilon1), size=1)[0]
    
    for time, query in enumerate(workload):
        
        analyst = analyst_labels[time]
        
        # Do one round of sparse vector technique; compute noisy answer by adding Laplacian noise
        A_t = np.random.laplace(loc=0, scale=(total_k/SVTepsilon2), size=1)[0]
        a_t_hat = (np.dot(query, x_norm)*n ) + A_t
        d_t_hat = a_t_hat - (n*np.dot(query, x_list[time]))
        
        # LAZY ROUND: QUERY USING THE SYNTHETIC DATABASE
        if (abs(d_t_hat) <= T + rho):
            d_t_hat_list.append(d_t_hat)
            lazy_round()

        # UPDATE ROUND: UPDATE SYNTHETIC DATABASE AND RETURN NOISY ANSWER, A_T-HAT
        else:
            # noise
            A_t = np.random.laplace(loc=0, scale=(2*total_k/eps), size=1)[0]
            
            # noisy answer
            a_t_hat = (np.dot(query, x_norm)*n ) + A_t
            d_t_hat = a_t_hat - (n*np.dot(query, x_list[time]))
            d_t_hat_list.append(d_t_hat)
            update_times.append(time)
            
            # step a
            if d_t_hat < 0:
                r_t = query
            else:
                r_t = np.ones(m) - query
            for i in range(len(y_t)):
                if query[i] != 0:
                    y_t[i] = x_list[time][i] * math.exp((d_t_hat/(2*n)) * query[i] * -eta) # eta is the learning rate
                else: 
                    y_t[i] = x_list[time][i]
                #print(f'{query[i]=}')
                
            
            #print(f'{y_t - x_t=}')
            
            # step b
            x_t = y_t / np.sum(y_t)
            update_count = update_list.count('yes')
            
            #print(f'{y_t=}')
            #print(f'{sum(y_t)=}')
            #print(f'{sum(x_t)=}')
            
            # if threshold for num updates is reached, just do a lazy round (synthetic database) answer
            if total_k == 0: 
                if show_failure_step:
                    print(f'Failure mode reached at query number {time}: {query}')
                lazy_round()
                
            # if there are still update steps that the analyst can use, 
            # 1. update the synthetic database
            # 2. answer the query using the noisy answer from the database itself 
            else: 
                x_list.append(x_t.round(3))
                update_list.append('yes') # increment number of updates counter
                answer = a_t_hat / np.sum(x)
                
                if answer < 0:
                    pmw_answers.append(0)
                else: 
                    pmw_answers.append(answer)
                
                total_k -= 1 # use one of the total update steps
        
        #print(f'{x_list[time] - x_list[time - 1]=}')
        
        
    update_count = update_list.count('yes')      

    # calculate error
    real_ans = np.matmul(workload, x_norm)
    abs_error = np.abs(pmw_answers - real_ans)
    rel_error = np.abs(abs_error / np.where(real_ans == 0, 0.000001,
                                                real_ans))
    
    if show_messages:
        np.set_printoptions(suppress=True)
        """Print inputes/outputs to analyze each query"""
        print(f'Original database: {x}\n')
        print(f'Normalized database: {x_norm}\n')
        print(f'Updated Database = {x_list[len(x_list) - 1]}\n')
        print(f'Update Count = {update_count}\n')
        print(f'{T=}\n')
        print(f'Error Scale Query Answer= {2*((2*total_k/eps)**2)}\n')
        print(f'Error Scale SVT= {2*((2*total_k/SVTepsilon2)**2)}\n')
        print(f'Update Parameter Scale = {eta}\n')
        print(f'{delta=}\n')
        
    if show_plot: 
        plt.title('Error across queries:')
        rel_line, = plt.plot(rel_error, label='Relative Error')
        abs_line, = plt.plot(abs_error, label='Absolute Error')
        for xc in update_times:
            plt.axvline(x=xc, color='red', label='Update Times', linestyle='dashed')
        plt.legend(handles=[abs_line,rel_line])
        plt.xticks(range(0, len(workload), round(len(workload)/5)))
    
    if to_return == "pd":
        # hacky fix: remove the first synthetic database to keep length of lists consistent with the
        # other lists that comprise of the pandas dataframe
        x_list.pop(0).tolist() 
        d = {
            'algo_ans': pmw_answers,
            'real_ans': real_ans.tolist(),
            'queries': workload.tolist(), 
            'updated': update_list,
            'abs_error': abs_error,               
            'rel_error': rel_error,
            'synthetic database': x_list,
            'analyst': analyst_labels,
            'd_t_hat': d_t_hat_list, 

             }
        test_data = pd.DataFrame(data=d)
        #test_data = test_data.round(3)
        return test_data
    
    if to_return == "error":
        d = {'analyst': analyst_labels,
             'abs_error': abs_error,               
             'rel_error': rel_error,}
        data = pd.DataFrame(data=d)
        data = data.round(3)
        
        analyst_error = {}
        for analyst in list(sorted(analyst_labels)):
            analyst_error[analyst] = data[data.analyst==analyst]['abs_error'].sum()
        return analyst_error
    
#pmw_naive(np.vstack((online_workloads.identity(5), online_workloads.identity(5), online_workloads.identity(5))), 
#     x_example, ['A'] * 5 + ['B'] * 5 + ['C'] * 5, eps=1, T=40) #, total_k = 5)


In [129]:
pmw_naive(fiftyq, 
     x_example, ['A'] * 50, eps=1, T=400) #, total_k = 5)

total_k=16


Unnamed: 0,algo_ans,real_ans,queries,updated,abs_error,rel_error,synthetic database,analyst,d_t_hat
0,0.065281,0.066667,"[1.0, 0.0, 0.0, 0.0, 0.0]",yes,0.001386,0.020788,"[0.2, 0.2, 0.2, 0.2, 0.2]",A,-2020.787777
1,0.130961,0.133333,"[0.0, 1.0, 0.0, 0.0, 0.0]",yes,0.002372,0.01779,"[0.2, 0.2, 0.2, 0.2, 0.2]",A,-1035.579166
2,0.2,0.2,"[0.0, 0.0, 1.0, 0.0, 0.0]",no,0.0,0.0,"[0.2, 0.2, 0.2, 0.2, 0.2]",A,33.327986
3,0.263145,0.266667,"[0.0, 0.0, 0.0, 1.0, 0.0]",yes,0.003522,0.013207,"[0.2, 0.2, 0.2, 0.2, 0.2]",A,947.170819
4,0.333735,0.333333,"[0.0, 0.0, 0.0, 0.0, 1.0]",yes,0.000402,0.001206,"[0.2, 0.2, 0.2, 0.2, 0.2]",A,2006.027687
5,0.067475,0.066667,"[1.0, 0.0, 0.0, 0.0, 0.0]",yes,0.000809,0.01213,"[0.2, 0.2, 0.2, 0.2, 0.2]",A,-1987.870477
6,0.128778,0.133333,"[0.0, 1.0, 0.0, 0.0, 0.0]",yes,0.004555,0.034162,"[0.2, 0.2, 0.2, 0.2, 0.2]",A,-1068.324915
7,0.2,0.2,"[0.0, 0.0, 1.0, 0.0, 0.0]",no,0.0,0.0,"[0.2, 0.2, 0.2, 0.2, 0.2]",A,-29.444815
8,0.267265,0.266667,"[0.0, 0.0, 0.0, 1.0, 0.0]",yes,0.000599,0.002245,"[0.2, 0.2, 0.2, 0.2, 0.2]",A,1008.981228
9,0.334697,0.333333,"[0.0, 0.0, 0.0, 0.0, 1.0]",yes,0.001364,0.004092,"[0.2, 0.2, 0.2, 0.2, 0.2]",A,2020.458194


In [17]:
# try identity on the race database
pmw_naive(np.vstack((online_workloads.identity(86), online_workloads.identity(86))),
     x_race, ['A'] * 172, eps=10, T=40, total_k = 86, show_messages=True)

Original database: [412 333 285 231 202 174 160 142 146 149 145 181 174 190 213 287 372 499
 619 715 785 821 822 816 799 742 717 697 658 593 564 519 447 403 388 365
 336 306 311 289 261 231 213 196 194 170 175 168 149 142 131 119 112 118
 114 116 112 114 106 111 109 112 113 109 104 108 108  94  91  81  81  72
  68  63  56  46  41  38  34  28  23  22  18  18  16  41]

Normalized database: [0.01893992 0.01530823 0.01310164 0.01061922 0.00928608 0.0079989
 0.00735531 0.00652784 0.00671172 0.00684963 0.00666575 0.00832069
 0.0079989  0.00873443 0.00979175 0.01319358 0.01710109 0.02293936
 0.02845585 0.03286903 0.03608698 0.03774192 0.03778789 0.03751207
 0.03673057 0.03411024 0.03296097 0.03204156 0.0302487  0.02726061
 0.02592746 0.02385878 0.02054889 0.01852618 0.01783662 0.01677929
 0.01544615 0.01406703 0.01429688 0.01328552 0.01199835 0.01061922
 0.00979175 0.00901025 0.00891831 0.00781501 0.00804487 0.00772307
 0.00684963 0.00652784 0.00602216 0.00547051 0.00514872 0.00542454
 0.0052

Unnamed: 0,algo_ans,real_ans,queries,updated,abs_error,rel_error,synthetic database,analyst,d_t_hat
0,0.021008,0.018940,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",yes,0.002068,0.109193,"[0.0, 0.012, 0.012, 0.012, 0.012, 0.012, 0.012...",A,204.045742
1,0.015685,0.015308,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",yes,0.000377,0.024611,"[0.0, 0.0, 0.012, 0.012, 0.012, 0.012, 0.012, ...",A,80.159556
2,0.013293,0.013102,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",yes,0.000192,0.014633,"[0.0, 0.0, 0.0, 0.012, 0.012, 0.012, 0.012, 0....",A,28.134375
3,0.010099,0.010619,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",yes,0.000520,0.048948,"[0.0, 0.0, 0.0, 0.0, 0.012, 0.012, 0.012, 0.01...",A,-41.342902
4,0.012000,0.009286,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",no,0.002714,0.292257,"[0.0, 0.0, 0.0, 0.0, 0.012, 0.012, 0.012, 0.01...",A,-9.094051
...,...,...,...,...,...,...,...,...,...
167,0.001000,0.001011,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",no,0.000011,0.011227,"[0.0, 0.0, 0.0, 0.0, 0.002, 0.0, 0.0, 0.0, 0.0...",A,0.247000
168,0.001000,0.000827,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",no,0.000173,0.208500,"[0.0, 0.0, 0.0, 0.0, 0.002, 0.0, 0.0, 0.0, 0.0...",A,-3.753000
169,0.001000,0.000827,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",no,0.000173,0.208500,"[0.0, 0.0, 0.0, 0.0, 0.002, 0.0, 0.0, 0.0, 0.0...",A,-3.753000
170,0.002000,0.000736,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",no,0.001264,1.719125,"[0.0, 0.0, 0.0, 0.0, 0.002, 0.0, 0.0, 0.0, 0.0...",A,-27.506000


In [18]:
x_race_first_ten / sum(x_race_first_ten)

array([0.18442256, 0.14905998, 0.12757386, 0.10340197, 0.09042077,
       0.0778872 , 0.07162041, 0.06356312, 0.06535363, 0.06669651])

In [19]:
#Updated Database = [0.108 0.104 0.101 0.099 0.099 0.098 0.098 0.098 0.097 0.097]


In [20]:
x = np.array([1000, 2000, 3000, 4000, 5000])
x / sum(x)

array([0.06666667, 0.13333333, 0.2       , 0.26666667, 0.33333333])

In [21]:
pmw_naive(fiftyq, 
     x_example, ['A'] * 50, eps=10, T=40, total_k = 25)

Unnamed: 0,algo_ans,real_ans,queries,updated,abs_error,rel_error,synthetic database,analyst,d_t_hat
0,0.066633,0.066667,"[1.0, 0.0, 0.0, 0.0, 0.0]",yes,3.4e-05,0.000511,"[0.002, 0.249, 0.249, 0.249, 0.249]",A,-2000.510952
1,0.132606,0.133333,"[0.0, 1.0, 0.0, 0.0, 0.0]",yes,0.000727,0.005455,"[0.003, 0.003, 0.331, 0.331, 0.331]",A,-1745.910291
2,0.199699,0.2,"[0.0, 0.0, 1.0, 0.0, 0.0]",yes,0.000301,0.001507,"[0.004, 0.004, 0.004, 0.493, 0.493]",A,-1969.521084
3,0.266221,0.266667,"[0.0, 0.0, 0.0, 1.0, 0.0]",yes,0.000446,0.001671,"[0.008, 0.008, 0.008, 0.008, 0.968]",A,-3401.683237
4,0.333232,0.333333,"[0.0, 0.0, 0.0, 0.0, 1.0]",yes,0.000101,0.000304,"[0.208, 0.208, 0.208, 0.208, 0.168]",A,-9521.519608
5,0.066681,0.066667,"[1.0, 0.0, 0.0, 0.0, 0.0]",yes,1.5e-05,0.000219,"[0.002, 0.262, 0.262, 0.262, 0.212]",A,-2119.781456
6,0.132514,0.133333,"[0.0, 1.0, 0.0, 0.0, 0.0]",yes,0.000819,0.006143,"[0.003, 0.003, 0.354, 0.354, 0.286]",A,-1942.286346
7,0.20009,0.2,"[0.0, 0.0, 1.0, 0.0, 0.0]",yes,9e-05,0.000451,"[0.005, 0.005, 0.005, 0.545, 0.441]",A,-2308.647916
8,0.26664,0.266667,"[0.0, 0.0, 0.0, 1.0, 0.0]",yes,2.6e-05,9.9e-05,"[0.011, 0.011, 0.011, 0.009, 0.958]",A,-4175.39498
9,0.333242,0.333333,"[0.0, 0.0, 0.0, 0.0, 1.0]",yes,9.1e-05,0.000274,"[0.227, 0.227, 0.227, 0.186, 0.133]",A,-9371.369698


In [22]:
#array([0.06666667, 0.13333333, 0.2       , 0.26666667, 0.33333333])

In [23]:
x_race / sum(x_race)

array([0.01893992, 0.01530823, 0.01310164, 0.01061922, 0.00928608,
       0.0079989 , 0.00735531, 0.00652784, 0.00671172, 0.00684963,
       0.00666575, 0.00832069, 0.0079989 , 0.00873443, 0.00979175,
       0.01319358, 0.01710109, 0.02293936, 0.02845585, 0.03286903,
       0.03608698, 0.03774192, 0.03778789, 0.03751207, 0.03673057,
       0.03411024, 0.03296097, 0.03204156, 0.0302487 , 0.02726061,
       0.02592746, 0.02385878, 0.02054889, 0.01852618, 0.01783662,
       0.01677929, 0.01544615, 0.01406703, 0.01429688, 0.01328552,
       0.01199835, 0.01061922, 0.00979175, 0.00901025, 0.00891831,
       0.00781501, 0.00804487, 0.00772307, 0.00684963, 0.00652784,
       0.00602216, 0.00547051, 0.00514872, 0.00542454, 0.00524066,
       0.0053326 , 0.00514872, 0.00524066, 0.00487289, 0.00510274,
       0.0050108 , 0.00514872, 0.00519469, 0.0050108 , 0.00478095,
       0.00496483, 0.00496483, 0.00432124, 0.00418333, 0.00372362,
       0.00372362, 0.00330989, 0.00312601, 0.00289615, 0.00257

In [24]:
W_lst = [online_workloads.identity(n), online_workloads.total(n), online_workloads.race1(), online_workloads.race2(), online_workloads.race3(), online_workloads.custom(n), online_workloads.prefix_sum(n)]


In [25]:
online_workloads.identity(86)

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [26]:
pmw_naive(fiftyq, 
     x_example, ['A'] * 50, eps=.1, T=40, total_k = 25)

Unnamed: 0,algo_ans,real_ans,queries,updated,abs_error,rel_error,synthetic database,analyst,d_t_hat
0,0.0,0.066667,"[1.0, 0.0, 0.0, 0.0, 0.0]",yes,0.066667,1.0,"[0.002, 0.249, 0.249, 0.249, 0.249]",A,-3389.43519
1,0.167701,0.133333,"[0.0, 1.0, 0.0, 0.0, 0.0]",yes,0.034367,0.257755,"[0.003, 0.003, 0.331, 0.331, 0.331]",A,-1219.489063
2,0.223998,0.2,"[0.0, 0.0, 1.0, 0.0, 0.0]",yes,0.023998,0.119992,"[0.004, 0.004, 0.004, 0.493, 0.493]",A,-1605.023515
3,0.23778,0.266667,"[0.0, 0.0, 0.0, 1.0, 0.0]",yes,0.028886,0.108323,"[0.008, 0.008, 0.008, 0.008, 0.969]",A,-3828.292576
4,0.272937,0.333333,"[0.0, 0.0, 0.0, 0.0, 1.0]",yes,0.060396,0.181189,"[0.209, 0.209, 0.209, 0.209, 0.164]",A,-10440.944403
5,0.064022,0.066667,"[1.0, 0.0, 0.0, 0.0, 0.0]",yes,0.002644,0.039666,"[0.002, 0.264, 0.264, 0.264, 0.207]",A,-2174.665721
6,0.144777,0.133333,"[0.0, 1.0, 0.0, 0.0, 0.0]",yes,0.011444,0.085829,"[0.003, 0.003, 0.357, 0.357, 0.28]",A,-1788.342282
7,0.184717,0.2,"[0.0, 0.0, 1.0, 0.0, 0.0]",yes,0.015283,0.076413,"[0.005, 0.005, 0.005, 0.553, 0.433]",A,-2584.237587
8,0.263857,0.266667,"[0.0, 0.0, 0.0, 1.0, 0.0]",yes,0.002809,0.010535,"[0.011, 0.011, 0.011, 0.01, 0.957]",A,-4337.138371
9,0.35897,0.333333,"[0.0, 0.0, 0.0, 0.0, 1.0]",yes,0.025637,0.076911,"[0.222, 0.222, 0.222, 0.202, 0.132]",A,-8970.442864


PMW independent

Initialize an instance of PMW for each analyst with alpha =T and their share of the privacy budget
Analysts don’t share anything - PB or Synthetic database
Each analyst is only allowed to query their instance of PMW

Naive PMW

Initialize an instance of PMW with alpha = T and the whole privacy budget 
All analysts sharing everything - privacy budget and synthetic database
Allow every analyst to query that instance of PMW

Split PMW

Initialize a single instance of PMW for each analyst with alpha = T and the entire privacy budget
Split the update steps proportionally to each analyst based on their share of the privacy budget
There exists cases where some analysts have more privacy budgets than others - i.e. alice owns 50 percent of the data
The difference between split and PMW - is that in Split, everyone shares a synthetic database
Inference steps are infamously non-monotonic
Allow any analyst to answer from the PMW instance and only allow them to cause an update step if they own any unused update steps


In [27]:
W_lst = [online_workloads.identity(n), online_workloads.total(n), online_workloads.race1(), online_workloads.race2(), online_workloads.race3(), online_workloads.custom(n), online_workloads.prefix_sum(n)]


In [28]:
online_workloads.identity(n)

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [106]:
k_max = 20
import random
# random item from list
num_analysts = random.choice(range(2, k_max + 1))

Ws = 

for analyst in num_analysts: 
    # stack analysts
    

4
