In [56]:
import numpy as np
import pandas as pd
import math

In [58]:
# Todo: Add code to make epsilon adjustable proportionally. 
# Todo: Fix divide by zero error (<ipython-input-51-46063bd03d8a>:37: RuntimeWarning: divide by zero encountered in double_scalars delta = 1 / (n * math.log(n, np.e)))

def pmw_split(workload, x, T, eps=0.01, k=0, analyst_labels, 
         show_messages=True, to_return='pd', show_plot=False, show_failure_step=True):
    """
    Implement Private Multiplicative Weights Mechanism (PMW) on a workload of
    linear queries. 

    Algorithm Parameters: 
    - workload = workload of queries (M x k numpy array)
    - x = true database (M x 1 numpy array)
    - T = update threshold
    - eps = privacy budget
    - k = number of update steps PER ANALYST
    - analyst_labels = list of analyst names corresponding to each query in the workload
    
    Output Controls: 
    - show_messages argument determines whether the function will print information such as 
    error scale, threshold, update steps used, etc.
    - to_return argument determines what the function will return. 
        - if 'pd', pmw() returns pandas df with test data for each 
        query in the workload(showing query, d_t_hat, updated, algo_ans, real_ans, 
        abs_error, rel_error). 
        - if 'update_count', pmw() returns the update count for the total
        amount of queries.
    - show_plot - T/F whether the function will display a plot
    - show_failure_step - T/F whether function prints what step failure mode is reached
    """ 
    
    update_steps = {}
    for analyst in list(set(analyst_labels)): 
        update_steps[analyst] = k # each analyst starts with k update steps
    
    # initialize constants
    m = x.size  # database len
    n = x.sum()
    eta = (math.log(m, np.e) ** (1 / 4)) / (math.sqrt(n))
    delta = 1 / (n * math.log(n, np.e))
    x_norm = x / np.sum(x)
    
    # initialize synthetic databases at time 0 (prior to any queries)
    x_t = np.ones(m) / m
    y_t = np.ones(m) / m

    # initialize tracker lists to construct pandas dataframe at the end 
    x_list = [x_t] # create a list of x_t synthetic database at every time step
    update_list = []
    update_count = 0
    pmw_answers = []
    update_times = [] # record times that database is updated
    d_t_hat_list = []
    
    def lazy_round():
        """
        "Lazy Round" of querying using the stored synthetic database, x_t, in list x_list.
        
        We call this the lazy round because it is contrasted with the updated step where we update the 
        sythetic database and answer the query using the real database.
        """
        update_list.append('no')
        pmw_answers.append(np.dot(query, x_list[time]))
        x_list.append(x_list[time].round(3))
    
    # inititate first instance of SVT with half the budget and k updates; will be reset in the main loop
    SVTtrigger = False 
    SVTepsilon1 = ((eps/2)/2)
    SVTepsilon2 = ((eps/2)/2)
    rho = np.random.laplace(loc=0, scale=(1/SVTepsilon1), size=1)[0]
    
    for time, query in enumerate(workload):
        
        analyst = analyst_labels[time]
        
        # Do one round of sparse vector technique 
        
        # Compute noisy answer by adding Laplacian noise
        a_t = np.random.laplace(loc=0, scale=(2*k/SVTepsilon2), size=1)[0]
        a_t_hat = (np.dot(query, x_norm)*n ) + a_t

        # Difference between noisy and maintained histogram answer
        d_t_hat = a_t_hat - (n*np.dot(query, x_list[time]))
        
        # Lazy round: use synthetic base to answer the query
        if (abs(d_t_hat) <= T + rho):
            d_t_hat_list.append(d_t_hat)
            lazy_round()
            continue

        # update round: update histogram and return noisy answer
        else:
            #make a new noisy query answer using some of the leftover budget
            a_t = np.random.laplace(loc=0, scale=(2*k/eps), size=1)[0]
            a_t_hat = (np.dot(query, x_norm)*n ) + a_t
            d_t_hat = a_t_hat - (n*np.dot(query, x_list[time]))
            d_t_hat_list.append(d_t_hat)
            update_times.append(time)
            
            # step a
            if d_t_hat < 0:
                r_t = query
            else:
                r_t = np.ones(m) - query
            for i, v in enumerate(y_t):
                y_t[i] = x_list[time][i] * math.exp((d_t_hat/(2*n)) * query[i]) * 20 # 20 is the learning rate
            
            # step b
            x_t = y_t / np.sum(y_t)
            update_count = update_list.count('yes')
            
            # if threshold for num updates is reached, just do a lazy round (synthetic database) answer
            if update_steps[analyst] == 0: 
                if show_failure_step:
                    print(f'Failure mode reached at query number {time}: {query}')
                lazy_round()
                
            # if there are still update steps that the analyst can use, 
            # 1. update the synthetic database
            # 2. answer the query using the noisy answer from the database itself 
            else: 
                x_list.append(x_t.round(3))
                update_list.append('yes') # increment number of updates counter
                pmw_answers.append(a_t_hat / np.sum(x))
                update_steps[analyst] -= 1 # use one of analyst's update steps

    update_count = update_list.count('yes')      

    # calculate error
    real_ans = np.matmul(workload, x_norm)
    abs_error = np.abs(pmw_answers - real_ans)
    rel_error = np.abs(abs_error / np.where(real_ans == 0, 0.000001,
                                                real_ans))
    
    if show_messages:
        np.set_printoptions(suppress=True)
        """Print inputes/outputs to analyze each query"""
        print(f'Original database: {x}\n')
        print(f'Normalized database: {x_norm}\n')
        print(f'Updated Database = {x_t}\n')
        print(f'Update Count = {update_count}\n')
        print(f'{T=}\n')
        print(f'Error Scale Query Answer= {2*((2*k/eps)**2)}\n')
        print(f'Error Scale SVT= {2*((2*k/SVTepsilon2)**2)}\n')
        print(f'Update Parameter Scale = {eta}\n')
        print(f'{delta=}\n')
        
    if show_plot: 
        plt.title('Error across queries:')
        rel_line, = plt.plot(rel_error, label='Relative Error')
        abs_line, = plt.plot(abs_error, label='Absolute Error')
        for xc in update_times:
            plt.axvline(x=xc, color='red', label='Update Times', linestyle='dashed')
        plt.legend(handles=[abs_line,rel_line])
        plt.xticks(range(0, len(workload), round(len(workload)/5)))
    
    if to_return == "pd":
        # hacky fix: remove the first synthetic database to keep length of lists consistent with the
        # other lists that comprise of the pandas dataframe
        x_list.pop(0).tolist() 
        d = {
            'algo_ans': pmw_answers,
            'real_ans': real_ans.tolist(),
            'queries': workload.tolist(), 
            'updated': update_list,
            'abs_error': abs_error,               
            'rel_error': rel_error,
            'synthetic database': x_list,
            'analyst': analyst_labels,
            'd_t_hat': d_t_hat_list, 

             }
        test_data = pd.DataFrame(data=d)
        test_data = test_data.round(3)
        return test_data
    
    # return dictionary of absolute errors
    if to_return == "error":
        d = {'analyst': analyst_labels,
             'abs_error': abs_error,               
             'rel_error': rel_error,}
        data = pd.DataFrame(data=d)
        data = data.round(3)
        
        analyst_error = {}
        for analyst in list(set(analyst_labels)):
            analyst_error[analyst] = data[data.analyst==analyst]['abs_error'].sum()
        return analyst_error

SyntaxError: non-default argument follows default argument (<ipython-input-58-4930b2d6219e>, line 2)

In [63]:
# pmw_independent: write pmw for one person. 
# create wrapper function called pmw_independent() that takes in the workloads and workload labels. Run PMW for each analyst, separate their workloads based on analysts. 

database=np.array([.2, .8])

def pmw_independent(database):

    analyst_labels = ['Alice', 'Bob', 'Alice', 'Bob']
    w = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])

    indices = {} # k: analyst, v: row indices of queries in the workloads
    for i, analyst in enumerate(analyst_labels):
        if analyst not in indices.keys(): 
            indices[analyst] = []
        indices[analyst].append(i)

    workloads = {} # k: analyst, v: the analyst's workload
    for analyst in indices.keys():
        workloads[analyst] = w[indices['Alice'], :]

    for analyst in workloads.keys():
        print(i)
        pmw_split(workload=workloads[analyst], x=database, T=5, analyst_labels=[analyst]*len(workloads[analyst]))
        
pmw_independent(database)
    

3
Original database: [0.2 0.8]

Normalized database: [0.2 0.8]

Updated Database = [0.5 0.5]

Update Count = 0

T=5

Error Scale Query Answer= 0.0

Error Scale SVT= 0.0

Update Parameter Scale = 0.9124443057840286

delta=inf

3
Failure mode reached at query number 0: [1 2]
Failure mode reached at query number 1: [5 6]
Original database: [0.2 0.8]

Normalized database: [0.2 0.8]

Updated Database = [0.46257015 0.53742985]

Update Count = 0

T=5

Error Scale Query Answer= 0.0

Error Scale SVT= 0.0

Update Parameter Scale = 0.9124443057840286

delta=inf



  delta = 1 / (n * math.log(n, np.e))


In [7]:
# pmw_naive
# write version of PMW where analysts can run out of privacy budget if they use too much of others' budgets
# 

PMW independent

Initialize an instance of PMW for each analyst with alpha =T and their share of the privacy budget
Analysts don’t share anything - PB or Synthetic database
Each analyst is only allowed to query their instance of PMW

Naive PMW

Initialize an instance of PMW with alpha = T and the whole privacy budget 
All analysts sharing everything - privacy budget and synthetic database
Allow every analyst to query that instance of PMW

Split PMW

Initialize a single instance of PMW for each analyst with alpha = T and the entire privacy budget
Split the update steps proportionally to each analyst based on their share of the privacy budget
There exists cases where some analysts have more privacy budgets than others - i.e. alice owns 50 percent of the data
The difference between split and PMW - is that in Split, everyone shares a synthetic database
Inference steps are infamously non-monotonic
Allow any analyst to answer from the PMW instance and only allow them to cause an update step if they own any unused update steps
