# DAY 11 [18.3%]

# Create a Differentially Private Query - Project

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

In [2]:
# Seed every random number generator the notebook draws from:
# NumPy supplies the Laplace noise, PyTorch supplies the random databases.
np.random.seed(5)
torch.manual_seed(5)

In [4]:
def create_databases(db, num_databases = 5000):
    """
    Build the "parallel" databases of db: copies of db that each have
    exactly one row (element) removed.

    params:
    ------

    db            -- Base database (a tensor, sliced along its first dimension).
    num_databases -- Maximum number of parallel databases to create.
                     Capped at len(db), because there is no row left to
                     remove beyond that index.

    returns
    -------
    List of databases; entry k is db without its k-th element.
    """
    # For k >= len(db) the slices db[:k] + db[k+1:] reproduce db unchanged,
    # which is not a parallel database, so never go past len(db).
    count = min(num_databases, len(db))
    return [torch.cat([db[:k], db[k + 1:]]) for k in range(count)]

In [5]:
def create_db_and_parallels(num_entries):
    """
    Draw a random boolean database and build its parallel databases.

    params:
    ------
    num_entries  -- Number of entries in the random database.

    returns:
    -------
    db           -- Boolean tensor, True where a uniform draw exceeded 0.5.
    databases    -- Result of create_databases(db, num_entries).
    """
    uniform_draws = torch.rand(num_entries)
    db = uniform_draws > 0.5
    return db, create_databases(db, num_entries)

In [6]:
def create_dbs(num_entries):
    """
    Create a random boolean database and a noised copy of it built with
    two coin flips per entry.

    For every entry a first coin is flipped: where it lands 1 the true
    value is kept, where it lands 0 the entry is replaced by the outcome
    of a second, independent coin flip.

    params:
    ------
    num_entries    -- Total samples.

    returns:
    -------
    db             -- Original database (boolean tensor).
    db_            -- Noised database (float tensor of 0.0 / 1.0 values).
    """
    db = torch.rand(num_entries) > 0.5
    # 1.0 -> keep the true value, 0.0 -> answer with the second coin.
    first_coin_flip = (torch.rand(len(db)) > .5).float()
    second_coin_flip = (torch.rand(len(db)) > .5).float()
    db_ = db.float() * first_coin_flip + (1 - first_coin_flip) * second_coin_flip
    return db, db_

In [13]:
def threshold_query(db, threshold = 5):
    """
    Report whether the sum of db is strictly greater than threshold.

    Returns a float tensor holding 1.0 when db.sum() > threshold and 0.0
    otherwise. The comparison is cast to float so that callers can do
    arithmetic on the result (e.g. take the difference of two results).
    """
    exceeds = db.sum().gt(threshold)
    return exceeds.float()

In [14]:
def sensitivity(query, n_entries):
    """
    Empirically measure the L1 sensitivity of query on a random database,
    printing the distance obtained for every parallel database.

    params:
    ------
    query      -- Function applied to the database and to each parallel database.
    n_entries  -- Number of entries in the random database.

    returns:
    -------
    The largest |query(parallel_db) - query(db)| observed. Stays the
    integer 0 if no distance was greater than zero.
    """
    # Random database plus every copy of it that has one row removed.
    db, parallel_dbs = create_db_and_parallels(n_entries)
    baseline = query(db)
    largest = 0
    for position, parallel_db in enumerate(parallel_dbs, start=1):
        # L1 distance between the query on the full and on the reduced db.
        distance = torch.abs(query(parallel_db) - baseline)
        # max() keeps the current value unless distance is strictly greater.
        largest = max(largest, distance)
        print('For pdb{}, the sensitivity is {}'.format(position, distance))
    return largest

In [15]:
def global_privacy(query, n_entries, epsilon):
    """
    Run query on a fresh random database and release the answer with
    Laplace noise added to the query output (global differential privacy).

    params:
    ------
    query      -- Function mapping a database tensor to a scalar tensor.
    n_entries  -- Number of entries in the random database.
    epsilon    -- Privacy budget; must be > 0. The noise scale is
                  sensitivity / epsilon, so smaller values mean more noise.

    returns:
    -------
    The noisy query result (true result + Laplace noise). The true and
    noisy results are also printed.

    raises:
    ------
    ValueError -- If epsilon is not strictly positive.
    """
    # epsilon == 0 would divide by zero and a negative epsilon would give a
    # negative (invalid) Laplace scale, so reject both up front.
    if epsilon <= 0:
        raise ValueError('epsilon must be positive, got {}'.format(epsilon))
    # Create the database and measure the query's sensitivity on it.
    sensitivity, db = calculate_sensitivity(query, n_entries)
    # Scale (b) of the Laplace distribution; float() accepts both the
    # 0-dim tensor and the plain 0 that calculate_sensitivity may return.
    noise_scale = float(sensitivity) / epsilon
    laplacian_noise = np.random.laplace(0, noise_scale)
    # Run the query once and reuse the answer for both printed values.
    result = query(db)
    noise_result = result + laplacian_noise
    print('Original query: {}'.format(result))
    print('Query with laplacian noise: {}'.format(noise_result))
    return noise_result

In [16]:
def sum_query(db):
    """
    Return the sum of all entries of db as a float tensor.

    The cast to float lets callers do arithmetic on the result
    (e.g. subtract two query results or add noise).
    """
    total = db.sum()
    return total.float()

In [17]:
def mean_query(db):
    """Return the mean of db's entries, computed after casting db to float."""
    as_float = db.float()
    return as_float.mean()

In [18]:
def create_db_and_parallels(num_entries):
    """
    Generate a random boolean database with num_entries entries together
    with its parallel databases.

    NOTE: this name is already defined by an earlier cell of the notebook
    with the same behavior; running this cell simply rebinds it.

    Returns the tuple (db, databases).
    """
    db = torch.rand(num_entries).gt(0.5)
    parallel_dbs = create_databases(db, num_entries)
    return db, parallel_dbs

In [19]:
def calculate_sensitivity(query, n_entries):
    """
    Empirically measure the L1 sensitivity of query on a random database.

    params:
    ------
    query      -- Function applied to the database and to each parallel database.
    n_entries  -- Number of entries in the random database.

    returns:
    -------
    sensitivity -- Largest |query(parallel_db) - query(db)| observed. Stays
                   the integer 0 if no distance was greater than zero.
    db          -- The random database the measurement was made on.
    """
    # Random database plus every copy of it that has one row removed.
    db, parallel_dbs = create_db_and_parallels(n_entries)
    baseline = query(db)
    largest = 0
    for parallel_db in parallel_dbs:
        # L1 distance between the query on the full and on the reduced db.
        distance = torch.abs(query(parallel_db) - baseline)
        if distance > largest:
            largest = distance
    return largest, db

In [20]:
# Measure the sum query's sensitivity on a random 10-entry database; one line
# is printed per parallel database and the largest distance is the cell output.
sensitivity(sum_query, n_entries = 10)

For pdb1, the sensitivity is 1.0
For pdb2, the sensitivity is 1.0
For pdb3, the sensitivity is 1.0
For pdb4, the sensitivity is 1.0
For pdb5, the sensitivity is 1.0
For pdb6, the sensitivity is 0.0
For pdb7, the sensitivity is 1.0
For pdb8, the sensitivity is 1.0
For pdb9, the sensitivity is 0.0
For pdb10, the sensitivity is 1.0


tensor(1.)

In [21]:
# Privacy budget for the Laplace mechanism. Epsilon is not a percentage of
# the information that leaks: it bounds how much any single row can change
# the distribution of the released answer.
# The lower the epsilon, the greater the noise (noise scale = sensitivity / epsilon).
# The total budget can also be a larger number such as 5; in that case one
# assigns a portion of it to each query.
# NOTE(review): the calls in the visible cells pass epsilon as a literal, so
# this variable is not read by them.
epsilon = 0.5

In [22]:
# Sum query with a moderate budget: the Laplace noise scale is sensitivity / 0.5.
global_privacy(sum_query, n_entries = 100, epsilon = 0.5)

Original query: 46.0
Query with laplacian noise: 44.376075744628906


In [23]:
# Same query with a tiny budget: the noise scale sensitivity / 1e-5 is huge,
# so the noise dominates the true sum. Each call draws a fresh random
# database, so the original value differs from the previous cell.
global_privacy(sum_query, n_entries = 100, epsilon = 0.00001)

Original query: 48.0
Query with laplacian noise: 135320.265625


In [24]:
# Mean query with a moderate budget. Removing one of 100 rows moves a mean
# far less than it moves a sum, so the measured sensitivity (and therefore
# the noise) is much smaller here.
global_privacy(mean_query, n_entries = 100, epsilon = 0.5)

Original query: 0.4699999988079071
Query with laplacian noise: 0.4605430066585541


In [25]:
# Mean query with a small budget: dividing the sensitivity by 0.001 inflates
# the noise scale enough to swamp the true mean.
global_privacy(mean_query, n_entries = 100, epsilon = 0.001)

Original query: 0.4699999988079071
Query with laplacian noise: 10.188638687133789
