In [1]:
%pip install opendp
import opendp.prelude as dp
dp.enable_features("contrib")

Note: you may need to restart the kernel to use updated packages.


# Début


## Pour un simple float

In [2]:
# Call the constructor to produce the measurement `base_lap`:
# a Laplace measurement with scale 5 over a single float.
base_lap = dp.m.make_laplace(
    dp.atom_domain(T=float),
    dp.absolute_distance(T=float),
    scale=5.
)

# Inspect the spaces the measurement is defined over.
print("input domain:  ", base_lap.input_domain)
print("input metric:  ", base_lap.input_metric)
print("output measure:", base_lap.output_measure)

# Invoking the measurement on a value prints a noisy version of it.
aggregate = 0.
print("noisy aggregate:", base_lap(aggregate))

# The privacy map converts an input distance into a privacy loss;
# with d_in = 10 and scale = 5 the recorded output is epsilon = 2.0.
absolute_distance = 10.
print("epsilon:", base_lap.map(d_in=absolute_distance))

input domain:   AtomDomain(T=f64)
input metric:   AbsoluteDistance(f64)
output measure: MaxDivergence(f64)
noisy aggregate: 2.3433163220941946
epsilon: 2.0


## Pour un vecteur de float

In [None]:
# Call the constructor to produce the transformation `bounded_sum`.
# Notice that `make_sum` expects an input domain consisting of bounded data:

input_domain = dp.vector_domain(dp.atom_domain(bounds=(0., 5.)))
bounded_sum = dp.t.make_sum(input_domain, dp.symmetric_distance())

# Under the condition that the input data is a member of the input domain
# (every value below lies within the bounds [0, 5])...
bounded_mock_dataset = [1.3, 3.8, 0., 5.]
print("The exact sum is :",bounded_sum(bounded_mock_dataset))

# Under the condition that one individual may contribute up to 2 records to `bounded_mock_dataset`...
# (`max_contributions` is reused by later cells)
max_contributions = 2

# ...then the sensitivity, expressed in terms of the absolute distance, is
# the cell's result (the recorded output is approximately 10):
bounded_sum.map(d_in=max_contributions)

The exact sum is : 10.1


10.000000046566134

In [4]:
# Input space for unbounded float vectors.
input_domain = dp.vector_domain(dp.atom_domain(T=float))
input_metric = dp.symmetric_distance()

# Call the constructor to produce the transformation `clamp`
clamp = dp.t.make_clamp(input_domain, input_metric, bounds=(0., 5.))

# `clamp` expects vectors of non-null, unbounded elements
mock_dataset = [1.3, 7.8, -2.5, 7.0]

# `clamp` emits data that is suitable for `bounded_sum`
# NOTE(review): the result of this call is discarded; only the last
# expression of a cell is displayed.
clamp(mock_dataset)

# Chain the pieces: clamp -> bounded sum -> Laplace noise.
# `bounded_sum` and `base_lap` come from earlier cells.
dp_sum = clamp >> bounded_sum >> base_lap

# Compute the DP sum of a dataset of bounded elements
print("DP sum:", dp_sum(mock_dataset))

# Evaluate the privacy loss of the dp_sum, when an individual can contribute at most 2 records
# (`max_contributions` is defined in an earlier cell)
print("epsilon:", dp_sum.map(d_in=max_contributions))

DP sum: 0.7429862245692711
epsilon: 2.000000009313227


## De manière plus compacte

In [5]:
# Establish public info
max_contributions = 2
bounds = (0., 5.)

# Construct the measurement

input_domain = dp.vector_domain(dp.atom_domain(T=float))
input_metric = dp.symmetric_distance()

# First form: every constructor is given its input space explicitly.
dp_sum = (
    dp.t.make_clamp(input_domain, input_metric, bounds) >>
    dp.t.make_sum(dp.vector_domain(dp.atom_domain(bounds=bounds)), input_metric) >>
    dp.m.make_laplace(dp.atom_domain(T=float), dp.absolute_distance(T=float), 5.)
)
# Or: start from the (domain, metric) pair and chain the `then_*` constructors.
# This assignment replaces the `dp_sum` built just above.
dp_sum = (
    (input_domain, input_metric) >>
    dp.t.then_clamp(bounds) >>
    dp.t.then_sum() >>
    dp.m.then_laplace(5.)
)

# Evaluate the privacy expenditure and make a DP release
mock_dataset = [0.7, -0.3, 1., -1.]

print("epsilon:", dp_sum.map(max_contributions))
print("DP sum release:", dp_sum(mock_dataset))

epsilon: 2.000000009313227
DP sum release: 1.362964970783503


## Cas d'une moyenne

In [6]:
# Establish public info
max_contributions = 2
bounds = (0., 5.)

# Construct the measurement

input_domain = dp.vector_domain(dp.atom_domain(T=float), size=4) # dataset size is known
input_metric = dp.symmetric_distance()

# Chain: clamp to `bounds` -> mean -> Laplace noise with scale 5.
dp_mean = (
    (input_domain, input_metric) >>
    dp.t.then_clamp(bounds) >>
    dp.t.then_mean() >>
    dp.m.then_laplace(5.)
)

# Evaluate the privacy expenditure and make a DP release.
# The mock dataset has 4 elements, matching `size=4` in the input domain.
mock_dataset = [0.7, -0.3, 1., -1.]

print("epsilon:", dp_mean.map(max_contributions))
print("DP mean release:", dp_mean(mock_dataset))

epsilon: 0.2500000000000011
DP mean release: 3.238619515671338


# Quickstart

## À savoir quand on utilise OpenDP sans Polars

Il faut définir l'espace dans lequel on travaille : le type des données (float, integer, etc.) mais aussi le type de distance utilisée (`absolute_distance` si on travaille sur un float, `symmetric_distance` dans le cas d'un vecteur).

Il est recommandé d'utiliser un objet de type `Context`.

Après avoir défini l'espace, on peut choisir la « transformation » que l'on souhaite utiliser : somme, moyenne, quantile,
mais aussi bien d'autres, comme le casting ou l'imputation.

Ensuite, on ajoute la « measurement » (mesure), c'est-à-dire un mécanisme aléatoire : ajout de bruit ou randomized response.

On peut également ajouter d'autres fonctions comme le clamp (= clipping), etc.

## Typical Workflow

1. Identify the Unit of Privacy

In [None]:
d_in = 1 # neighboring data set distance is at most d_in...
input_metric = dp.symmetric_distance() # ...in terms of additions/removals
input_domain = dp.vector_domain(dp.atom_domain(T=float))

# Show the unit of privacy (metric, distance) as the cell's result.
input_metric, d_in

(SymmetricDistance(), 1)


2. Set Privacy Loss Parameters

In [None]:
d_out = 1. # output distributions have distance at most d_out (ε)...
privacy_measure = dp.max_divergence(T="f64") # ...in terms of pure-DP

# Show the privacy loss parameters (measure, distance) as the cell's result.
privacy_measure, d_out

(MaxDivergence(f64), 1.0)


3. Collect Public Information

In [10]:
# Clamping bounds passed to `make_clamp` in the mean query further down.
bounds = (0.0, 100.0)
# Constant passed to `then_resize` in the mean query further down.
imputed_value = 50.0

4. Mediate Access to Data

In [None]:
from random import randint

# Mock private dataset: 100 random integers in [-100, 100], stored as floats.
# No seed is set, so the data differs on every run.
data = [float(randint(-100, 100)) for _ in range(100)]

# Sequential composition over the input space defined in the earlier cells.
# `d_mids` lists three equal shares of `d_out`, one per query.
m_sc = dp.c.make_sequential_composition(
    input_domain=input_domain,
    input_metric=input_metric,
    output_measure=privacy_measure,
    d_in=d_in,
    d_mids=[d_out / 3] * 3,
)

# Call measurement with data to create a queryable:
queryable = m_sc(data)

5. Submit DP Queries

Count (= len(data))

In [None]:
# Transformation that counts the records of the input vector.
count_transformation = (
    dp.t.make_count(input_domain, input_metric)
)

# Sensitivity of the count for the chosen `d_in`.
# NOTE(review): the bare `count_sensitivity` expression below is not the last
# expression of the cell, so it is never displayed.
count_sensitivity = count_transformation.map(d_in)
count_sensitivity

# Search for a Laplace scale such that the chained measurement maps `d_in`
# to a privacy loss of `d_out / 3`.
count_measurement = dp.binary_search_chain(
    lambda scale: count_transformation >> dp.m.then_laplace(scale),
    d_in,
    d_out / 3
)

# Submit the query to the queryable created from the private data.
dp_count = queryable(count_measurement)

3.0000000000000004
9.445721638273584
1
(-8.445721638273584, 10.445721638273584)


Mean

In [None]:
# Clamp to `bounds`, resize the data to the previously released `dp_count`
# (using `imputed_value` as the fill constant), then take the mean.
mean_transformation = (
    dp.t.make_clamp(input_domain, input_metric, bounds) >>
    dp.t.then_resize(size=dp_count, constant=imputed_value) >>
    dp.t.then_mean()
)

# Search for a Laplace scale such that the chained measurement maps `d_in`
# to a privacy loss of `d_out / 3`.
mean_measurement = dp.binary_search_chain(
    lambda scale: mean_transformation >> dp.m.then_laplace(scale), d_in, d_out / 3
)

# Submit the query to the same queryable as the count query.
dp_mean = queryable(mean_measurement)

-18.29622028217176


In [None]:
from functools import lru_cache

# (domain, metric) pair shared by the `make_*_with` helpers below.
# `input_metric` is defined in an earlier cell.
input_space = dp.vector_domain(dp.atom_domain(T=float)), input_metric

@lru_cache(maxsize=None)
def make_count_with(*, epsilon):
    """Build a Laplace-noised count measurement calibrated to `epsilon`.

    The noise scale is chosen by `dp.binary_search_chain`, searching within
    the bounds (0, 10000) for `d_in=1` and `d_out=epsilon`. Results are
    memoized per `epsilon` by `lru_cache`.
    """
    count_query = input_space >> dp.t.then_count()

    def noisy_count(scale):
        # Candidate measurement for a given Laplace scale.
        return count_query >> dp.m.then_laplace(scale=scale)

    return dp.binary_search_chain(
        noisy_count,
        d_in=1,
        d_out=epsilon,
        bounds=(0., 10000.),
    )

@lru_cache(maxsize=None)
def make_mean_with(*, target_size, epsilon):
    """Build a Laplace-noised mean-age measurement calibrated to `epsilon`.

    The pipeline resizes the data to `target_size` rows, clamps to
    `age_bounds`, and takes the mean. The noise scale is chosen by
    `dp.binary_search_chain`, searching within the bounds (0, 10) for
    `d_in=1` and `d_out=epsilon`. Results are memoized per
    (`target_size`, `epsilon`) by `lru_cache`.

    `age_prior` and `age_bounds` are module-level names looked up when the
    function is called.
    """
    # Resize the dataset to length `target_size`:
    #   fewer than `target_size` rows -> fill with the constant `age_prior`;
    #   more than `target_size` rows  -> only keep `target_size` rows.
    resize_step = dp.t.then_resize(size=target_size, constant=age_prior)
    # Clamp age values.
    clamp_step = dp.t.then_clamp(bounds=age_bounds)
    # Compute the mean.
    mean_step = dp.t.then_mean()

    bounded_mean = input_space >> resize_step >> clamp_step >> mean_step

    def noisy_mean(scale):
        # Candidate measurement for a given Laplace scale.
        return bounded_mean >> dp.m.then_laplace(scale=scale)

    return dp.binary_search_chain(
        noisy_mean,
        d_in=1,
        d_out=epsilon,
        bounds=(0., 10.),
    )

@lru_cache(maxsize=None)
def make_sum_with(*, epsilon):
    """Build a Laplace-noised age-sum measurement calibrated to `epsilon`.

    The pipeline clamps to `age_bounds` and sums. The noise scale is chosen
    by `dp.binary_search_chain`, searching within the bounds (0, 1000) for
    `d_in=1` and `d_out=epsilon`. Results are memoized per `epsilon` by
    `lru_cache`.

    `age_bounds` is a module-level name looked up when the function is called.
    """
    # Clamp age values, then sum them.
    clamp_step = dp.t.then_clamp(bounds=age_bounds)
    sum_query = input_space >> clamp_step >> dp.t.then_sum()

    def noisy_sum(scale):
        # Candidate measurement for a given Laplace scale.
        return sum_query >> dp.m.then_laplace(scale=scale)

    return dp.binary_search_chain(
        noisy_sum,
        d_in=1,
        d_out=epsilon,
        bounds=(0., 1000.),
    )

In [None]:
# Define parameters up-front
# Each parameter is either a guess, a DP release, or public information
var_names = ["age", "sex", "educ", "race", "income", "married"] # public information
age_bounds = (0., 120.) # an educated guess
age_prior = 38. # average age for entire US population (public information)
size = 1000 # records in dataset, public information

# Load data
import opendp.prelude as dp
import numpy as np
age = np.genfromtxt(dp.examples.get_california_pums_path(), delimiter=',', names=var_names)[:]['age'].tolist() # type: ignor

In [54]:
import numpy as np
dp.enable_features("honest-but-curious")

def make_sum(bounds):
    """Build a user-defined sum transformation over bounded integer vectors.

    Args:
        bounds: (lower, upper) pair used both for the input domain and for
            the stability map.

    Returns:
        The transformation produced by `dp.t.make_user_transformation`,
        which applies the built-in `sum` and maps a symmetric distance
        `d_in` to `d_in * max(abs(lower), upper)`.
    """
    lower, upper = bounds

    def stability_map(d_in):
        # Each added/removed record can shift the sum by at most this much.
        return d_in * max(abs(lower), upper)

    return dp.t.make_user_transformation(
        input_domain=dp.vector_domain(dp.atom_domain(bounds=bounds)),
        input_metric=dp.symmetric_distance(),
        output_domain=dp.atom_domain(T=int),
        output_metric=dp.absolute_distance(T=int),
        function=sum,
        stability_map=stability_map,
    )

def make_laplace(scale):
    """Build a user-defined Laplace measurement over a single integer.

    Args:
        scale: scale of the Laplace noise; also the divisor in the privacy map.

    Returns:
        The measurement produced by `dp.m.make_user_measurement`, which
        samples from a Laplace distribution centered on the input and maps
        an absolute distance `d_in` to a privacy loss of `d_in / scale`.
    """
    # The original unpacked an unrelated global `bounds` into unused locals
    # here; that hidden dependency has been removed.
    # NOTE(review): `np.random.laplace` is numpy's general-purpose sampler;
    # confirm it is acceptable before using this outside a demonstration.
    return dp.m.make_user_measurement(
        input_domain = dp.atom_domain(T=int),
        input_metric = dp.absolute_distance(T=int),
        output_measure = dp.max_divergence(T=int),
        function=lambda x: np.random.laplace(loc=x, scale=scale),
        privacy_map= lambda d_in: d_in / scale
    )

# The bounds must cover every value in the data passed below: with (0, 1)
# the input [1, 4, 5] would not be a member of the input domain, so the
# stability map would not hold. (0, 5) covers it and the sum is still 10.
m_sum = make_sum((0, 5))

m_lap = make_laplace(scale=1.)
# The result of this call is discarded; only the last expression is displayed.
m_lap(10)
m_sum([1,4,5])

10