In [110]:
%pip install lomas-client

Note: you may need to restart the kernel to use updated packages.


In [111]:
# Step 1
from lomas_client import Client
import numpy as np
import opendp.prelude as dp

# Step 2
APP_URL = "https://user-stuartbenoliel-490979-lomas-server-user.lab.sspcloud.fr"
USER_NAME = "Dr. Antartica"
DATASET_NAME = "PENGUIN"
client = Client(url=APP_URL, user_name = USER_NAME, dataset_name = DATASET_NAME)

# Step 3

In [112]:
penguin_metadata = client.get_dataset_metadata()
penguin_metadata

{'max_ids': 1,
 'rows': 344,
 'row_privacy': True,
 'censor_dims': False,
 'columns': {'species': {'private_id': False,
   'nullable': False,
   'max_partition_length': None,
   'max_influenced_partitions': None,
   'max_partition_contributions': None,
   'type': 'string',
   'cardinality': 3,
   'categories': ['Adelie', 'Chinstrap', 'Gentoo']},
  'island': {'private_id': False,
   'nullable': False,
   'max_partition_length': None,
   'max_influenced_partitions': None,
   'max_partition_contributions': None,
   'type': 'string',
   'cardinality': 3,
   'categories': ['Torgersen', 'Biscoe', 'Dream']},
  'bill_length_mm': {'private_id': False,
   'nullable': False,
   'max_partition_length': None,
   'max_influenced_partitions': None,
   'max_partition_contributions': None,
   'type': 'float',
   'precision': 64,
   'lower': 30.0,
   'upper': 65.0},
  'bill_depth_mm': {'private_id': False,
   'nullable': False,
   'max_partition_length': None,
   'max_influenced_partitions': None,
   'm

In [113]:
NB_PENGUINS = penguin_metadata["rows"]

df_dummy = client.get_dummy_dataset()

print(df_dummy.shape)
df_dummy.head()

(100, 7)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Dream,61.800324,20.774048,227.899635,3509.636957,FEMALE
1,Gentoo,Torgersen,54.48975,22.718264,163.455221,6592.209478,FEMALE
2,Chinstrap,Dream,39.305449,18.007412,203.606804,5906.470177,FEMALE
3,Chinstrap,Torgersen,63.921173,14.438975,201.422287,2552.942055,FEMALE
4,Chinstrap,Dream,57.256282,13.139363,235.757214,6985.173289,MALE


In [114]:
print(client.get_initial_budget())
print(client.get_total_spent_budget())
print(client.get_remaining_budget())

initial_epsilon=10.0 initial_delta=0.005
total_spent_epsilon=9.625367953470075 total_spent_delta=3.0000000000000004e-05
remaining_epsilon=0.3746320465299249 remaining_delta=0.0049700000000000005


In [115]:
num_rows_pipeline = (
    dp.t.make_split_dataframe(separator=",", col_names=columns) >>
    dp.t.make_select_column(key="species", TOA=str) >>
    dp.t.then_count() >>
    dp.m.then_laplace(scale=0.5) # scale arbitrary
)

num_rows = client.opendp.query(
    opendp_pipeline = num_rows_pipeline,
    dummy = True
).result.value

num_rows

99

In [116]:
columns = list(penguin_metadata["columns"].keys())
bill_length_min = penguin_metadata['columns']['bill_length_mm']['lower']
bill_length_max = penguin_metadata['columns']['bill_length_mm']['upper']
bill_length_min, bill_length_max

bill_length_pipeline = (
    dp.t.make_split_dataframe(separator=",", col_names=columns) >>
    dp.t.make_select_column(key="bill_length_mm", TOA=str) >>
    dp.t.then_cast_default(TOA=float) >>
    dp.t.then_clamp(bounds=(bill_length_min, bill_length_max)) >>
    dp.t.then_resize(size=NB_PENGUINS, constant=40.0) >>
    dp.t.then_mean() >>
    dp.m.then_gaussian(scale=0.1) # Noise addition mechanism instructions
)

In [117]:
cost_res = client.opendp.cost(
    opendp_pipeline = bill_length_pipeline,
    fixed_delta=1e-5
)
cost_res

CostResponse(epsilon=4.823307357702479, delta=1e-05)

In [118]:
dummy_var_res = client.opendp.query(
    opendp_pipeline = bill_length_pipeline,
    dummy=True,
    fixed_delta=1e-5
)
print(f"Dummy result for mean: {np.round(dummy_var_res.result.value, 2)}")

Dummy result for mean: 42.35


In [119]:
var_res = client.opendp.query(
    opendp_pipeline = bill_length_pipeline,
    fixed_delta=1e-5
)

var_bill_length = np.round(var_res.result.value, 2)
print(f"Variance of bill length: {var_bill_length} (from opendp query).")

ValidationError: 1 validation error for tagged-union[InvalidQueryExceptionModel,ExternalLibraryExceptionModel,UnauthorizedAccessExceptionModel,InternalServerExceptionModel]
  JSON input should be string, bytes or bytearray [type=json_type, input_value={'type': 'InvalidQueryExc...0.0049700000000000005.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/json_type

In [None]:
# Get standard error
standard_error = np.sqrt(var_bill_length / NB_PENGUINS)
print(f"Standard error of bill length: {np.round(standard_error, 2)}.")

# Compute the 95% confidence interval
ZSCORE = 1.96
lower_bound = np.round(40.0 - ZSCORE * standard_error, 2)
upper_bound = np.round(40.0 + ZSCORE * standard_error, 2)
print(f"The 95% confidence interval of the bill length of all penguins is [{lower_bound}, {upper_bound}].")

Standard error of bill length: 0.35.
The 95% confidence interval of the bill length of all penguins is [39.31, 40.69].


In [None]:
previous_queries = client.get_previous_queries()
len(previous_queries)

5

In [None]:
# OpenDP
var_bill_length_query = previous_queries[0]
var_bill_length_query

{'user_name': 'Dr. Antartica',
 'dataset_name': 'PENGUIN',
 'dp_librairy': 'opendp',
 'client_input': {'dataset_name': 'PENGUIN',
  'opendp_json': Measurement(
      input_domain   = AtomDomain(T=String),
      input_metric   = SymmetricDistance(),
      output_measure = MaxDivergence(f64)),
  'fixed_delta': None},
 'response': {'epsilon': 0.7122093023265228,
  'delta': 0.0,
  'requested_by': 'Dr. Antartica',
  'result': {'res_type': 'opendp', 'value': 21.587582284159083}},
 'timestamp': 1744791577.182293}

In [None]:
categories = penguin_metadata['columns']['species']['categories']
categories

['Adelie', 'Chinstrap', 'Gentoo']

In [None]:
species_count_pipeline = (
    dp.t.make_split_dataframe(separator=",", col_names=columns) >>
    dp.t.make_select_column(key="species", TOA=str) >>
    dp.t.then_count_by_categories(categories=categories) >>
    dp.m.then_laplace(scale=0.5)
)

dummy_res = client.opendp.query(
    opendp_pipeline = species_count_pipeline, 
    dummy=True
)

for i, count in enumerate(dummy_res.result.value):
    if i == len(categories):
        print(f"Species Unknown has {count} penguins.")
    else:
        print(f"Species {categories[i]} has {count} penguins.")

Dummy result for histogram: [27, 33, 41, 0]


In [None]:
cost_res = client.opendp.cost(
    opendp_pipeline = species_count_pipeline
)
cost_res

CostResponse(epsilon=2.0, delta=0.0)

In [None]:
# Import library
import numpy as np
import pandas as pd
import polars as pl
import opendp.prelude as dp

USER_NAME = "Dr. FSO"
DATASET_NAME = "FSO_INCOME_SYNTHETIC"
client = Client(url=APP_URL, user_name = USER_NAME, dataset_name = DATASET_NAME)

fso_income_metadata = client.get_dataset_metadata()
fso_income_metadata

{'max_ids': 1,
 'rows': 2032543,
 'row_privacy': True,
 'censor_dims': False,
 'columns': {'region': {'private_id': False,
   'nullable': False,
   'max_partition_length': None,
   'max_influenced_partitions': None,
   'max_partition_contributions': None,
   'type': 'int',
   'precision': 32,
   'cardinality': 7,
   'categories': [1, 2, 3, 4, 5, 6, 7]},
  'eco_branch': {'private_id': False,
   'nullable': False,
   'max_partition_length': None,
   'max_influenced_partitions': None,
   'max_partition_contributions': None,
   'type': 'int',
   'precision': 32,
   'cardinality': 72,
   'categories': [8,
    10,
    11,
    13,
    14,
    15,
    16,
    17,
    18,
    20,
    21,
    22,
    23,
    24,
    25,
    26,
    27,
    28,
    29,
    30,
    31,
    32,
    33,
    35,
    37,
    38,
    41,
    42,
    43,
    45,
    46,
    47,
    49,
    50,
    52,
    53,
    55,
    56,
    58,
    59,
    60,
    61,
    62,
    63,
    64,
    65,
    66,
    68,
    69,
    70,


In [122]:
context_margin = dp.Context.compositor(
    data=df_dummy,
    privacy_unit=dp.unit_of(contributions=1),
    privacy_loss=dp.loss_of(rho=0.1, delta=1e-7),
    split_evenly_over=5,
    margins={(): dp.polars.Margin(max_partition_length=5)}
)

AttributeError: module 'opendp.prelude' has no attribute 'polars'

In [None]:
# Income bounds
income_lower_bound, income_upper_bound = 1_000.0, 100_000.0

# Define dtype domain with bounds
lf_domain = dp.lazyframe_domain([
    dp.series_domain("region", dp.atom_domain(T=int)),
    dp.series_domain("eco_branch", dp.atom_domain(T=int)),
    dp.series_domain("profession", dp.atom_domain(T=int)),
    dp.series_domain("education", dp.atom_domain(T=int)),
    dp.series_domain("age", dp.atom_domain(T=int)),
    dp.series_domain("sex", dp.atom_domain(T=int)),
    dp.series_domain("income", dp.atom_domain(
        T=float,
        bounds=(income_lower_bound, income_upper_bound)
    ))
])

# Total
total_counts = pl.LazyFrame({
    "counts": [2_032_543]
}, schema_overrides={"counts": pl.UInt32})

# For sex
sex_counts = pl.LazyFrame({
    "sex": [0, 1], 
    "counts": [634_720, 1_397_823]
}, schema_overrides={"sex": pl.Int32, "counts": pl.UInt32})

# For region
region_counts = pl.LazyFrame({
    "region": [1, 2, 3, 4, 5, 6, 7],
    "counts": [352_001, 474_690, 267_304, 366_879, 284_638, 210_800, 76_231]
}, schema_overrides={"region": pl.Int32, "counts": pl.UInt32})

# For region and sex
sex_region_counts = pl.LazyFrame({
    "sex": [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 
    "region": [1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7], 
    "counts": [113_367, 148_265, 83_326, 113_715, 87_668, 64_357, 24_022, 238_634, 326_425, 183_978, 253_164, 196_970, 146_443, 52_209]
}, schema_overrides={"sex": pl.Int32, "region": pl.Int32, "counts": pl.UInt32})

margins = {
    "total_counts": total_counts,
    "sex_counts": sex_counts,
    "region_counts": region_counts,
    "sex_region_counts": sex_region_counts
}

# Add counts to margin
lf_domain = lf_domain.with_counts(
    total_counts
).with_counts(sex_counts).with_counts(region_counts).with_counts(sex_region_counts)

AttributeError: 'Domain' object has no attribute 'with_counts'

In [101]:
# Prepare a list of candidates
candidates = [x * 250.0 for x in range(8, 52)]
print(candidates)

# Partitions
PARTITIONS = ['sex', 'region']

metric = dp.symmetric_distance()                                     # Input metric
expr_domain = dp.expr_domain(lf_domain, grouping_columns=PARTITIONS) # Expr domain (Groupby)
temperature = 1000.0        

def make_quantile_pipeline(quantile):
    # Create expression
    return (
        (dp.csv_domain(lf_domain), metric)
        >> dp.t.then_scan_csv()
        >> dp.t.then_groupby_stable(PARTITIONS)
        >> dp.m.then_private_agg(
            dp.c.make_basic_composition(
                [
                    (expr_domain, dp.l1(metric))
                    >> dp.t.then_col('income')
                    >> dp.m.then_private_quantile_expr(candidates, temperature, quantile)
                ]
            )
        )
        >> dp.t.make_collect(lf_domain, metric)
    )

q25 = make_quantile_pipeline(0.25)
q50 = make_quantile_pipeline(0.5)
q75 = make_quantile_pipeline(0.75)

[2000.0, 2250.0, 2500.0, 2750.0, 3000.0, 3250.0, 3500.0, 3750.0, 4000.0, 4250.0, 4500.0, 4750.0, 5000.0, 5250.0, 5500.0, 5750.0, 6000.0, 6250.0, 6500.0, 6750.0, 7000.0, 7250.0, 7500.0, 7750.0, 8000.0, 8250.0, 8500.0, 8750.0, 9000.0, 9250.0, 9500.0, 9750.0, 10000.0, 10250.0, 10500.0, 10750.0, 11000.0, 11250.0, 11500.0, 11750.0, 12000.0, 12250.0, 12500.0, 12750.0]


AttributeError: module 'opendp.prelude' has no attribute 'csv_domain'

In [None]:
dummy_r25 = client.opendp_query(
    opendp_pipeline = q25, 
    dummy=True,
    input_data_type="path"
)
dummy_r25



In [None]:
cost_q25 = client.estimate_opendp_cost(q25, input_data_type="path")
cost_q50 = client.estimate_opendp_cost(q50, input_data_type="path")
cost_q75 = client.estimate_opendp_cost(q75, input_data_type="path")

print(f"The estimated costs are respectively {cost_q25}, {cost_q50} and {cost_q75} for q25, q50 and q75")

In [None]:
r25 = client.opendp_query(q25, input_data_type="path")
r50 = client.opendp_query(q50, input_data_type="path")
r75 = client.opendp_query(q75, input_data_type="path")

In [None]:
r25 = r25["query_response"].to_pandas()
r50 = r50["query_response"].to_pandas()
r75 = r75["query_response"].to_pandas()

results = pd.merge(r25, r50, on=PARTITIONS, suffixes=('_25', '_50'))
results = pd.merge(results, r75, on=PARTITIONS)
results.sort_values(by = ['region', 'sex']).head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def quantile_data(q1, q2, q3):
    return np.concatenate((np.random.uniform(q1[0], q2[0], size=50), np.random.uniform(q2[0], q3[0], size=50)))

results['data'] = results.apply(
    lambda row: quantile_data(row["income_25"], row["income_50"], row["income"]),
    axis=1,
)
results['sex'] = results['sex'].replace({0: 'woman', 1: 'man'})
results['region'] = results['region'].replace({1: 'Lemanique', 2: 'Mittleland', 3: 'North-West', 4: 'Zürich', 5: 'Oriental', 6: 'Central', 7: 'Ticino'})
results = results.explode('data', ignore_index=True)

plt.figure(figsize=(10, 6))
sns.boxplot(x="region", y="data", hue="sex", data=results, palette="Set1", width=0.5);
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Regions', fontsize=15)
plt.ylabel('Income per month (in CHF)', fontsize=15)
plt.title('Income per partition of the population', fontsize=16)
plt.show()