# Ce notebook a pour but de montrer l'éventail des possibilités qu'offre OpenDP via la couche Context API

## Packages nécessaires

In [74]:
%pip install opendp
%pip install 'opendp[polars]'
%pip install hvplot

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Information au préalable sur le vecteur en entrée

In [75]:
import opendp.prelude as dp
from random import randint

dp.enable_features('contrib')


privacy_unit = dp.unit_of(contributions=1)
print("Privacy unit : ", privacy_unit)
# (SymmetricDistance(), 1)

privacy_loss = dp.loss_of(epsilon=1.)
print("Privacy loss :", privacy_loss)
# (MaxDivergence, 1.0)

# Informations publiques
bounds = (0.0, 100.0)
imputed_value = 50.0

data = [float(randint(0, 100)) for _ in range(100)]

context = dp.Context.compositor(
    data=data,
    privacy_unit=privacy_unit,
    privacy_loss=privacy_loss,
    split_evenly_over=3
)

Privacy unit :  (SymmetricDistance(), 1)
Privacy loss : (MaxDivergence, 1.0)


## Requête 1 : Comptage

In [76]:
count_query = (
    context.query()
    .count()
    .laplace()
)

scale = count_query.param()
print("Scale (count laplace) :", scale)
# 3.0000000000000004

accuracy = dp.discrete_laplacian_scale_to_accuracy(scale=scale, alpha=0.05)
print("Avec une proba de 95%, ecart avec la vrai valeur d'au plus :", accuracy)
# 9.445721638273584

dp_count = count_query.release()
print("Comptage bruité", dp_count)
confidence_interval = (dp_count - accuracy, dp_count + accuracy)
print("Intervalle de confiance :", confidence_interval)

Scale (count laplace) : 3.0000000000000004
Avec une proba de 95%, ecart avec la vrai valeur d'au plus : 9.445721638273584
Comptage bruité 100
Intervalle de confiance : (90.55427836172642, 109.44572163827358)


## Requête 2 : Moyenne

In [77]:
mean_query = (
    context.query()
    .impute_constant(imputed_value)
    .clamp(bounds)
    .resize(size=dp_count, constant=imputed_value)
    .mean()
    .laplace()
)

dp_mean = mean_query.release()

# Polars

In [78]:
import polars as pl
import opendp.prelude as dp

dp.enable_features("contrib")

context = dp.Context.compositor(
    # Many columns contain mixtures of strings and numbers and cannot be parsed as floats,
    # so we'll set `ignore_errors` to true to avoid conversion errors.
    data=pl.scan_csv(dp.examples.get_france_lfs_path(), ignore_errors=True),
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=10,
    margins=[
        dp.polars.Margin(
            # the biggest (and only) partition is no larger than
            #    France population * number of quarters
            max_partition_length=60_000_000 * 36,
            public_info="lengths",# make partition size public (bounded-DP)
        ),
        dp.polars.Margin(by=["YEAR", "QUARTER"], public_info="keys"),
    ],
)

## Comptage

In [79]:
context_sans_marge = dp.Context.compositor(
    # Many columns contain mixtures of strings and numbers and cannot be parsed as floats,
    # so we'll set `ignore_errors` to true to avoid conversion errors.
    data=pl.scan_csv(dp.examples.get_france_lfs_path(), ignore_errors=True),
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=10,
)

query_num_responses = context_sans_marge.query().select(dp.len())

print(query_num_responses.release().collect().item())
query_num_responses.summarize(alpha=0.05)

199810


column,aggregate,distribution,scale,accuracy
str,str,str,f64,f64
"""len""","""Frame Length""","""Integer Laplace""",360.0,1078.963271


## Somme

In [80]:
query_work_hours = (
    # 99 represents "Not applicable"
    context.query()
    .filter(pl.col("HWUSUAL") != 99.0)
    # compute the DP sum
    .select(
        pl.col.HWUSUAL
        .cast(int)
        .fill_null(35)
        .dp.sum(bounds=(0, 80)))
)

print(query_work_hours.release().collect())
query_work_hours.summarize(alpha=0.05)

shape: (1, 1)
┌─────────┐
│ HWUSUAL │
│ ---     │
│ i64     │
╞═════════╡
│ 3024146 │
└─────────┘


column,aggregate,distribution,scale,accuracy
str,str,str,f64,f64
"""HWUSUAL""","""Sum""","""Integer Laplace""",28800.0,86277.589474


## Moyenne

In [81]:
query_work_hours = (
    context.query()
    .filter(pl.col.HWUSUAL != 99.0)
    # release both the sum and length in one query
    .select(
        pl.col.HWUSUAL
        .cast(int)
        .fill_null(35)
        .dp.sum(bounds=(0, 80)), 
        
        dp.len())
)

print(query_work_hours.release().collect().with_columns(mean=pl.col.HWUSUAL / pl.col.len))
query_work_hours.summarize(alpha=0.05)

shape: (1, 3)
┌─────────┬───────┬───────────┐
│ HWUSUAL ┆ len   ┆ mean      │
│ ---     ┆ ---   ┆ ---       │
│ i64     ┆ u32   ┆ f64       │
╞═════════╪═══════╪═══════════╡
│ 3111571 ┆ 78399 ┆ 39.688912 │
└─────────┴───────┴───────────┘


column,aggregate,distribution,scale,accuracy
str,str,str,f64,f64
"""HWUSUAL""","""Sum""","""Integer Laplace""",57600.0,172554.678955
"""len""","""Frame Length""","""Integer Laplace""",720.0,2157.427063


In [82]:
# En utilisant l'information publique de la taille du dataset (len)
query_mean_work_hours = context.query().select(
    pl.col.HWUSUAL
    .cast(int)
    .fill_null(35)
    .dp.mean(bounds=(0, 80))
)

print(query_mean_work_hours.release().collect())
query_mean_work_hours.summarize(alpha=0.05)

shape: (1, 1)
┌───────────┐
│ HWUSUAL   │
│ ---       │
│ f64       │
╞═══════════╡
│ 63.076095 │
└───────────┘


column,aggregate,distribution,scale,accuracy
str,str,str,f64,f64
"""HWUSUAL""","""Sum""","""Integer Laplace""",14400.0,43139.04473
"""HWUSUAL""","""Length""","""Integer Laplace""",0.0,


## Quantile

In [83]:
candidates = list(range(20, 60))

query_multi_quantiles = (
    context.query()
    .filter(pl.col.HWUSUAL != 99.0)
    .select(
        pl.col.HWUSUAL
        .fill_null(35)
        .dp.quantile(a, candidates)
        .alias(f"{a}-Quantile")
        for a in [0.25, 0.5, 0.75]
    )
)

print(query_multi_quantiles.release().collect())
query_multi_quantiles.summarize()

shape: (1, 3)
┌───────────────┬──────────────┬───────────────┐
│ 0.25-Quantile ┆ 0.5-Quantile ┆ 0.75-Quantile │
│ ---           ┆ ---          ┆ ---           │
│ i64           ┆ i64          ┆ i64           │
╞═══════════════╪══════════════╪═══════════════╡
│ 35            ┆ 36           ┆ 44            │
└───────────────┴──────────────┴───────────────┘


column,aggregate,distribution,scale
str,str,str,f64
"""0.25-Quantile""","""0.25-Quantile""","""GumbelMin""",6480.0
"""0.5-Quantile""","""0.5-Quantile""","""GumbelMin""",2160.0
"""0.75-Quantile""","""0.75-Quantile""","""GumbelMin""",6480.0


## Grouper 

### Pas d'info sur les classes ? -> On ne publie qu'à partir d'un certain seuil

In [84]:
context = dp.Context.compositor(
    # Many columns contain mixtures of strings and numbers and cannot be parsed as floats,
    # so we'll set `ignore_errors` to true to avoid conversion errors.
    data=pl.scan_csv(dp.examples.get_france_lfs_path(), ignore_errors=True),
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0 / 4, delta=1e-7),
    # allow for one query
    split_evenly_over=1,
)

query_age_ilostat = (
    context.query()
    .group_by("AGE", "ILOSTAT")
    .agg(dp.len())
)

query_age_ilostat.summarize(alpha=.05)

column,aggregate,distribution,scale,accuracy,threshold
str,str,str,f64,f64,u32
"""len""","""Frame Length""","""Integer Laplace""",144.0,431.884579,2773


In [85]:
import hvplot.pandas  # active hvplot sur pandas

# Récupération et conversion à Pandas directement
df = query_age_ilostat.release().collect().to_pandas()

# Ensuite, tu peux faire ton plotting
line = df.sort_values("AGE").hvplot.line(x="AGE", y="len", by="ILOSTAT")
scatter = df.sort_values("AGE").hvplot.scatter(x="AGE", y="len", by="ILOSTAT")
line * scatter

### Si on connait les classes, on peut éviter d'utiliser du delta

context = dp.Context.compositor(
    # Many columns contain mixtures of strings and numbers and cannot be parsed as floats,
    # so we'll set `ignore_errors` to true to avoid conversion errors.
    data=pl.scan_csv(dp.examples.get_france_lfs_path(), ignore_errors=True),
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0 / 4),
    # allow for one query
    split_evenly_over=1,
)

query_age_ilostat = (
    context.query()
    .group_by("AGE", "ILOSTAT")
    .agg(dp.len())
    .with_keys(df["AGE", "ILOSTAT"])
)

query_age_ilostat.summarize()

query_age_ilostat = (
    context.query()
    .group_by("AGE", "ILOSTAT")
    .agg(dp.len())
    .join(df["AGE", "ILOSTAT"].lazy(), how="right", on=["AGE", "ILOSTAT"])
)

query_age_ilostat.summarize()

### Autre manière (j'ai l'impression)

In [88]:
context = dp.Context.compositor(
    data=pl.scan_csv(dp.examples.get_france_lfs_path(), ignore_errors=True),
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0 / 4),
    split_evenly_over=2,
    margins=[
        # partition keys when grouped by "YEAR" and "QUARTER" are invariant
        dp.polars.Margin(by=["YEAR", "QUARTER"], public_info="keys")
    ],
)

query_quarterly_counts = (
    context.query()
    .group_by("YEAR", "QUARTER")
    .agg(dp.len())
)

print(query_quarterly_counts.release().collect())
query_quarterly_counts.summarize(alpha=.05)

shape: (40, 3)
┌──────┬─────────┬──────┐
│ YEAR ┆ QUARTER ┆ len  │
│ ---  ┆ ---     ┆ ---  │
│ i64  ┆ i64     ┆ u32  │
╞══════╪═════════╪══════╡
│ 2008 ┆ 4       ┆ 3831 │
│ 2006 ┆ 4       ┆ 4351 │
│ 2007 ┆ 4       ┆ 4230 │
│ 2012 ┆ 2       ┆ 6178 │
│ 2004 ┆ 2       ┆ 4027 │
│ …    ┆ …       ┆ …    │
│ 2013 ┆ 1       ┆ 6030 │
│ 2010 ┆ 1       ┆ 5786 │
│ 2009 ┆ 2       ┆ 5136 │
│ 2005 ┆ 4       ┆ 3876 │
│ 2006 ┆ 2       ┆ 5399 │
└──────┴─────────┴──────┘


column,aggregate,distribution,scale,accuracy
str,str,str,f64,f64
"""len""","""Frame Length""","""Integer Laplace""",288.0,863.270461


In [89]:
df = query_quarterly_counts.release().collect()

# build a date column
df = df.with_columns(pl.date(pl.col("YEAR"), pl.col("QUARTER") * 4, 1))

import hvplot.pandas  # Nécessaire pour activer .hvplot sur pandas

# df est toujours un polars.DataFrame ici
line = df.to_pandas().hvplot.line(x="date", y="len")

# Création du DataFrame avec la colonne "accuracy"
df_with_accuracy = df.with_columns(
    accuracy=query_quarterly_counts.summarize(alpha=.05)["accuracy"][0]
)

# Ensuite CONVERSION en pandas avant hvplot
errorbars = df_with_accuracy.to_pandas().hvplot.errorbars(
    x="date", y="len", yerr1="accuracy"
)

# Combinaison des deux tracés
(line * errorbars)


### L'information de la taille de la partition est connu

In [90]:
lf_preprocessed = pl.scan_csv(dp.examples.get_france_lfs_path(), ignore_errors=True) \
    .filter(pl.col("HWUSUAL") < 99)

context = dp.Context.compositor(
    data=lf_preprocessed,
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0, delta=1e-7),
    split_evenly_over=1,
    margins=[
        # total number of responses when grouped by "SEX" is public information
        dp.polars.Margin(
            by=["SEX"],
            public_info="lengths",
            max_partition_length=60_000_000, # population of France
            max_num_partitions=1,
        )
    ],
)

query_work_hours = (
    context.query()
    .group_by("SEX")
    .agg(
        pl.col.HWUSUAL
        .cast(int)
        .fill_null(0)
        .dp.mean((0, 98))
    )
)

df = query_work_hours.release().collect()

# released dataframes from the OpenDP Library are shuffled to conceal the ordering of rows in the original dataset
# therefore, to ensure proper alignment, we use join instead of hstack to add labels
df = pl.DataFrame({"SEX": [1, 2], "SEX_STR": ["male", "female"]}).join(df, on="SEX")
print(df)
query_work_hours.summarize(alpha=.05)

shape: (2, 3)
┌─────┬─────────┬───────────┐
│ SEX ┆ SEX_STR ┆ HWUSUAL   │
│ --- ┆ ---     ┆ ---       │
│ i64 ┆ str     ┆ f64       │
╞═════╪═════════╪═══════════╡
│ 2   ┆ female  ┆ 34.156538 │
│ 1   ┆ male    ┆ 40.81166  │
└─────┴─────────┴───────────┘


column,aggregate,distribution,scale,accuracy
str,str,str,f64,f64
"""HWUSUAL""","""Sum""","""Integer Laplace""",1764.0,5284.97166
"""HWUSUAL""","""Length""","""Integer Laplace""",0.0,


## Processing

In [91]:
pl.scan_csv(dp.examples.get_france_lfs_path(), ignore_errors=True).collect().head(5)

COEFF,QUARTER,REFYEAR,REFWEEK,INTWEEK,COUNTRY,DEGURBA,HHINST,INTWAVE,INTQUEST,REM,YEAR,HHPRIV,SEX,AGE,NATIONAL,YEARESID,COUNTRYB,PROXY,NOWKREAS,STAPRO,SIGNISAL,COUNTRYW,YSTARTWK,MSTARTWK,FTPT,TEMP,TEMPDUR,HWUSUAL,HWACTUAL,HWOVERP,HWOVERPU,HOURREAS,WISHMORE,HWWISH,LOOKOJ,EXIST2J,…,YEARPR,MONTHPR,STAPROPR,SEEKWORK,SEEKTYPE,SEEKDUR,METHODA,METHODB,METHODC,METHODD,METHODE,METHODF,METHODG,METHODH,METHODI,METHODJ,METHODK,METHODL,METHODM,WANTWORK,AVAILBLE,EDUCSTAT,EDUCLEVL,COURATT,COURLEN,ILOSTAT,ISCO1D,ISCOPR1D,DURUNE,EDUC4WN,HATLEV1D,STARTIME,LEAVCLAS,NACE1D,NACE2J1D,NACEPR1D,HHTYPE
f64,i64,i64,i64,f64,str,f64,f64,i64,i64,i64,i64,i64,i64,f64,str,f64,str,f64,i64,f64,i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,…,f64,f64,f64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,f64,f64,f64,f64,i64,f64,f64,f64,f64,str,f64,f64,str,f64,str,i64
0.579834,1,2005,5,7.0,"""FR""",0.0,9.0,1,2,2,2005,1,1,47.0,"""000-OWN COUNTRY""",0.0,"""000-OWN COUNTRY""",2.0,99,0.0,9,"""000-OWN COUNTRY""",2003.0,3.0,1.0,9.0,9.0,60.0,60.0,99.0,99.0,97.0,,60.0,0,1.0,…,9999.0,99.0,9.0,9,9.0,9.0,9,9,9,9,9,9,9,9,9,9,9,9,9,9.0,9,2.0,9.0,2.0,999.0,1,,,9.0,0.0,"""H""",23.0,,,9.0,"""9.0""",1
0.375213,2,2013,21,22.0,"""FR""",0.0,9.0,2,2,5,2013,1,2,65.0,"""000-OWN COUNTRY""",0.0,"""000-OWN COUNTRY""",2.0,99,5.0,9,"""000-OWN COUNTRY""",1998.0,99.0,1.0,1.0,9.0,35.0,28.0,0.0,0.0,12.0,0.0,35.0,0,1.0,…,9999.0,99.0,9.0,9,9.0,9.0,9,9,9,9,9,9,9,9,9,9,9,9,9,9.0,9,2.0,9.0,2.0,999.0,1,200.0,999.0,9.0,0.0,"""H""",180.0,,"""H""",9.0,"""9""",1
0.602517,4,2004,40,41.0,"""FR""",0.0,9.0,4,2,9,2004,1,1,47.0,"""NO ANSWER""",,"""NO ANSWER""",2.0,99,5.0,9,"""000-OWN COUNTRY""",2002.0,4.0,1.0,1.0,9.0,35.0,35.0,,,97.0,0.0,35.0,0,1.0,…,9999.0,99.0,9.0,9,9.0,9.0,9,9,9,9,9,9,9,9,9,9,9,9,9,9.0,9,2.0,9.0,2.0,999.0,1,,,9.0,0.0,"""L""",29.0,,,9.0,"""9.0""",1
0.638932,1,2005,4,5.0,"""FR""",0.0,9.0,2,2,1,2005,1,1,20.0,"""000-OWN COUNTRY""",0.0,"""000-OWN COUNTRY""",2.0,99,5.0,9,"""000-OWN COUNTRY""",2003.0,8.0,1.0,1.0,9.0,35.0,35.0,,,97.0,0.0,35.0,0,1.0,…,9999.0,99.0,9.0,9,9.0,9.0,9,9,9,9,9,9,9,9,9,9,9,9,9,9.0,9,2.0,9.0,2.0,999.0,1,,,9.0,0.0,"""L""",17.0,,,9.0,"""9.0""",1
1.140249,4,2009,46,48.0,"""FR""",3.0,9.0,4,2,11,2009,1,1,7.0,"""000-OWN COUNTRY""",0.0,"""000-OWN COUNTRY""",9.0,99,9.0,9,"""NO ANSWER""",9999.0,99.0,9.0,9.0,9.0,99.0,99.0,99.0,99.0,99.0,9.0,99.0,9,9.0,…,9999.0,99.0,9.0,9,9.0,9.0,9,9,9,9,9,9,9,9,9,9,9,9,9,9.0,9,9.0,9.0,9.0,999.0,9,,,9.0,9.0,"""9""",999.0,,"""9""",9.0,"""9""",1


### Créer / Modifier une colonne

In [92]:
import polars as pl
import opendp.prelude as dp
dp.enable_features("contrib")

# Attention pas de margin possible pour la nouvelle colonne

context = dp.Context.compositor(
    # Many columns contain mixtures of strings and numbers and cannot be parsed as floats,
    # so we'll set `ignore_errors` to true to avoid conversion errors.
    data=pl.scan_csv(dp.examples.get_france_lfs_path(), ignore_errors=True),
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0, delta=1e-7),
    split_evenly_over=4,
    margins=[dp.polars.Margin(max_partition_length=60_000_000 * 36)]
)

query_hwusual_binned = (
    context.query()
    # shadows the usual work hours "HWUSUAL" column with binned data
    .with_columns(pl.col.HWUSUAL.cut(breaks=[0, 20, 40, 60, 80, 98]))
    .group_by(pl.col.HWUSUAL)
    .agg(dp.len())
)
query_hwusual_binned.release().collect().sort("HWUSUAL")

HWUSUAL,len
cat,u32
"""(0, 20]""",5893
"""(20, 40]""",54169
"""(40, 60]""",16073
"""(98, inf]""",120194


### Filter

In [93]:
# Attention discard all public_info dans les marges

query_total_hours_worked = (
    context.query()
    .with_columns(pl.col.HWUSUAL.cast(int).fill_null(0))
    .filter(pl.col.HWUSUAL > 0)
    .select(pl.col.HWUSUAL.dp.sum((0, 80)))
)
query_total_hours_worked.release().collect()

HWUSUAL
i64
12554983


# Points supplémentaires

## Changer de Laplacien à gaussien

In [124]:
import polars as pl
import opendp.prelude as dp

dp.enable_features("contrib")

dp.loss_of(epsilon=1.0)
# (MaxDivergence, 1.0)

dp.loss_of(epsilon=1.0, delta=1e-9)
# (Approximate(MaxDivergence), (1.0, 1e-09))

dp.loss_of(rho=1.0)
# (ZeroConcentratedDivergence, 1.0)

context = dp.Context.compositor(
    data=pl.scan_csv(dp.examples.get_france_lfs_path(), ignore_errors=True),
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(rho=0.1), # Changer la loss
    split_evenly_over=5,
)

query_num_responses = context.query().select(dp.len())
query_num_responses.summarize(alpha=0.05)

rho should be less than or equal to 0.5, and is typically less than or equal to 0.25


column,aggregate,distribution,scale,accuracy
str,str,str,f64,f64
"""len""","""Frame Length""","""Integer Gaussian""",180.0,354.0


## Petite différence pour le comptage mais pas très importante

In [113]:
context = dp.Context.compositor(
    data=pl.scan_csv(dp.examples.get_france_lfs_path(), ignore_errors=True),
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=100.0),
    split_evenly_over=5,
)

query_len_variations = (
    context.query()
    .group_by("SEX")
    .agg([
        # total number of rows in the frame, including nulls
        dp.len(),
        # total number of rows in the HWUSUAL column (including nulls)
        pl.col.HWUSUAL.filter(pl.col.HWUSUAL > 100).dp.len(),
    ])
    # explicitly specifying keys makes the query satisfy pure-DP
    .with_keys(pl.LazyFrame({"SEX": [1, 2]}))
)
query_len_variations.summarize()

query_len_variations.release().collect()

epsilon should be less than or equal to 5, and is typically less than or equal to 1


SEX,len,HWUSUAL
i64,u32,u32
2,104332,0
1,95669,0


## Faire ses propres mécanismes ?

In [119]:
dp.enable_features("honest-but-curious")

def make_constant(input_domain, input_metric, constant):
    return dp.m.make_user_measurement(
        input_domain=input_domain,
        input_metric=input_metric,
        output_measure=dp.max_divergence(),
        function=lambda _: constant,
        privacy_map=lambda _: 0.0,
    )


dp.register(make_constant)

ValueError: 'constant' is already registered in the Context API. Please choose a different name.

In [125]:
context = dp.Context.compositor(
    data=[1, 2, 3],
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=2,
)

context.query().constant("denied").release()

context = dp.Context.compositor(
    data=pl.scan_csv(dp.examples.get_france_lfs_path(), ignore_errors=True),
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=5,
)

query_len_variations = (
    context.query()
    .group_by("SEX")
    .agg([
        # total number of rows in the frame, including nulls
        dp.len(),
        # total number of rows in the HWUSUAL column (including nulls)
        pl.col.HWUSUAL.filter(pl.col.HWUSUAL > 100).dp.len(),
        pl.col.HWUSUAL.filter(pl.col.HWUSUAL > 100).dp.constant("denied")
    ])
    # explicitly specifying keys makes the query satisfy pure-DP
    .with_keys(pl.LazyFrame({"SEX": [1, 2]}))
)
query_len_variations.summarize()

query_len_variations.release().collect()

AttributeError: 'DPExpr' object has no attribute 'constant'

## Les Marges

The Margin class is used to describe what information is known publicly about a grouped dataset: like the values you might expect to find in the margins of a table.

Be aware that aspects of your data marked as “public information” are not subject to privacy protections, so it is important that public descriptors about the margin should be set conservatively, or not set at all.


Parameters:

        by (Sequence | None) –

        public_info (Literal['keys'] | ~typing.Literal['lengths'] | None) –

        max_partition_length (int | None) –

        max_num_partitions (int | None) –

        max_partition_contributions (int | None) –

        max_influenced_partitions (int | None) –

by: Sequence | None = None

    Polars expressions describing the grouping columns.

max_influenced_partitions: int | None = None

    The greatest number of partitions any one individual can contribute to.

max_num_partitions: int | None = None

    An upper bound on the number of distinct partitions.

max_partition_contributions: int | None = None

    The greatest number of records an individual may contribute to any one partition.

    This can significantly reduce the sensitivity of grouped queries under zero-Concentrated DP.

max_partition_length: int | None = None

    An upper bound on the number of records in any one partition.

    If you don’t know how many records are in the data, you can specify a very loose upper bound, for example, the size of the total population you are sampling from.

    This is used to resolve issues raised in the paper Widespread Underestimation of Sensitivity in Differentially Private Libraries and How to Fix It.

public_info: Literal['keys'] | Literal['lengths'] | None = None

    Identifies properties of grouped data that are considered public information.

        "keys" designates that keys are not protected

        "lengths" designates that both keys and partition lengths are not protected

