In [2]:
from datetime import datetime
import numpy as np
import pandas as pd

# from utilities import *
import opendp.prelude as dp


In [3]:
dp.enable_features("contrib", "floating-point", "honest-but-curious")

# PUBLIC INFO
start_date, end_date = datetime(2020, 9, 1), datetime(2021, 3, 31)
time_col = "date"

# DATA
path = "C:\\Users\kshub\\OneDrive\\Documents\\PET_phase_2\\Technical_Phase_Data\\technical_phase_data.csv"
df = pd.read_csv(path)

  path = "C:\\Users\kshub\\OneDrive\\Documents\\PET_phase_2\\Technical_Phase_Data\\technical_phase_data.csv"


In [129]:
def dataframe_domain(public_key_sets=None):
    """Creates a domain representing the set of all data frames.
    
    Assumes column names and types are public information.
    Key sets optionally named for columns in `public_key_sets` are considered public information.

    Two data frames differing in their public information 
    are considered to have a data set distance of infinity.
    """
    return dp.user_domain(
        "DataFrameDomain", lambda x: isinstance(x, pd.DataFrame), public_key_sets
    )


def series_domain():
    """Creates a domain representing the set of all series.

    Assumes series name and type are public information.

    Two series differing in their public information 
    are considered to have a data set distance of infinity.
    """
    return dp.user_domain("SeriesDomain", lambda x: isinstance(x, pd.Series))

def identifier_distance():
    """Symmetric distance between the id sets."""
    return dp.user_distance("IdentifierDistance")


def approx_concentrated_divergence():
    """symmetric distance between the id sets"""
    return dp.user_distance("ApproxConcentratedDivergence()")


In [130]:
def make_truncate_time(start_date, end_date, time_col):
    """Create a transformation that filters the data to a given time frame.
    
    WARNING: Assumes that the data has at most one contribution per individual per week.
    """
    number_of_timesteps = (end_date - start_date).days // 7

    def time_preprocess(df):
        df = df.copy()

        # Convert time_col into datetime type
        df[time_col] = pd.to_datetime(df[time_col])

        # Filter the DataFrame based on the specified dates
        return df[(df[time_col] >= start_date) & (df[time_col] <= end_date)]

    return dp.t.make_user_transformation(
        input_domain=dataframe_domain(),
        input_metric=identifier_distance(),
        output_domain=dataframe_domain(),
        output_metric=dp.symmetric_distance(),
        function=time_preprocess,
        stability_map=lambda d_in: d_in * number_of_timesteps,
    )

In [131]:
def make_sum_by(column, by, bounds):
    """Create a transformation that computes the grouped bounded sum of `column`"""
    L, U = bounds
    # print(df.head())
    def function(df):
        df = df.copy()
        # print(df.head())
        df[column] = df[column].clip(*bounds)
        return df.groupby(by)[column].sum()

    return dp.t.make_user_transformation(
        input_domain=dataframe_domain(),
        input_metric=dp.symmetric_distance(),
        output_domain=series_domain(),
        output_metric=dp.l2_distance(T=float),
        function=function,
        stability_map=lambda d_in: np.sqrt(d_in) * max(abs(L), U),
    )


maximum nb_transaction entry for any category is 454. Assuming the bound of [0, 454]

In [132]:

def make_private_sum_by(column, by, bounds, scale):
    """Create a measurement that computes the grouped bounded sum of `column`"""
    space = dp.vector_domain(dp.atom_domain(T=int)), dp.l2_distance(T=float)
    m_gauss = space >> dp.m.then_gaussian(scale)
    t_sum = make_sum_by(column, by, bounds)

    def function(df):
        exact = t_sum(df)
        # print(exact)
        # print(exact.to_numpy())
        noisy_sum = pd.Series(
            np.maximum(m_gauss(exact.to_numpy().flatten()), 0), index=exact.index
        )
        return noisy_sum.to_frame()

    return dp.m.make_user_measurement(
        input_domain=dataframe_domain(public_key_sets=[by]),
        input_metric=dp.symmetric_distance(),
        output_measure=dp.zero_concentrated_divergence(T=float),
        function=function,
        privacy_map=lambda d_in: m_gauss.map(t_sum.map(d_in)),
    )

In [147]:
def make_filter_offline(column,entry):
        """filters offline entries"""
        return dp.t.make_user_transformation(
        input_domain=dataframe_domain(),
        input_metric=identifier_distance(),
        output_domain=dataframe_domain(),
        output_metric=identifier_distance(),
        function=lambda df: df[(df[column] == entry)],
        stability_map=lambda d_in: d_in,
    )

In [148]:
df_new = df.copy()
bounds = (0, 454)
start_date, end_date = datetime(2020, 9, 1), datetime(2021, 3, 31)
columns = ["nb_transactions"]
by = ["merch_postal_code", "date"]
scale=10.0
column="transaction_type"
entry="OFFLINE"
hotspot_predictor=(
    make_filter_offline(column,entry)
    >>make_truncate_time(start_date, end_date, time_col)
    >>make_private_sum_by(columns, by, bounds, scale)
)
print(hotspot_predictor.map(1))
output=hotspot_predictor(df_new)
print(output)

30917.40000000001
                                 0
merch_postal_code date            
7071              2020-09-01  1185
                  2020-09-08  1582
                  2020-09-15  1775
                  2020-09-22  2161
                  2020-09-29  1268
...                            ...
9810000           2021-03-02  1960
                  2021-03-09  2680
                  2021-03-16  1780
                  2021-03-23  1970
                  2021-03-30  2891

[9393 rows x 1 columns]


In [140]:
scale=dp.binary_search_param(hotspot_predictor, 1,1.0,bounds=bounds)
print(scale)

OpenDPException: 
  FFI("Continued stack trace from Exception in user-defined function:
Traceback (most recent call last):
  File "c:\Users\kshub\AppData\Local\Programs\Python\Python312\Lib\site-packages\opendp\_convert.py", line 459, in wrapper_func
    py_out = func(py_arg)
             ^^^^^^^^^^^^
  File "C:\Users\kshub\AppData\Local\Temp\ipykernel_22960\2871615293.py", line 8, in <lambda>
    function=lambda df: df[(df["transaction_type"] == "OFFLINE")],
                            ~~^^^^^^^^^^^^^^^^^^^^
TypeError: 'int' object is not subscriptable
")

In [122]:
analysis=dp.binary_search_chain(hotspot_predictor, 1,1.0,bounds=bounds)

OpenDPException: 
  FFI("Continued stack trace from Exception in user-defined function:
Traceback (most recent call last):
  File "c:\Users\kshub\AppData\Local\Programs\Python\Python312\Lib\site-packages\opendp\_convert.py", line 459, in wrapper_func
    py_out = func(py_arg)
             ^^^^^^^^^^^^
  File "C:\Users\kshub\AppData\Local\Temp\ipykernel_22960\257734847.py", line 8, in <lambda>
    function=lambda df: df[(df["transaction_type"] == "OFFLINE")],
                            ~~^^^^^^^^^^^^^^^^^^^^
TypeError: 'int' object is not subscriptable
")

In [149]:
df_new = df.copy()
bounds = (0, 454)
start_date, end_date = datetime(2020, 9, 1), datetime(2021, 3, 31)
columns = ["nb_transactions"]
by = ["merch_postal_code"]
scale=10.0
column="transaction_type"
entry="OFFLINE"
hotspot_predictor=(
    make_filter_offline(column,entry)
    >>make_truncate_time(start_date, end_date, time_col)
    >>make_private_sum_by(columns, by, bounds, scale)
)
print(hotspot_predictor.map(1))
output=hotspot_predictor(df_new)
print(output)

30917.40000000001
                        0
merch_postal_code        
7071                45765
55411              183692
70040               31856
70050               27510
70070               43102
...                   ...
9670000            108717
9710000            106582
9750000             84828
9790000            128256
9810000             83009

[303 rows x 1 columns]


In [150]:
df_new = df.copy()
bounds = (0, 454)
start_date, end_date = datetime(2020, 9, 1), datetime(2021, 3, 31)
columns = ["nb_transactions"]
by = ["merch_postal_code","date"]
scale=10.0
column="merch_category"
entry="Drug Stores/Pharmacies"
hotspot_predictor=(
    make_filter_offline(column,entry)
    >>make_truncate_time(start_date, end_date, time_col)
    >>make_private_sum_by(columns, by, bounds, scale)
)
print(hotspot_predictor.map(1))
output=hotspot_predictor(df_new)
print(output)

30917.40000000001
                                 0
merch_postal_code date            
7071              2020-09-01  2014
                  2020-09-08  1946
                  2020-09-15  1923
                  2020-09-22  1940
                  2020-09-29  1896
...                            ...
9810000           2021-03-02  1836
                  2021-03-09  1851
                  2021-03-16  1899
                  2021-03-23  1907
                  2021-03-30  1923

[9238 rows x 1 columns]


In [151]:
df_new = df.copy()
bounds = (0, 454)
start_date, end_date = datetime(2020, 9, 1), datetime(2021, 3, 31)
columns = ["nb_transactions"]
by = ["merch_postal_code","date"]
scale=10.0
column="merch_category"
entry="Airlines"
hotspot_predictor=(
    make_filter_offline(column,entry)
    >>make_truncate_time(start_date, end_date, time_col)
    >>make_private_sum_by(columns, by, bounds, scale)
)
print(hotspot_predictor.map(1))
output=hotspot_predictor(df_new)
print(output)

30917.40000000001
                               0
merch_postal_code date          
7071              2020-09-01   9
                  2020-09-08  25
                  2020-09-15  18
                  2020-09-22   0
                  2020-09-29   6
...                           ..
9810000           2021-03-02  58
                  2021-03-09  59
                  2021-03-16  71
                  2021-03-23  53
                  2021-03-30  65

[6291 rows x 1 columns]


In [None]:
def make_select_grouping_cols(candidates, min_bin_size, d_in, d_out):
    """Create a measurement that selects a set of grouping columns from `candidates`."""
    def make(s):
        return (
            make_grouping_cols_score(candidates, min_bin_size)
            >> dp.m.then_report_noisy_max_gumbel(s, optimize="max")
            >> (lambda idx: candidates[idx])
        )

    return dp.binary_search_chain(make, d_in, d_out, T=float)

In [28]:
def hotspot_detection(df, start_date, end_date, time_col):
    """DP function that detects hotspots."""
    df_new = df.copy()
    bounds = (0, 454)

    def make_filter_offline():
        """filters offline entries"""
        return dp.t.make_user_transformation(
        input_domain=dataframe_domain(),
        input_metric=dp.symmetric_distance(),
        output_domain=dp.vector_domain(dp.atom_domain(T=T)),
        output_metric=dp.symmetric_distance(),
        function=lambda df: df[(df["transaction_type"] == "OFFLINE")],
        stability_map=lambda d_in: d_in,
    )




    make_filter_offline>>make_truncate_time(start_date, end_date, time_col)>>make_private_sum_by()
    