In [1]:
from datetime import datetime
import numpy as np
import pandas as pd

# from utilities import *
import opendp.prelude as dp


In [2]:
dp.enable_features("contrib", "floating-point", "honest-but-curious")

# PUBLIC INFO
start_date, end_date = datetime(2020, 9, 1), datetime(2021, 3, 31)
# time_col = "date"

# DATA
path = "C:\\Users\kshub\\OneDrive\\Documents\\PET_phase_2\\Technical_Phase_Data\\technical_phase_data.csv"
df = pd.read_csv(path)

In [3]:
def dataframe_domain(public_key_sets=None):
    """Creates a domain representing the set of all data frames.
    
    Assumes column names and types are public information.
    Key sets optionally named for columns in `public_key_sets` are considered public information.

    Two data frames differing in their public information 
    are considered to have a data set distance of infinity.
    """
    return dp.user_domain(
        "DataFrameDomain", lambda x: isinstance(x, pd.DataFrame), public_key_sets
    )


def series_domain():
    """Creates a domain representing the set of all series.

    Assumes series name and type are public information.

    Two series differing in their public information 
    are considered to have a data set distance of infinity.
    """
    return dp.user_domain("SeriesDomain", lambda x: isinstance(x, pd.Series))

def identifier_distance():
    """Symmetric distance between the id sets."""
    return dp.user_distance("IdentifierDistance")


def approx_concentrated_divergence():
    """symmetric distance between the id sets"""
    return dp.user_distance("ApproxConcentratedDivergence()")


In [4]:
def make_preprocess_location():
    """Create a 1-stable transformation to bin `merch_postal_code` by city"""

    def categorize_city(code):
        if code.startswith("5"):
            return "Medellin"
        elif code.startswith("11"):
            return "Bogota"
        elif code.startswith("70"):
            return "Brasilia"
        else:
            return "Santiago"

    def location_preprocess(df):
        loc_df = df.copy()
        # Convert merchant_postal_code into str type
        loc_df["merch_postal_code"] = loc_df["merch_postal_code"].astype(str)
        # Apply the function to create a new column
        loc_df["city"] = loc_df["merch_postal_code"].apply(
            categorize_city
        )
        return loc_df

    return dp.t.make_user_transformation(
        input_domain=dataframe_domain(),
        input_metric=identifier_distance(),
        output_domain=dataframe_domain(),
        output_metric=identifier_distance(),
        function=location_preprocess,
        stability_map=lambda d_in: d_in,
    )

In [5]:
def make_preprocess_merchant():
    """Create a 1-stable transformation to bin `merch_postal_code` by city"""

    def categorize_merchant(merch):
        if merch in ['Hotels/Motels','Restaurants','Bars/Discotheques']:
            return "luxury"
        elif merch in ['Grocery Stores/Supermarkets','Drug Stores/Pharmacies','General Retail Stores','Utilities: Electric, Gas, Water','Hospitals']:
            return "essential"
        else:
            return "other"

    def merchant_preprocess(df):
        loc_df = df.copy()
        # Convert merchant_postal_code into str type
        loc_df["merch_category"] = loc_df["merch_category"].astype(str)
        # Apply the function to create a new column
        loc_df["merch_super_category"] = loc_df["merch_category"].apply(
            categorize_merchant
        )
        return loc_df

    return dp.t.make_user_transformation(
        input_domain=dataframe_domain(),
        input_metric=identifier_distance(),
        output_domain=dataframe_domain(),
        output_metric=identifier_distance(),
        function=merchant_preprocess,
        stability_map=lambda d_in: d_in,
    )

In [6]:
def make_truncate_time(start_date, end_date, time_col):
    """Create a transformation that filters the data to a given time frame.
    
    WARNING: Assumes that the data has at most one contribution per individual per week.
    """
    number_of_timesteps = (end_date - start_date).days // 7

    def time_preprocess(df):
        df = df.copy()

        # Convert time_col into datetime type
        df[time_col] = pd.to_datetime(df[time_col])

        # Filter the DataFrame based on the specified dates
        return df[(df[time_col] >= start_date) & (df[time_col] <= end_date)]

    return dp.t.make_user_transformation(
        input_domain=dataframe_domain(),
        input_metric=identifier_distance(),
        output_domain=dataframe_domain(),
        output_metric=dp.symmetric_distance(),
        function=time_preprocess,
        stability_map=lambda d_in: d_in * number_of_timesteps,
    )

In [7]:
def make_sum_by(column, by, bounds):
    """Create a transformation that computes the grouped bounded sum of `column`"""
    L, U = bounds
    def function(df):
        df = df.copy()
        df[column] = df[column].clip(*bounds)
        return df.groupby(by)[column].sum()

    return dp.t.make_user_transformation(
        input_domain=dataframe_domain(),
        input_metric=dp.symmetric_distance(),
        output_domain=series_domain(),
        output_metric=dp.l2_distance(T=float),
        function=function,
        stability_map=lambda d_in: np.sqrt(d_in) * max(abs(L), U),
    )


maximum nb_transaction entry for any category is 454. Assuming the bound of [0, 454]

In [8]:

def make_private_sum_by(column, by, bounds, scale):
    """Create a measurement that computes the grouped bounded sum of `column`"""
    space = dp.vector_domain(dp.atom_domain(T=int)), dp.l2_distance(T=float)
    m_gauss = space >> dp.m.then_gaussian(scale)
    t_sum = make_sum_by(column, by, bounds)

    def function(df):
        exact = t_sum(df)
        # print(exact)
        noisy_sum = pd.Series(
            np.maximum(m_gauss(exact.to_numpy().flatten()), 0), 
        )
        # print(noisy_sum)
        noisy_sum=noisy_sum.to_frame(name=column)
        noisy_sum[by] = exact.index
        return noisy_sum

    return dp.m.make_user_measurement(
        input_domain=dataframe_domain(public_key_sets=[by]),
        input_metric=dp.symmetric_distance(),
        output_measure=dp.zero_concentrated_divergence(T=float),
        function=function,
        privacy_map=lambda d_in: m_gauss.map(t_sum.map(d_in)),
    )

In [9]:
def make_filter(column,entry, sensetivity:int= 1):
        """filters offline entries"""
        
        def function(df):
            df = df.copy()
            return df[(df[column] == entry)]


        return dp.t.make_user_transformation(
        input_domain=dataframe_domain(),
        input_metric=identifier_distance(),
        output_domain=dataframe_domain(),
        output_metric=identifier_distance(),
        function=function,
        stability_map=lambda d_in: d_in* sensetivity,
    )

In [10]:
print((end_date-start_date)/7)
print(np.sqrt(30)*454)
space = dp.vector_domain(dp.atom_domain(T=int)), dp.l2_distance(T=float)
m_gauss = space >> dp.m.then_gaussian(10.0)
print(m_gauss.map(2486.66))

30 days, 3:25:42.857143
2486.6604110734543
30917.389778


In [11]:
df_new = df.copy()
bounds = (0, 454)
start_date, end_date = datetime(2020, 9, 1), datetime(2021, 3, 31)
columns = "nb_transactions"
by = "merch_postal_code"
scale=10.0
column="transaction_type"
entry="OFFLINE"
city_col="city"
City_entry="Medellin"
time_col="date"
hotspot_predictor=(
    make_preprocess_location()
    >>make_filter(column,entry)
    >>make_filter(city_col,City_entry)
    >>make_truncate_time(start_date, end_date, time_col)
    >>make_private_sum_by(columns, by, bounds, scale)
)
print(hotspot_predictor.map(1))
output=hotspot_predictor(df_new)
output

30917.40000000001


Unnamed: 0,nb_transactions,merch_postal_code
0,182274,500001
1,184205,500002
2,181059,500003
3,178539,500004
4,202203,500005
5,189753,500006
6,205051,500007
7,166501,500008
8,194858,500009
9,188944,500010


In [12]:
postal_codes=df["merch_postal_code"].unique()
print(postal_codes)

[8700000  500034  110621 8900000  111941  111111  111061 8780000  111321
   70675 9460000   70374   70253 9080000  110231   70355 9650000  110321
  111961   70336  500030  500020   70354  110911 9020000 9420000   70640
  500008  500042   70345 9250000 9380000  111121   70353  500031   70710
  111411  110921 9540000  500043  500044  110811  110211  110111   70387
  110561  110841   70277  111071  111921 8420000  110011  111166 9560000
  500024  110141 9480000  500046  112041 8380000  500007  500013  111621
   70070  111981  500033   70704  110931   70256  500001  111911 8940000
 9630000  110431  111011   70302   55411  110821 8980000  111156   70301
  110881  111221  111151 9200000 8580000  500023  500036   70077  500014
  500035  110861   70650  500010  110311 7910000  500040   70293   70385
  111311  500025 9790000  111831  110851  111631  500012   70313  111051
  500022  110441 9670000  110541   70210  110121   70343   70670  111821
  111031   70272  500047  500002  110010  112021   

In [13]:
import random

postal_codes=df["merch_postal_code"].unique()

postal_code = {
    'Medellin': [ code  for code in postal_codes if str(code).startswith('5')],
    'Bogota': [ code  for code in postal_codes if str(code).startswith('11')],
    'Brasilia': [ code  for code in postal_codes if str(code).startswith('70')],
    'Santiago': [ code  for code in postal_codes if not str(code).startswith('5') and not str(code).startswith('11') and not str(code).startswith('70')]
}

# Reference coordinates
reference_coords = {
    "Medellin": (6.2476, -75.5658),
    "Bogota": (4.7110, -74.0721),
    "Brasilia": (-15.7975, -47.8919),
    "Santiago": (-33.4489, -70.6693)
}

# Function to generate unique coordinates
def generate_unique_coords(base_lat, base_lon, num_coords):
    coords = []
    for _ in range(num_coords):
        # Slightly vary the base coordinates
        lat_variation = random.uniform(-1.5, +1.5)
        lon_variation = random.uniform(-1.5, +1.5)
        new_lat = base_lat + lat_variation
        new_lon = base_lon + lon_variation
        coords.append((new_lat, new_lon))
    return coords

# Assign unique coordinates to each postal code
postal_code_coords = {}
for segment, codes in postal_code.items():
    base_lat, base_lon = reference_coords[segment]
    unique_coords = generate_unique_coords(base_lat, base_lon, len(codes))
    for code, coord in zip(codes, unique_coords):
        postal_code_coords[code] = coord

# Print the results
for code, coord in list(postal_code_coords.items())[:10]:

    print(f'{code}: {coord}')

500034: (5.8790718661717545, -75.52975986919401)
500030: (7.5181870194686296, -74.4469053690082)
500020: (7.194115163531915, -75.39875048300307)
500008: (4.859658283922736, -76.67225472652149)
500042: (6.88110934450476, -74.89991564531226)
500031: (5.116301381344213, -74.93706678389289)
500043: (6.825553086973038, -76.74514417604007)
500044: (6.597780500772329, -76.98966454423095)
500024: (5.6459930606929944, -75.92399169353925)
500046: (7.237907678910985, -75.37405076421844)


In [14]:
df_cords = df.copy()
df_cords['Latitude'], df_cords['Longitude'] = zip(*df['merch_postal_code'].map(postal_code_coords))

df_cords.head()

Unnamed: 0,ID,date,merch_category,merch_postal_code,transaction_type,spendamt,nb_transactions,Latitude,Longitude
0,1,2019-01-01,Grocery Stores/Supermarkets,8700000,ONLINE,11238.12845,160,-33.142715,-71.400178
1,1,2019-01-01,Grocery Stores/Supermarkets,500034,ONLINE,12848.165221,183,5.879072,-75.52976
2,2,2019-01-01,Grocery Stores/Supermarkets,110621,ONLINE,12116.165569,173,5.547468,-74.780894
3,3,2019-01-01,Hotels/Motels,8900000,OFFLINE,7745.998879,38,-33.414208,-70.589038
4,4,2019-01-01,Restaurants,111941,OFFLINE,6927.424754,173,3.84968,-73.059453


In [15]:
df_week = df_cords[df_cords['date'] == '2019-01-01']

df_week.head()

Unnamed: 0,ID,date,merch_category,merch_postal_code,transaction_type,spendamt,nb_transactions,Latitude,Longitude
0,1,2019-01-01,Grocery Stores/Supermarkets,8700000,ONLINE,11238.12845,160,-33.142715,-71.400178
1,1,2019-01-01,Grocery Stores/Supermarkets,500034,ONLINE,12848.165221,183,5.879072,-75.52976
2,2,2019-01-01,Grocery Stores/Supermarkets,110621,ONLINE,12116.165569,173,5.547468,-74.780894
3,3,2019-01-01,Hotels/Motels,8900000,OFFLINE,7745.998879,38,-33.414208,-70.589038
4,4,2019-01-01,Restaurants,111941,OFFLINE,6927.424754,173,3.84968,-73.059453


In [16]:
import plotly.express as px

# Improved scatter_geo plot
fig = px.scatter_geo(
    df_week,
    lat='Latitude',
    lon='Longitude',
    color='nb_transactions',
    size='spendamt',
    hover_name='merch_category',
    hover_data={'spendamt': True, 'nb_transactions': True, 'Latitude': False, 'Longitude': False},
    projection='mercator',  # Mercator projection is suitable for South America
    fitbounds="locations",
    title="Transaction Locations in South America",
    color_continuous_scale=px.colors.sequential.Plasma  # Better color scale for visual distinction
)

# Center the map around South America
fig.update_geos(
    center=dict(lat=-15.0, lon=-60.0),
    projection_scale=2.5  # Adjust the scale to zoom in on South America
)

# Display the plot
fig.show()

In [None]:
import plotly.express as px
import pandas as pd
import ipywidgets as widgets
from IPython.display import display


# Create a date slider
date_slider = widgets.SelectionSlider(
    options=df_cords['date'],
    description='Date:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True
)

# Function to update the plot based on the selected date
def update_plot(date):
    print(date)
    filtered_df = df_cords[df_cords['date'] == date]
    fig = px.scatter_geo(
        filtered_df,
        lat='Latitude',
        lon='Longitude',
        color='nb_transactions',
        size='spendamt',
        hover_name='merch_category',
        hover_data={'spendamt': True, 'nb_transactions': True, 'Latitude': False, 'Longitude': False},
        projection='mercator',
        title=f"Transaction Locations on {date}",
        color_continuous_scale=px.colors.sequential.Plasma
    )
    fig.update_geos(
        center=dict(lat=-15.0, lon=-60.0),
        projection_scale=2.5
    )
    fig.show()

# Display the slider and the initial plot
widgets.interactive(update_plot, date=date_slider)
display(date_slider)
# update_plot(df_cords['date'].min())

In [16]:
# give a unique number to each category
category_numbers = {category: i for i, category in enumerate(df_cords['merch_category'].unique())}

category_numbers

{'Grocery Stores/Supermarkets': 0,
 'Hotels/Motels': 1,
 'Restaurants': 2,
 'General Retail Stores': 3,
 'Drug Stores/Pharmacies': 4,
 'Utilities: Electric, Gas, Water': 5,
 'Hospitals': 6,
 'Bars/Discotheques': 7,
 'Computer Network/Information Services': 8,
 'Airlines': 9}

In [17]:
df_cords['category_number'] = df_cords['merch_category'].map(category_numbers)

df_cords.head()

Unnamed: 0,ID,date,merch_category,merch_postal_code,transaction_type,spendamt,nb_transactions,Latitude,Longitude,category_number
0,1,2019-01-01,Grocery Stores/Supermarkets,8700000,ONLINE,11238.12845,160,-34.189865,-69.195871,0
1,1,2019-01-01,Grocery Stores/Supermarkets,500034,ONLINE,12848.165221,183,5.036609,-75.963584,0
2,2,2019-01-01,Grocery Stores/Supermarkets,110621,ONLINE,12116.165569,173,5.178435,-73.24015,0
3,3,2019-01-01,Hotels/Motels,8900000,OFFLINE,7745.998879,38,-33.704478,-72.073855,1
4,4,2019-01-01,Restaurants,111941,OFFLINE,6927.424754,173,3.816293,-74.4846,2


In [18]:
import plotly.graph_objects as go
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

# Create a date slider
date_slider = widgets.SelectionSlider(
    options=df_cords['date'],
    description='Date:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True
)

# Initial plot
initial_date = df_cords['date'][0]
filtered_df = df_cords[(df['date'] == initial_date) & (df_cords['merch_postal_code'] >7500000)]

fig = go.FigureWidget()

scatter = go.Scattergeo(
    lat=filtered_df['Latitude'],
    lon=filtered_df['Longitude'],
    text=filtered_df['spendamt'],
    marker=dict(
        size=filtered_df['spendamt'] / 100,  # Adjust size for better visualization
        color=filtered_df['category_number'],
        colorscale='Viridis',
        colorbar=dict(title='category_number'),
        line_color='black',
        line_width=0.5,
        sizemode='area'
    ),
    hoverinfo='text'
)

fig.add_trace(scatter)

fig.update_layout(
    title_text=f'Transaction Locations on {initial_date}',
    geo=dict(
        # focus only on points 
        scope = 'world',
        projection_type='mercator',
        showland=True,
        landcolor='rgb(217, 217, 217)',
        subunitwidth=1,
        countrywidth=1,
        lonaxis=dict(range=[filtered_df['Longitude'].min() - 1, filtered_df['Longitude'].max() + 1]),
        lataxis=dict(range=[filtered_df['Latitude'].min() - 1, filtered_df['Latitude'].max() + 1])
    )
)

# Function to update the plot based on the selected date
def update_plot(date):
    filtered_df = df_cords[(df_cords['date'] == date) & (df_cords['merch_postal_code'] >7500000 )]
    with fig.batch_update():
        fig.data[0].marker.size = filtered_df['spendamt'] / 100
        fig.data[0].marker.color = filtered_df['category_number']
        fig.data[0].text = filtered_df['spendamt']
        fig.layout.title.text = f'Transaction Locations on {date}'

# Display the slider and the initial plot
widgets.interactive(update_plot, date=date_slider)
display(date_slider)
display(fig)

SelectionSlider(continuous_update=False, description='Date:', options=('2019-01-01', '2019-01-01', '2019-01-01…

FigureWidget({
    'data': [{'hoverinfo': 'text',
              'lat': array([-34.1898648 , -33.7044778 , -32.16629612, ..., -33.57704524,
                            -32.43968967, -34.93283588]),
              'lon': array([-69.19587129, -72.0738547 , -70.60934886, ..., -70.06573084,
                            -70.22920016, -72.002195  ]),
              'marker': {'color': array([0, 1, 2, ..., 3, 2, 2]),
                         'colorbar': {'title': {'text': 'category_number'}},
                         'colorscale': [[0.0, '#440154'], [0.1111111111111111,
                                        '#482878'], [0.2222222222222222,
                                        '#3e4989'], [0.3333333333333333,
                                        '#31688e'], [0.4444444444444444,
                                        '#26828e'], [0.5555555555555556,
                                        '#1f9e89'], [0.6666666666666666,
                                        '#35b779'], [0.77777777777777

In [22]:
def hotspot_analyser(df:pd.DataFrame, start_date:datetime,end_date:datetime,city_filter:str, nb_postal_codes: int,epsilon:float):
    """final function to predict hotspots"""
    bounds = (0, 600)
    transaction_data_col = "nb_transactions"
    postal_code_groupby_col = "merch_postal_code"
    transaction_type_col = "transaction_type"
    transaction_type_filter = "OFFLINE"
    city_col="city"
    time_col="date"

    """time steps calculation"""
    nb_timesteps = (end_date - start_date).days // 7

    """scale calculation"""
    scale=(3.0*nb_postal_codes*nb_timesteps)/epsilon

    new_df=df.copy()


    hotspot_predictor=(
    make_preprocess_location()
    >>make_filter(transaction_type_col,transaction_type_filter)
    >>make_filter(city_col,city_filter,nb_postal_codes)
    >>make_truncate_time(start_date, end_date, time_col)
    >>make_private_sum_by(transaction_data_col, postal_code_groupby_col, bounds, scale)
   )

    return hotspot_predictor(new_df)
    

In [23]:
print(hotspot_analyser(df,start_date,end_date,"Medellin",42,10))

    nb_transactions merch_postal_code
0            182596            500001
1            183885            500002
2            181591            500003
3            179414            500004
4            202129            500005
5            189668            500006
6            206036            500007
7            167154            500008
8            194797            500009
9            188914            500010
10           175138            500011
11           165858            500012
12           190372            500013
13           214984            500014
14           192055            500015
15           192691            500016
16           193748            500017
17           217817            500020
18           204687            500021
19           214916            500022
20           186394            500023
21           213222            500024
22           206241            500025
23           184740            500026
24           202180            500027
25          

In [24]:
def mobility_analyser(df:pd.DataFrame,start_date:datetime,end_date:datetime,city_filter: str, epsilon:float):
    """final function to predict hotspots"""
    bounds = (0, 600)
    transaction_data_col = "nb_transactions"
    groupby_col = "date"

    city_col="city"
    time_col="date"
    merch_category_col="merch_category"
    merch_filter="Airlines"

    """time steps calculation"""
    nb_timesteps = (end_date - start_date).days // 7

    """scale calculation"""
    scale=(3.0*nb_timesteps*nb_timesteps)/epsilon

    new_df=df.copy()


    hotspot_predictor=(
    make_preprocess_location()
    >>make_filter(city_col,city_filter)
    >>make_filter(merch_category_col,merch_filter)
    >>make_truncate_time(start_date, end_date, time_col)
    >>make_private_sum_by(transaction_data_col, groupby_col, bounds, scale)
   )

    return hotspot_predictor(new_df)

In [25]:
print(mobility_analyser(df,start_date,end_date,"Medellin",10))

    nb_transactions       date
0              1212 2020-09-01
1              1521 2020-09-08
2              1492 2020-09-15
3               884 2020-09-22
4              1714 2020-09-29
5              1210 2020-10-06
6              1387 2020-10-13
7              1691 2020-10-20
8              1177 2020-10-27
9               899 2020-11-03
10             1731 2020-11-10
11             1448 2020-11-17
12             1795 2020-11-24
13             1238 2020-12-01
14             1190 2020-12-08
15             1238 2020-12-15
16             1199 2020-12-22
17             1098 2020-12-29
18             1232 2021-01-05
19             1095 2021-01-12
20             1502 2021-01-19
21             1032 2021-01-26
22             1199 2021-02-02
23             1509 2021-02-09
24             1253 2021-02-16
25             1472 2021-02-23
26              875 2021-03-02
27             1095 2021-03-09
28             1506 2021-03-16
29              847 2021-03-23
30             1763 2021-03-30


In [29]:
def pandemic_stage_analyser(df:pd.DataFrame,start_date:datetime,end_date:datetime,city_filter: str,essential_or_luxury:str, epsilon:float):
    """final function to predict hotspots"""
    bounds = (0, 600)
    transaction_data_col = "nb_transactions"
    groupby_col = "date"

    city_col="city"
    time_col="date"
    merch_category_col="merch_super_category"


    """time steps calculation"""
    nb_timesteps = (end_date - start_date).days // 7

    """scale calculation"""
    scale=(3.0*nb_timesteps*nb_timesteps)/epsilon

    new_df=df.copy()


    hotspot_predictor=(
    make_preprocess_location()
    >>make_preprocess_merchant()
    >>make_filter(city_col,city_filter)
    >>make_filter(merch_category_col,essential_or_luxury)
    >>make_truncate_time(start_date, end_date, time_col)
    >>make_private_sum_by(transaction_data_col, groupby_col, bounds, scale)
   )

    return hotspot_predictor(new_df)

In [31]:
print(pandemic_stage_analyser(df,start_date,end_date,"Medellin",essential_or_luxury="luxury",epsilon=10))

    nb_transactions       date
0            119130 2020-09-01
1            122774 2020-09-08
2            117968 2020-09-15
3            123467 2020-09-22
4            115438 2020-09-29
5            118825 2020-10-06
6            117688 2020-10-13
7            118011 2020-10-20
8            116966 2020-10-27
9            117464 2020-11-03
10           124120 2020-11-10
11           115325 2020-11-17
12           117164 2020-11-24
13           117696 2020-12-01
14           111239 2020-12-08
15           107733 2020-12-15
16           114521 2020-12-22
17           107327 2020-12-29
18           105938 2021-01-05
19           104889 2021-01-12
20           110024 2021-01-19
21           115289 2021-01-26
22           122166 2021-02-02
23           126110 2021-02-09
24           127413 2021-02-16
25           132162 2021-02-23
26           133530 2021-03-02
27           126731 2021-03-09
28           122698 2021-03-16
29           117623 2021-03-23
30           112777 2021-03-30


In [157]:
scale=dp.binary_search_param(hotspot_predictor, 1,1.0,bounds=bounds)
print(scale)

OpenDPException: 
  FFI("Continued stack trace from Exception in user-defined function:
Traceback (most recent call last):
  File "c:\Users\kshub\AppData\Local\Programs\Python\Python312\Lib\site-packages\opendp\_convert.py", line 459, in wrapper_func
    py_out = func(py_arg)
             ^^^^^^^^^^^^
  File "C:\Users\kshub\AppData\Local\Temp\ipykernel_22960\4272251858.py", line 5, in function
    df = df.copy()
         ^^^^^^^
AttributeError: 'int' object has no attribute 'copy'
")

In [122]:
analysis=dp.binary_search_chain(hotspot_predictor, 1,1.0,bounds=bounds)

OpenDPException: 
  FFI("Continued stack trace from Exception in user-defined function:
Traceback (most recent call last):
  File "c:\Users\kshub\AppData\Local\Programs\Python\Python312\Lib\site-packages\opendp\_convert.py", line 459, in wrapper_func
    py_out = func(py_arg)
             ^^^^^^^^^^^^
  File "C:\Users\kshub\AppData\Local\Temp\ipykernel_22960\257734847.py", line 8, in <lambda>
    function=lambda df: df[(df["transaction_type"] == "OFFLINE")],
                            ~~^^^^^^^^^^^^^^^^^^^^
TypeError: 'int' object is not subscriptable
")

In [149]:
df_new = df.copy()
bounds = (0, 454)
start_date, end_date = datetime(2020, 9, 1), datetime(2021, 3, 31)
columns = ["nb_transactions"]
by = ["merch_postal_code"]
scale=10.0
column="transaction_type"
entry="OFFLINE"
hotspot_predictor=(
    make_filter_offline(column,entry)
    >>make_truncate_time(start_date, end_date, time_col)
    >>make_private_sum_by(columns, by, bounds, scale)
)
print(hotspot_predictor.map(1))
output=hotspot_predictor(df_new)
print(output)

30917.40000000001
                        0
merch_postal_code        
7071                45765
55411              183692
70040               31856
70050               27510
70070               43102
...                   ...
9670000            108717
9710000            106582
9750000             84828
9790000            128256
9810000             83009

[303 rows x 1 columns]


In [150]:
df_new = df.copy()
bounds = (0, 454)
start_date, end_date = datetime(2020, 9, 1), datetime(2021, 3, 31)
columns = ["nb_transactions"]
by = ["merch_postal_code","date"]
scale=10.0
column="merch_category"
entry="Drug Stores/Pharmacies"
hotspot_predictor=(
    make_filter_offline(column,entry)
    >>make_truncate_time(start_date, end_date, time_col)
    >>make_private_sum_by(columns, by, bounds, scale)
)
print(hotspot_predictor.map(1))
output=hotspot_predictor(df_new)
print(output)

30917.40000000001
                                 0
merch_postal_code date            
7071              2020-09-01  2014
                  2020-09-08  1946
                  2020-09-15  1923
                  2020-09-22  1940
                  2020-09-29  1896
...                            ...
9810000           2021-03-02  1836
                  2021-03-09  1851
                  2021-03-16  1899
                  2021-03-23  1907
                  2021-03-30  1923

[9238 rows x 1 columns]


In [151]:
df_new = df.copy()
bounds = (0, 454)
start_date, end_date = datetime(2020, 9, 1), datetime(2021, 3, 31)
columns = ["nb_transactions"]
by = ["merch_postal_code","date"]
scale=10.0
column="merch_category"
entry="Airlines"
hotspot_predictor=(
    make_filter_offline(column,entry)
    >>make_truncate_time(start_date, end_date, time_col)
    >>make_private_sum_by(columns, by, bounds, scale)
)
print(hotspot_predictor.map(1))
output=hotspot_predictor(df_new)
print(output)

30917.40000000001
                               0
merch_postal_code date          
7071              2020-09-01   9
                  2020-09-08  25
                  2020-09-15  18
                  2020-09-22   0
                  2020-09-29   6
...                           ..
9810000           2021-03-02  58
                  2021-03-09  59
                  2021-03-16  71
                  2021-03-23  53
                  2021-03-30  65

[6291 rows x 1 columns]


In [None]:
def make_select_grouping_cols(candidates, min_bin_size, d_in, d_out):
    """Create a measurement that selects a set of grouping columns from `candidates`."""
    def make(s):
        return (
            make_grouping_cols_score(candidates, min_bin_size)
            >> dp.m.then_report_noisy_max_gumbel(s, optimize="max")
            >> (lambda idx: candidates[idx])
        )

    return dp.binary_search_chain(make, d_in, d_out, T=float)

In [28]:
def hotspot_detection(df, start_date, end_date, time_col):
    """DP function that detects hotspots."""
    df_new = df.copy()
    bounds = (0, 454)

    def make_filter_offline():
        """filters offline entries"""
        return dp.t.make_user_transformation(
        input_domain=dataframe_domain(),
        input_metric=dp.symmetric_distance(),
        output_domain=dp.vector_domain(dp.atom_domain(T=T)),
        output_metric=dp.symmetric_distance(),
        function=lambda df: df[(df["transaction_type"] == "OFFLINE")],
        stability_map=lambda d_in: d_in,
    )




    make_filter_offline>>make_truncate_time(start_date, end_date, time_col)>>make_private_sum_by()
    