# Example use of the Python-DP (PyDP) library
## Safety, Privacy and Legal Aspects

In [1]:
# Install PyDP
!pip3 install python-dp

Collecting python-dp
  Downloading python_dp-1.1.1-cp37-cp37m-manylinux1_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 955 kB/s eta 0:00:01
[?25hInstalling collected packages: python-dp
Successfully installed python-dp-1.1.1


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pydp as dp
from pydp.algorithms.laplacian import BoundedMean
from pydp.algorithms.laplacian import BoundedSum

# ... other libraries if you need

#### Download the file *casos_tecnica_ccaa.csv* from https://cnecovid.isciii.es/covid19/ (available under *Documentación y datos*). It contains the number of daily COVID-19 cases in Spain by autonomous community (CCAA). Keep only the cases for the year 2021.

In [13]:
# Load the daily case counts per region and preview the first rows
df_covid = pd.read_csv(filepath_or_buffer='casos_tecnica_ccaa.csv')
df_covid.head(n=5)

Unnamed: 0,ccaa_iso,fecha,num_casos,num_casos_prueba_pcr,num_casos_prueba_test_ac,num_casos_prueba_ag,num_casos_prueba_elisa,num_casos_prueba_desconocida
0,AN,2020-01-01,0,0,0,0,0,0
1,AR,2020-01-01,0,0,0,0,0,0
2,AS,2020-01-01,0,0,0,0,0,0
3,CB,2020-01-01,0,0,0,0,0,0
4,CE,2020-01-01,0,0,0,0,0,0


In [14]:
# Split the "fecha" strings (YYYY-MM-DD, as shown in the preview above) once,
# with the vectorised string accessor, instead of re-splitting every date three
# times through Python-level lambdas. The parts stay strings, exactly as before.
date_parts = df_covid["fecha"].str.split("-", expand=True)
df_covid['year'] = date_parts[0]
df_covid['month'] = date_parts[1]
df_covid['day'] = date_parts[2]

In [16]:
# Preview the frame with the new year/month/day columns
df_covid.head(n=5)

Unnamed: 0,ccaa_iso,fecha,num_casos,num_casos_prueba_pcr,num_casos_prueba_test_ac,num_casos_prueba_ag,num_casos_prueba_elisa,num_casos_prueba_desconocida,year,month,day
0,AN,2020-01-01,0,0,0,0,0,0,2020,1,1
1,AR,2020-01-01,0,0,0,0,0,0,2020,1,1
2,AS,2020-01-01,0,0,0,0,0,0,2020,1,1
3,CB,2020-01-01,0,0,0,0,0,0,2020,1,1
4,CE,2020-01-01,0,0,0,0,0,0,2020,1,1


In [17]:
# Keep only the rows whose year component is the string "2021"
df_covid_2021 = df_covid.loc[df_covid["year"].eq("2021")]

In [18]:
# Preview the 2021 subset
df_covid_2021.head(n=5)

Unnamed: 0,ccaa_iso,fecha,num_casos,num_casos_prueba_pcr,num_casos_prueba_test_ac,num_casos_prueba_ag,num_casos_prueba_elisa,num_casos_prueba_desconocida,year,month,day
6954,AN,2021-01-01,1618,916,0,702,0,0,2021,1,1
6955,AR,2021-01-01,281,263,0,18,0,0,2021,1,1
6956,AS,2021-01-01,143,134,0,9,0,0,2021,1,1
6957,CB,2021-01-01,180,175,0,5,0,0,2021,1,1
6958,CE,2021-01-01,25,5,0,20,0,0,2021,1,1


####  Obtain the average and the total number of daily COVID-19 cases for each month of 2021 (considering the daily cases of all CCAA). You can group the data by month however you like, e.g. with pd.DatetimeIndex. You can use loops to fill the vectors total_cases_month and mean_cases_month.

In [26]:
# Monthly totals of the numeric case-count columns.
# numeric_only=True makes the exclusion of the text columns (ccaa_iso, fecha,
# year, day) explicit; without it the result depends on the pandas version,
# since newer releases no longer drop non-numeric columns silently.
total_cases_month = df_covid_2021.groupby("month").sum(numeric_only=True)
# Row i (0-based) holds the totals for month i+1 when every month is present
total_cases_month

Unnamed: 0_level_0,num_casos,num_casos_prueba_pcr,num_casos_prueba_test_ac,num_casos_prueba_ag,num_casos_prueba_elisa,num_casos_prueba_desconocida
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,891145,536949,69,271826,325,81976
2,260427,169312,4,72428,475,18208
3,167229,103298,0,52863,428,10640
4,239515,143682,2,81047,444,14340
5,150044,88569,2,51318,422,9733
6,144253,80046,1,54443,230,9533
7,703517,380346,5,289028,9,34129
8,330290,203319,3,113783,7,13178
9,77639,49600,2,23574,3,4460
10,50456,31524,0,16528,0,2404


In [28]:
# Monthly means of the numeric case-count columns.
# numeric_only=True keeps the text columns (ccaa_iso, fecha, year, day) out of
# the aggregation; newer pandas releases raise a TypeError when asked to
# average string columns instead of dropping them silently.
mean_cases_month  = df_covid_2021.groupby("month").mean(numeric_only=True)
# Row i (0-based) holds the averages for month i+1 when every month is present
mean_cases_month

Unnamed: 0_level_0,num_casos,num_casos_prueba_pcr,num_casos_prueba_test_ac,num_casos_prueba_ag,num_casos_prueba_elisa,num_casos_prueba_desconocida
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1512.979626,911.628183,0.117148,461.504244,0.551783,139.178268
2,489.524436,318.255639,0.007519,136.142857,0.892857,34.225564
3,283.920204,175.378608,0.0,89.750424,0.726655,18.064516
4,420.201754,252.073684,0.003509,142.187719,0.778947,25.157895
5,254.743633,150.371817,0.003396,87.127334,0.716469,16.524618
6,253.075439,140.431579,0.001754,95.514035,0.403509,16.724561
7,1194.426146,645.748727,0.008489,490.709677,0.01528,57.943973
8,560.764007,345.193548,0.005093,193.179966,0.011885,22.373514
9,136.208772,87.017544,0.003509,41.357895,0.005263,7.824561
10,85.663837,53.521222,0.0,28.061121,0.0,4.081494


####  Show the results obtained per month:

In [32]:
# Walk the months actually present in the aggregates instead of assuming
# exactly 12 rows: a hard-coded range(12) raises IndexError when the data
# covers fewer months, and labels rows by position rather than by month.
monthly_summary = zip(
    total_cases_month.index,
    total_cases_month["num_casos"],
    mean_cases_month["num_casos"],
)
for month, total, mean in monthly_summary:
    # int() turns a zero-padded month label such as "01" into 1 for display
    print(f'Month: {int(month)}. Total cases: {total} --- Mean: {mean} \n')

Month: 1. Total cases: 891145 --- Mean: 1512.9796264855688 

Month: 2. Total cases: 260427 --- Mean: 489.52443609022555 

Month: 3. Total cases: 167229 --- Mean: 283.9202037351443 

Month: 4. Total cases: 239515 --- Mean: 420.20175438596493 

Month: 5. Total cases: 150044 --- Mean: 254.74363327674024 

Month: 6. Total cases: 144253 --- Mean: 253.07543859649124 

Month: 7. Total cases: 703517 --- Mean: 1194.4261460101868 

Month: 8. Total cases: 330290 --- Mean: 560.7640067911715 

Month: 9. Total cases: 77639 --- Mean: 136.20877192982456 

Month: 10. Total cases: 50456 --- Mean: 85.66383701188455 

Month: 11. Total cases: 182077 --- Mean: 319.43333333333334 

Month: 12. Total cases: 1591276 --- Mean: 2701.6570458404076 



#### Obtain the average number and total number of cases by applying differential privacy. Specifically, use *BoundedSum* and *BoundedMean* from the Python PyDP library. See https://github.com/OpenMined/PyDP.

In [33]:
# Renumber the rows from 0; the previous row labels are kept in an "index" column
df_covid_2021 = df_covid_2021.reset_index(drop=False)

In [47]:
# The month values come from splitting the date string, so cast them to
# integers to allow comparison against integer month numbers
df_covid_2021 = df_covid_2021.astype({"month": int})

In [51]:
# Display the 2021 frame after the index reset and the integer month cast
df_covid_2021

Unnamed: 0,index,ccaa_iso,fecha,num_casos,num_casos_prueba_pcr,num_casos_prueba_test_ac,num_casos_prueba_ag,num_casos_prueba_elisa,num_casos_prueba_desconocida,year,month,day
0,6954,AN,2021-01-01,1618,916,0,702,0,0,2021,1,01
1,6955,AR,2021-01-01,281,263,0,18,0,0,2021,1,01
2,6956,AS,2021-01-01,143,134,0,9,0,0,2021,1,01
3,6957,CB,2021-01-01,180,175,0,5,0,0,2021,1,01
4,6958,CE,2021-01-01,25,5,0,20,0,0,2021,1,01
...,...,...,...,...,...,...,...,...,...,...,...,...
6930,13884,ML,2021-12-31,83,29,0,54,0,0,2021,12,31
6931,13885,NC,2021-12-31,4302,385,0,3916,0,1,2021,12,31
6932,13886,PV,2021-12-31,9446,3180,0,6266,0,0,2021,12,31
6933,13887,RI,2021-12-31,1047,50,0,825,0,172,2021,12,31


In [54]:
# Privacy budget (epsilon) handed to every PyDP mechanism created below
privacy_budget = 0.8

total_cases_month_dp = []
mean_cases_month_dp  = []

# Iterate over the months actually present (groupby yields them in sorted
# order) instead of a hard-coded range(1, 13): a missing month would otherwise
# produce an empty selection and make max() raise ValueError.
for _, month_cases in df_covid_2021.groupby("month")["num_casos"]:
    # Plain Python ints are the documented input for quick_result
    cases_month = month_cases.tolist()

    # NOTE(review): the upper bound is derived from the data being protected,
    # which itself reveals information about that data; a fixed, publicly
    # justified bound would be the stricter choice -- confirm with the author.
    upper_bound = max(cases_month)

    # Keyword arguments make the bounds unambiguous. The positional form
    # (privacy_budget, 0, 1, max) is version-sensitive: where the second
    # positional parameter is delta, it sets lower_bound=1 and clamps days
    # with zero cases up to 1. Daily counts can be 0, so the lower bound is 0.
    dp_sum = BoundedSum(epsilon=privacy_budget, lower_bound=0, upper_bound=upper_bound)
    dp_mean = BoundedMean(epsilon=privacy_budget, lower_bound=0, upper_bound=upper_bound)

    total_cases_month_dp.append(dp_sum.quick_result(cases_month))
    mean_cases_month_dp.append(dp_mean.quick_result(cases_month))

#### Show the results obtained for each month with and without applying DP:

In [59]:
# Pair each month's exact aggregates with its differentially private
# counterparts. zip stops at the shortest input, so this no longer assumes
# exactly 12 months (range(12) raises IndexError when fewer are present).
monthly_comparison = zip(
    total_cases_month.index,
    total_cases_month["num_casos"],
    mean_cases_month["num_casos"],
    total_cases_month_dp,
    mean_cases_month_dp,
)
for month, total, mean, total_dp, mean_dp in monthly_comparison:
    # int() turns a zero-padded month label such as "01" into 1 for display
    print(f'Month: {int(month)}. Total cases: {total} --- Total cases (DP): {total_dp}')
    print(f'Month: {int(month)}. Mean cases: {mean} --- Mean cases (DP): {mean_dp} \n')

Month: 1. Total cases: 891145 --- Total cases (DP): 875354
Month: 1. Mean cases: 1512.9796264855688 --- Mean cases (DP): 1520.7873576618858 

Month: 2. Total cases: 260427 --- Total cases (DP): 251977
Month: 2. Mean cases: 489.52443609022555 --- Mean cases (DP): 494.5013169863339 

Month: 3. Total cases: 167229 --- Total cases (DP): 171661
Month: 3. Mean cases: 283.9202037351443 --- Mean cases (DP): 279.4222742031492 

Month: 4. Total cases: 239515 --- Total cases (DP): 229164
Month: 4. Mean cases: 420.20175438596493 --- Mean cases (DP): 427.19961799265025 

Month: 5. Total cases: 150044 --- Total cases (DP): 137563
Month: 5. Mean cases: 254.74363327674024 --- Mean cases (DP): 278.5440399509713 

Month: 6. Total cases: 144253 --- Total cases (DP): 138334
Month: 6. Mean cases: 253.07543859649124 --- Mean cases (DP): 241.8399043915697 

Month: 7. Total cases: 703517 --- Total cases (DP): 690944
Month: 7. Mean cases: 1194.4261460101868 --- Mean cases (DP): 1145.0549704314553 

Month: 8. T

#### Compare the results obtained with and without DP in two bar plots (i.e., *Total cases per month vs. Total cases per month with DP* and *Mean cases per month vs. Mean cases per month with DP*).

In [67]:
import matplotlib

TypeError: bar() missing 1 required positional argument: 'height'