In [1]:
import os
import math
import numpy as np
import pandas as pd

# from utils import *
from IPython.display import JSON

palette = color = ['#223E5C','#5EACA3','#F1D670','#DC6046']
palette_cb = ['#364B9A','#6EA6CD','#C2E4EF','#EAECCC','#FEDA8B','#F67E4B','#A50026']
palette_cb_long = ['#364B9A','#4A7BB7','#6EA6CD','#98CAE1','#C2E4EF','#EAECCC','#FEDA8B','#FDB366','#F67E4B','#DD3D2D','#A50026']

location_map = {
    'cohabitant':'home',
    'home':'home',
    'homeguest':'leisure',
    'work':'work',
    'school':'school',
    'leisure':'leisure',
    'shopping':'other',
    'restaurant':'leisure',
    'transport':'transport',
    'otherindoor':'other',
    'otheroutdoor':'other'
}

In [2]:
remove_isolated = True
non_isolated = [4] # 2: self-surveillance, 4: normal-behaviour

path_to_data = "data_in/"
cs_fname = "clean_contacts_proc_prolonged_proportional_setting.csv"
ri_fname = "clean_respondent_info_proc.csv"

cs = pd.read_csv(path_to_data+cs_fname)
rinfo = pd.read_csv(path_to_data+ri_fname)

## Filtering isolated respondents
cs['location_fine_multi'] = cs['location'].replace('conviventi','cohabitant')
cs['loc_env'] = cs['location_fine_multi'].map(location_map)
if remove_isolated:
    respondent_info = rinfo[rinfo['isolation'].isin(non_isolated)].copy()
    contacts = cs[cs['caseid'].isin(respondent_info['caseid'])].copy()
else:
    contacts = cs.copy()
    respondent_info = rinfo.copy()
    

## Filtering out all respondent not included in the statistical model:
datamod = pd.read_csv(f'{path_to_data}datamod.csv')
datamod_child = pd.read_csv(f'{path_to_data}datamod_children.csv')

datamod = datamod[(datamod['total_contacts_prol']<=100)]
datamod_child = datamod_child[(datamod_child['total_contacts_prol']<=100)&(datamod_child['total_contacts_prol']!=0)]
datamod_all = pd.concat([datamod,datamod_child], ignore_index=True)

# Keep only EPID by wave
respondent_info = datamod_all[['EPID','wave','total_contacts_prol','total_contacts_prol_soft','d_vacc2']].merge(respondent_info, on=['EPID','wave'])

case_to_epid = respondent_info.set_index('caseid')['EPID'].to_dict()
contacts['EPID'] = contacts['caseid'].map(case_to_epid)
contacts['new_setting'] = contacts['loc_env'].map({'home':'home','work':'work','school':'school','leisure':'leisure','other':'other','transport':'transport'})
contacts = contacts[contacts['EPID'].notna()]

# setting new column "respondent_education"
def get_edu(x):
    age = x['respondent_age']
    edu = x['education']
    if age<18:
        edu = '1'
    elif math.isnan(edu):
        edu = 'nan'
    else:
        edu = str(int(x['education']))
    return {'1':'1. Lower Secondary or below', '2':'2. Higher Secondary', '3':'3. Bachelor or above', 'nan':np.nan}[edu]

respondent_info['respondent_education'] = respondent_info.apply(get_edu, axis=1)
respondent_info['respondent_occupation'] = respondent_info['occupation_agg']
# note that occupation for respondents with 5<age<16 is set to "Student" by default (0<age<6 are students only if they go to kindergarten, otherwise "Inactive")
respondent_info.loc[respondent_info[(respondent_info['kindergarten_05']==1)|((respondent_info['respondent_age']<16)&(respondent_info['respondent_age']>5))].index,'occupation_agg'] = 3 
respondent_info.loc[respondent_info[(respondent_info['kindergarten_05']==0)].index,'occupation_agg'] = 5 

In [3]:
# Here we share only direct contacts. Indirect contacts can be reconstructed through "participants_extra" data using the "indoor_prolonged" column
# != soft assumes all indoor contacts were reported also as prolonged contacts (best case => less contacts)
rsH = respondent_info.copy()                        # respondents to keep
csH = contacts[(contacts['is_soft'].isna())&(contacts['caseid'].isin(rsH['caseid']))].copy() # condition on contacts (conditioned on being in rsH)

### Data to use
contactToPlot = csH.sort_values('respondent_age').copy()
respondentToPlot = rsH.sort_values('respondent_age').copy()

In [17]:
### Main Data Objects
participants_common = respondentToPlot[['caseid','EPID','respondent_age','respondent_gender','start_date_module2A','wave','respondent_sample']].copy()
participants_extra = respondentToPlot[['caseid','EPID','presence_work','presence_school','income','respondent_occupation','respondent_education','d_vacc2','region_grouped_IT','hh_size_det','children_number','senior65_cohabitant','senior70_cohabitant','senior75_cohabitant','senior80_cohabitant','c_sharedindoor','wave']].copy()

contacts_common = contactToPlot[['caseid','EPID','c_age','c_gender','loc_env','c_physical','c_frequency']].copy()
contacts_extra = contactToPlot[['caseid','EPID','c_age','c_distance','c_relationship','loc_env','location_fine_multi','indoor','is_personal','is_soft']].copy()

In [18]:
### Participant common preparation
participants_common = participants_common.sort_values(['start_date_module2A','respondent_age']).reset_index(drop=True)

participants_common = participants_common.rename(columns={'caseid':'part_id','EPID':'respondent_id','respondent_age':'part_age_exact','respondent_gender':'part_gender','start_date_module2A':'survey_date'})
participants_common['part_gender'] = participants_common['part_gender'].map({1:'M',2:'F'})

caseid_part_id_map = {caseid:i+1 for i, caseid in enumerate(participants_common['part_id'].unique())}
epid_respondent_id_map = {epid:i+1 for i, epid in enumerate(participants_common['respondent_id'].unique())}

participants_common['part_id'] = participants_common['part_id'].map(caseid_part_id_map)
participants_common['hh_id'] = 'HH' + participants_common['part_id'].astype(str)
participants_common = participants_common[['part_id','hh_id', 'part_age_exact', 'part_gender', 'wave','respondent_id','respondent_sample','survey_date']]

participants_common['part_age_est_min'] = np.nan
participants_common['part_age_est_max'] = np.nan

pc_all = participants_common.copy()

participants_common = participants_common.drop(columns=['wave','respondent_id','respondent_sample','survey_date']).sort_values('part_id').reset_index(drop=True)

participants_common.to_csv('data_out/MixIT/participant_common_MixIT_2022_2023.csv', index=False)

In [6]:
print(participants_common.shape)
participants_common.head(2)

(4979, 6)


Unnamed: 0,part_id,hh_id,part_age_exact,part_gender,part_age_est_min,part_age_est_max
0,1,HH1,31,M,,
1,2,HH2,5,F,,


In [19]:
### Participant extra preparation
participants_extra['part_id'] = participants_extra['caseid'].map(caseid_part_id_map)
participants_extra['panel_id'] = participants_extra['EPID'].map(epid_respondent_id_map)
participants_extra = participants_extra.sort_values('part_id').reset_index(drop=True)
participants_extra['indirect_contacts'] = participants_extra['c_sharedindoor']
for cl, mp in {'presence_work':{1:'Remote work',2:'In-person work',3:'No work'},
               'presence_school':{1:'Remote attendance',2:'In-person attendance',3:'No attendance'},
               # 'income':{1:'Less than 1000€',2:'1000€-1499€',3:'1500€-1999€',4:'2000€-2499€',
               #           5:'2500€-2999€',6:'3000€-3499€',7:'3500€-3999€',8:'4000€-4999€',
               #           9:'5000€ or more',933:np.nan},
               'respondent_occupation':{1:'Employed',2:'Home/family care',3:'Student',4:'Retired',5:'Inactive'},
               'respondent_education':{'1. Lower Secondary or below':'Lower Secondary or below', 
                                       '2. Higher Secondary':'Upper Secondary', 
                                       '3. Bachelor or above':'Bachelor or above'},
               'region_grouped_IT':{1:"North-West",2:"North-East",3:"Centre",4:"South",5:"Islands"}
                               }.items():
    participants_extra[cl] = participants_extra[cl].map(mp)
participants_extra = participants_extra.rename(columns={'income':'household_monthly_net_income','hh_size_det':'hh_size',
                                                        'children_number':'hh_size_0-17','senior65_cohabitant':'hh_size_65+',
                                                        'senior70_cohabitant':'hh_size_70+','senior75_cohabitant':'hh_size_75+',
                                                        'senior80_cohabitant':'hh_size_80+','respondent_occupation':'occupation',
                                                        'respondent_education':'educational_attainment', 'd_vacc2':'primary_vaccination_cycle',
                                                        'region_grouped_IT':'region_nuts1'})
pe_all = participants_extra.copy()

cols = ['part_id','panel_id','presence_work','presence_school','educational_attainment','region_nuts1','occupation','primary_vaccination_cycle','indirect_contacts','wave','EPID','caseid']
participants_extra = participants_extra[cols].drop(columns=['wave','EPID','caseid']).sort_values('part_id').reset_index(drop=True)

participants_extra.to_csv('data_out/MixIT/participant_extra_MixIT_2022_2023.csv', index=False)

In [20]:
print(participants_extra.shape)
participants_extra.head(2)

(4979, 9)


Unnamed: 0,part_id,panel_id,presence_work,presence_school,educational_attainment,region_nuts1,occupation,primary_vaccination_cycle,indirect_contacts
0,1,1,In-person work,,Bachelor or above,North-East,Employed,Yes,1
1,2,2,,In-person attendance,Lower Secondary or below,South,Home/family care,No,25


In [21]:
####################
#### Add household common file (with hh_id==part_id, and hh_size)
cols = ['part_id','household_monthly_net_income','hh_size','hh_size_0-17','hh_size_65+','hh_size_70+','hh_size_75+','hh_size_80+','wave']
household_common = pe_all[cols].copy()
household_common['hh_id'] = 'HH' + household_common['part_id'].astype(str)
household_common = household_common[['hh_id']+cols]

# Adding household_extra
household_extra = household_common[['hh_id','hh_size_0-17','hh_size_65+','hh_size_70+','hh_size_75+','hh_size_80+','household_monthly_net_income','wave']].copy()
household_common = household_common[['hh_id','hh_size','wave']] 
household_common['country'] = 'IT'

household_common = household_common.drop(columns=['wave']).sort_values('hh_id').reset_index(drop=True)
household_extra = household_extra.drop(columns=['wave']).sort_values('hh_id').reset_index(drop=True)
household_extra['household_monthly_net_income'] = household_extra['household_monthly_net_income'].replace(933,np.nan)

household_common.to_csv('data_out/MixIT/household_common_MixIT_2022_2023.csv', index=False)
household_extra.to_csv('data_out/MixIT/household_extra_MixIT_2022_2023.csv', index=False)


####################
#### Add survey day common file (with sd_id)
survey_day_common = pc_all.copy()
survey_day_common['survey_date'] = survey_day_common['survey_date'].apply(lambda x: pd.to_datetime(x.split(' ')[0]))
survey_day_common['dayofweek'] = (survey_day_common['survey_date']+pd.DateOffset(1)).dt.weekday
survey_day_common['day'] = survey_day_common['survey_date'].dt.day
survey_day_common['month'] = survey_day_common['survey_date'].dt.month
survey_day_common['year'] = survey_day_common['survey_date'].dt.year
survey_day_common['sday_id'] = survey_day_common['survey_date'].apply(lambda x: str(x).replace('-','').split(' ')[0])

part_id_sday_id_map = survey_day_common.set_index('part_id')['sday_id'].to_dict()
tmp_part_id_sday_id = participants_common[['part_id']].copy()
tmp_part_id_sday_id['sday_id'] = tmp_part_id_sday_id['part_id'].map(part_id_sday_id_map)
# cols = ['sday_id','survey_date','dayofweek','day','month','year','wave']
survey_day_common = survey_day_common.drop_duplicates().sort_values('sday_id').reset_index(drop=True)
survey_day_common = survey_day_common.merge(tmp_part_id_sday_id)

survey_day_common = survey_day_common.reset_index(drop=True)[['part_id','sday_id','dayofweek','day','month','year','wave']]

survey_day_common.to_csv('data_out/MixIT/sday_MixIT_2022_2023.csv', index=False)

In [22]:
print(household_extra.shape)
household_extra.head(2)

(4979, 7)


Unnamed: 0,hh_id,hh_size_0-17,hh_size_65+,hh_size_70+,hh_size_75+,hh_size_80+,household_monthly_net_income
0,HH1,0,0,0,0,0,4.0
1,HH10,0,0,0,0,0,7.0


In [23]:
print(survey_day_common.shape)
survey_day_common.head(2)

(4979, 7)


Unnamed: 0,part_id,sday_id,dayofweek,day,month,year,wave
0,1,20220316,3,16,3,2022,1
1,201,20220316,3,16,3,2022,1


In [24]:
### Contacts common preparation
contacts_common['part_id'] = contacts_common['caseid'].map(caseid_part_id_map)
contacts_common['sday_id'] = contacts_common['part_id'].map(part_id_sday_id_map)
contacts_common = contacts_common.sort_values(['part_id','c_age']).reset_index(drop=True)
contacts_common['cont_id'] = contacts_common.index + 1
contacts_common['cnt_gender'] = contacts_common['c_gender'].map({1:'M',2:'F'})
contacts_common['cnt_age_exact'] = contacts_common['c_age']
contacts_common['cnt_age_est_min'] = np.nan
contacts_common['cnt_age_est_max'] = np.nan

contacts_common['location_multi'] = contacts_common['loc_env']
contacts_common['frequency_multi'] = contacts_common['c_frequency']
contacts_common['duration_exact'] = np.nan
contacts_common['duration_multi'] = np.nan
contacts_common['phys_contact'] = contacts_common['c_physical'].map({1:True,2:False})

contacts_common['wave'] = contacts_common['part_id'].map(pc_all.set_index('part_id')['wave'].to_dict())
contacts_common = contacts_common[['part_id','cont_id', 'cnt_gender', 'cnt_age_exact', 'cnt_age_est_min', 
                                   'cnt_age_est_max', 'frequency_multi', 'location_multi', 'duration_exact', 'duration_multi',
                                   'phys_contact','wave']]
for col in contacts_common['location_multi'].unique():
    contacts_common[f'cnt_{col}'] = contacts_common['location_multi'] == col

contacts_common = contacts_common.drop(columns=['wave','location_multi']).rename(columns={'cnt_other':'cnt_otherplace'}).reset_index(drop=True)

contacts_common.to_csv('data_out/MixIT/contact_common_MixIT_2022_2023.csv', index=False)

In [25]:
### Contacts extra preparation
contacts_extra['part_id'] = contacts_extra['caseid'].map(caseid_part_id_map)
contacts_extra = contacts_extra.sort_values(['part_id','c_age']).reset_index(drop=True)
contacts_extra['cont_id'] = contacts_extra.index + 1
contacts_extra['sday_id'] = (contacts_extra['part_id'].map(part_id_sday_id_map)).astype(int)

contacts_extra['distance'] = contacts_extra['c_distance'].map({1:'Less than 1 meter',2:'Between 1 and 2 meters',3:'More than 2 meters',0:'Pyshical contact'})
contacts_extra['relationship'] = contacts_extra['c_relationship'].map({1:'Cohabitant', 2:'Non-cohabitant relative/family member', 3:'Friend or acquaintance',4:'Coworker',5:'School/college mate',6:'Other'})
contacts_extra['location_multi'] = contacts_extra['loc_env'].str.title()
contacts_extra['location_fine_multi'] = contacts_extra['location_fine_multi'].str.title()
contacts_extra['setting'] = contacts_extra['indoor'].map({1:'indoor',2:'outdoor'})

contacts_extra['wave'] = contacts_extra['part_id'].map(pc_all.set_index('part_id')['wave'].to_dict())
contacts_extra = contacts_extra[['wave','cont_id','distance','relationship','setting','location_multi','location_fine_multi']]
contacts_extra = contacts_extra.drop(columns=['wave']).reset_index(drop=True)

contacts_extra.to_csv('data_out/MixIT/contact_extra_MixIT_2022_2023.csv', index=False)

In [26]:
print(contacts_common.shape)
contacts_common.head(2)

(23718, 16)


Unnamed: 0,part_id,cont_id,cnt_gender,cnt_age_exact,cnt_age_est_min,cnt_age_est_max,frequency_multi,duration_exact,duration_multi,phys_contact,cnt_home,cnt_work,cnt_otherplace,cnt_leisure,cnt_transport,cnt_school
0,1,1,M,27,,,10.0,,,False,True,False,False,False,False,False
1,1,2,F,31,,,14.0,,,True,True,False,False,False,False,False


In [27]:
print(contacts_extra.shape)
contacts_extra.head(2)

(23718, 6)


Unnamed: 0,cont_id,distance,relationship,setting,location_multi,location_fine_multi
0,1,Less than 1 meter,Coworker,indoor,Home,Home
1,2,Pyshical contact,Cohabitant,indoor,Home,Cohabitant


### generating Dictionary -> excel file
*"dictionary_MixIT_2022_2023.xlsx"*