<div style="background-color: #f8d7da; color: #721c24; padding: 20px; margin-bottom: 20px; border: 1px solid #f5c6cb; border-radius: 4px;">
    <strong style="font-size: 24px;">Warning</strong> <br> 
    <strong>You have launched the 000.3-Prolonged_contacts.ipynb notebook.</strong>
    <ul style="color: #721c24;">
        <li>This notebook runs with a <strong>python 3.6+</strong> kernel!</li>
        <li>It loads data from previous preprocessing notebook (<strong>000.2-VaccineData_Preprocessing.ipynb</strong>)</li>
        <li><strong>It outputs processed data overwriting existing files</strong>: please use this with caution.</li>
    </ul>
</div>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Input location and file names (outputs of the previous preprocessing notebook).
path_to_data = "../data_in/"
cs_fname = "clean_contacts_proc.csv"
ri_fname = "clean_respondent_info_proc.csv"

cs = pd.read_csv(path_to_data+cs_fname)
ri = pd.read_csv(path_to_data+ri_fname)

# Respondent occupation for all respondents.
# Children are mapped as students if they either go to kindergarten or are of
# school age (6-15); everybody else is mapped through the survey 'occupation'
# code (1/2/3 -> work, 5/6/7 -> school, any other code -> missing).
is_young_student = (ri['kindergarten_05'] == 1) | ((ri['respondent_age'] > 5) & (ri['respondent_age'] < 16))
occupation_to_setting = {1: 'work', 2: 'work', 3: 'work', 5: 'school', 6: 'school', 7: 'school'}
ri['respondent_occupation'] = (
    ri['occupation']
    .map(occupation_to_setting)
    .astype(object)
    .mask(is_young_student, 'school')
)

# Drop the occupation of respondents who are not attending their own setting
# (presence_* == 2 is the value the sampling code treats as "attending").
# Each presence flag is only checked against the matching occupation: testing
# both flags on every respondent would wipe the occupation of every student who
# is not also present at work, and of every worker not also present at school.
not_at_school = (ri['respondent_occupation'] == 'school') & (ri['presence_school'] != 2)
not_at_work = (ri['respondent_occupation'] == 'work') & (ri['presence_work'] != 2)
ri.loc[not_at_school | not_at_work, 'respondent_occupation'] = np.nan

# Main location type for non-students / non-workers: the non-household,
# non-school, non-work location where the respondent reported most contacts.
other_contacts = cs[~cs['location'].isin(['conviventi','school','work'])]
contacts_per_location = other_contacts.groupby(['caseid','location'],as_index=False)['contact_number'].count()
non_occupated_map = (
    contacts_per_location
    # stable sort so that ties are always resolved the same way across runs
    .sort_values('contact_number', ascending=False, kind='mergesort')
    .drop_duplicates(['caseid'])
    .set_index('caseid')['location']
    .to_dict()
)
missing_occupation = ri['respondent_occupation'].isna()
ri.loc[missing_occupation,'respondent_occupation'] = ri.loc[missing_occupation,'caseid'].map(non_occupated_map)

# Attach the respondent occupation to every contact row (inner join on caseid).
cs = cs.merge(ri[['caseid','respondent_occupation']],on='caseid')

### Completing CONTACTS DATA:
#### Completion is performed either:
     1- neglecting "prolonged_contacts" (i.e. "c_sharedindoor" column in "ri")
     2- spreading "prolonged_contacts" in either school/work/other_setting_with_most_contacts (excluding contacts with cohabitants)
     3- spreading "prolonged_contacts" in all other settings proportionally to the number of contacts reported in each one of those (excluding contacts with cohabitants)
#### Spread is performed either by:
     1- including all prolonged_contacts
     2- subtracting from the prolonged_contacts number the number of contacts already reported as "indoor_contacts"
#### Assignment of "age" (and potentially other contact variables) is performed via direct sampling of those params from indoor contacts of the respondent within a specific setting
#### If no contact is present in a specific setting then contact properties are sampled from all indoor contacts of same-age respondents + same vax-status + same presence @work/school

In [3]:
"""
    contact pd.DataFrame resulting from prolonged contact explosion consist of three additional columns:
        'is_personal' : 0/1 based on whether the contact information is sampled from collective or personal contacts
        'is_soft'     : 0 if sampling new contacts, 1 if sampled contact could be already reported as indoor contact in the diary
                        (simple count matching/subtraction based on the number of indoor contacts reported within the diary contacts)
        'is_age_match': 0/1 if sampling is based on exact age matching of respondent age 
                        (whether with her/himself age (trivial if at least one contact is present) or with all other respondents)
"""
def sample_contacts(s_cs, respondent, cols=None, random_state=None):
    """
        Sample the prolonged ("shared indoor") contacts of a single respondent.

        s_cs: DataFrame containing setting specific contacts for all respondents (cohabitants are excluded)
        respondent: single respondent info (pd.Series); must provide 'caseid', 'respondent_age',
                    'c_sharedindoor', 'total_contacts_indoor', 'presence_school', 'presence_work'
                    and 'd_vacc' ('age_group' is needed when no exact-age contact exists)
        cols: list of columns to sample as contact properties (by default every column of s_cs
              that is not also a respondent field, i.e. an entire contact row)
        random_state: optional seed / random state forwarded to DataFrame.sample
                      (None keeps using the numpy global random state)

        return: a pd.DataFrame with 'c_sharedindoor' sampled rows plus the 'is_personal',
                'is_soft', 'is_age_match' flags and the respondent fields shared with s_cs,
                or None when the respondent has no prolonged contacts to sample
        raises: ValueError when prolonged contacts are requested but no indoor contact
                is available to sample from
    """
    # keep the column order of s_cs so that the output layout is deterministic
    if cols is None: cols = [c for c in s_cs.columns if c not in respondent.index]
    r_cols = [c for c in s_cs.columns if c in respondent.index]

    # number of prolonged contacts to generate (missing or non-positive -> nothing to do)
    raw_prolonged = respondent.loc['c_sharedindoor']
    soft_n = 0 if pd.isna(raw_prolonged) else int(raw_prolonged)
    if soft_n <= 0:
        return None

    # indoor contacts already reported in the diary (missing is treated as 0):
    # at most that many sampled contacts may duplicate a reported one ("soft"),
    # the remainder are necessarily new contacts ("hard")
    raw_indoor = respondent.loc['total_contacts_indoor']
    reported_indoor = 0 if pd.isna(raw_indoor) else max(0, int(raw_indoor))
    n_soft = min(soft_n, reported_indoor)
    hard_n = soft_n - n_soft

    # candidate indoor contacts compatible with the respondent presence and vax status
    keep = s_cs['indoor'] == 1
    if respondent['presence_school'] != 2: keep = keep & (s_cs['location'] != 'school')
    if respondent['presence_work'] != 2:   keep = keep & (s_cs['location'] != 'work')
    if respondent['d_vacc'] == 2:          keep = keep & (s_cs['d_vacc'] != 1)
    if respondent['d_vacc'] == 1:          keep = keep & (s_cs['d_vacc'] == 1)
    indoor_pool = s_cs[keep]

    # personal contacts are preferred; collective contacts are conditioned on exact
    # respondent age first and on the respondent age group as a fallback
    on_personal = indoor_pool[indoor_pool['caseid'] == respondent.loc['caseid']]
    same_age = indoor_pool[indoor_pool['respondent_age'] == respondent.loc['respondent_age']]
    is_age_match = not same_age.empty
    is_personal = not on_personal.empty
    if is_personal:
        pool = on_personal
    elif is_age_match:
        pool = same_age
    else:
        pool = indoor_pool[indoor_pool['age_group'] == respondent.loc['age_group']]

    if pool.empty:
        raise ValueError(
            f"No indoor contact available to sample {soft_n} prolonged contact(s) "
            f"for respondent {respondent.loc['caseid']!r}"
        )

    rsample = pool[cols].sample(soft_n, replace=True, random_state=random_state)
    rsample['is_personal'] = is_personal
    rsample['is_soft'] = [False]*hard_n + [True]*n_soft
    rsample['is_age_match'] = is_age_match
    # respondent-level fields are copied from the respondent, not from the sampled row
    for c in r_cols:
        rsample[c] = respondent[c]
    return rsample
def single_setting_completion(cs,ri):
    """
        Sample prolonged contacts for every respondent from a single setting: only the
        contacts whose location equals the respondent occupation are used as sampling pool.

        cs: contact dataframe (must contain 'respondent_occupation' and 'location')
        ri: respondent information dataframe (one row per respondent)

        return: a pd.DataFrame with the sampled prolonged contacts of all respondents
                (empty, with the three flag columns, when no respondent has any)
    """
    s_cs = cs[cs['respondent_occupation']==cs['location']]
    # explicit iteration: sample_contacts returns a DataFrame or None for each respondent,
    # which is not a result shape DataFrame.apply is meant to carry
    sampled = [sample_contacts(s_cs, respondent) for _, respondent in ri.iterrows()]
    sampled = [contacts for contacts in sampled if contacts is not None]
    if not sampled:
        # nobody reported prolonged contacts: keep the expected columns so that the
        # caller can still cast the flag columns and concatenate
        return pd.DataFrame(columns=list(cs.columns)+['is_personal','is_soft','is_age_match'])
    return pd.concat(sampled)
def proportional_setting_completion(cs,ri):
    """
        Sample prolonged contacts for every respondent from all settings at once: every
        contact outside the household ('conviventi') is used as sampling pool.

        cs: contact dataframe (must contain 'location')
        ri: respondent information dataframe (one row per respondent)

        return: a pd.DataFrame with the sampled prolonged contacts of all respondents
                (empty, with the three flag columns, when no respondent has any)
    """
    s_cs = cs[cs['location']!='conviventi']
    # explicit iteration: sample_contacts returns a DataFrame or None for each respondent,
    # which is not a result shape DataFrame.apply is meant to carry
    sampled = [sample_contacts(s_cs, respondent) for _, respondent in ri.iterrows()]
    sampled = [contacts for contacts in sampled if contacts is not None]
    if not sampled:
        # nobody reported prolonged contacts: keep the expected columns so that the
        # caller can still cast the flag columns and concatenate
        return pd.DataFrame(columns=list(cs.columns)+['is_personal','is_soft','is_age_match'])
    return pd.concat(sampled)
### Sampling prolonged contact based on requested method
def completing_contacts_data(cs,ri,method=None):
    """
        cs: contact dataframe
        ri: respondent information dataframe
        method: how to explode prolonged contacts:
                None                   -> no explosion, a copy of cs is returned
                'single_setting'       -> prolonged contacts assigned on a single setting
                'proportional_setting' -> prolonged contacts assigned on all settings (cohabitant excluded)
        returns: contact dataframe concatenated with prolonged contacts
        raises: ValueError if method is not one of the options above
    """
    if method is None:
        return cs.copy()

    if method == 'single_setting':
        prolonged = single_setting_completion(cs,ri)
    elif method == 'proportional_setting':
        prolonged = proportional_setting_completion(cs,ri)
    else:
        raise ValueError(
            f"Invalid METHOD option! Got {method!r}; "
            "expected None, 'single_setting' or 'proportional_setting'."
        )

    # the flags are stored as floats because the diary rows of cs do not have them
    # (they end up as NaN after the concatenation)
    flag_dtypes = {'is_personal':float, 'is_soft':float, 'is_age_match':float}
    return pd.concat([cs, prolonged.astype(flag_dtypes)], ignore_index=True)


In [4]:
# Prolonged contacts are randomly sampled (DataFrame.sample draws from the numpy
# global random state when no random_state is given): seed it before each run so
# that the files written below are reproducible, whatever the order of the calls.
SEED = 42

np.random.seed(SEED)
cs_single = completing_contacts_data(cs,ri,method='single_setting')
np.random.seed(SEED)
cs_proportional = completing_contacts_data(cs,ri,method='proportional_setting')

# WARNING: these calls overwrite any existing file with the same name.
cs_single.to_csv(path_to_data+"clean_contacts_proc_prolonged_single_setting.csv", index=True)
cs_proportional.to_csv(path_to_data+"clean_contacts_proc_prolonged_proportional_setting.csv", index=True)

In [5]:
# Quick look at the first two rows of the proportionally completed contacts.
cs_proportional.iloc[:2]

Unnamed: 0,caseid,respondent_age,contact_number,indoor,c_physical,c_masks,c_perceived_income,c_education,c_relationship,c_frequency,...,c_outdoor_mask_nocohab,c_outdoor_nomask_nocohab,c_out_mask,c_in_nomask,c_risky,c_outdoor_cohab,respondent_occupation,is_personal,is_soft,is_age_match
0,852123553,51,157,1,1,1.0,4.0,3.0,1.0,14.0,...,,,0.0,1.0,,0,home,,,
1,852123553,51,158,1,1,1.0,4.0,3.0,1.0,14.0,...,,,0.0,1.0,,0,home,,,
