## Workflow for PRECISE data in restructured format

This workflow is for the PRECISE ANC surveillance data in its final (restructured) format, which will be used for future data sharing.
The combined ANC surveillance data for all 3 sites (The Gambia, Kenya and Mozambique) is stored in separate .dta files, one per table (e.g. T02_general_info.dta, T03_baseline_maternal.dta, T04_environment.dta).

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)
import os
import pandas as pd
import numpy as np
from datetime import timedelta as td
from DataAnalyst.precise import Precise, Tracer, extract_villages

In [3]:
# Folder holding the restructured .dta tables; per-country CSV exports are
# written beneath it as well.
root_dir = r"G:\My Drive\PRECISE_surveillance_data\Precise_structured_data_032023"

In [4]:
# Columns taken from the general-info table (T02).
cols1 = [
    # profile info
    'f2a_participant_id',
    'f2_visit_date',
    'f2_ga_at_visit',
    'redcap_event_name',
    # spatial access indicators
    'f2_location_from',
    'f2_location_from_name',
    'f2_location_from_other',
    'f2_mode_of_transport_1_4',
    'f2_mode_of_transport_other_1_4',
    'f2_travel_duration',
    'f2_woman_addr',
    'country',
]

# Columns taken from the baseline maternal table (T03): sociogeo indicators.
cols2 = [
    'f2a_participant_id',
    'f3_highest_school_level',
    'f3_religion',
    'f3_marital_status',
    'f3_live_with_partner',
    'f3_occupation',
    'f3_duration_of_living_together',
    'f3_year_of_birth',
]

# Columns taken from the environment table (T04): sociogeo indicators.
# NOTE: 3 of these variable names were truncated during the conversion from
# ODK to Stata, so they must be spelled exactly as they appear here.
cols3 = [
    'f2a_participant_id',
    'f3_neighbor_help_pregnancy_probl',
    'f3_form_of_help_received',
    'f3_community_help_pregnancy_prob',
    'f3_participation_in_community_gr',
    'f3_decision_maker_money',
    'f3_decision_maker_pregnancy',
    'f3_woman_has_money_for_transport',
    'f3_toilet_facility',
]
# Load the three source tables, each indexed by participant id.
# The general-info table is read in full, with convert_categoricals=False so
# Stata value labels are not applied to its columns.
gen_info = pd.read_stata(
    os.path.join(root_dir, "T02_general_info.dta"),
    index_col='f2a_participant_id',
    convert_categoricals=False,
)
# Baseline maternal table, restricted to the cols2 selection.
baseline = pd.read_stata(
    os.path.join(root_dir, 'T03_baseline_maternal.dta'),
    index_col='f2a_participant_id',
    columns=cols2,
)
# Environment table, restricted to the cols3 selection.
enviro = pd.read_stata(
    os.path.join(root_dir, 'T04_environment.dta'),
    index_col='f2a_participant_id',
    columns=cols3,
)
# [(gen_info['redcap_event_name']=='postpartum_mother_arm_1')

In [9]:
# Country name (as compared against the 'country' column and used for the
# output folder name) -> two-letter code embedded in the site-specific
# general-info column names.
countries = {
    'The Gambia': 'gm',
    'Kenya': 'ke',
    'Mozambique': 'mz',
}
def split_visits(country):
    """Export one CSV per visit type for a single country.

    Re-reads the general-info table restricted to the shared ``cols1``
    columns plus the site-specific columns found in ``gen_info``, joins it
    to ``baseline`` and ``enviro`` on ``f2a_participant_id``, and writes the
    rows belonging to ``country`` to ``<root_dir>/<country>/<event>.csv``,
    one file per REDCap event name.

    Parameters
    ----------
    country : str
        A key of ``countries``. It is also compared against the ``country``
        column of the general-info table and used as the output folder name.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``country`` is not a key of ``countries``.
    pandas.errors.MergeError
        If a participant id is duplicated in ``baseline`` or ``enviro``
        (enforced by ``validate='m:1'``).
    """
    # Site-specific columns: Mozambique's are matched on 'f2_mz_n', the other
    # sites on 'f2_<code>_v'. `filter(like=...)` is a substring match.
    stem = 'f2_{}_n' if country == 'Mozambique' else 'f2_{}_v'
    site_cols = gen_info.filter(like=stem.format(countries[country])).columns.to_list()
    cols = cols1 + site_cols

    df = pd.read_stata(
        os.path.join(root_dir, "T02_general_info.dta"),
        index_col='f2a_participant_id',
        columns=cols,
    )
    df = (df.merge(baseline, on='f2a_participant_id', validate='m:1')
            .merge(enviro, on='f2a_participant_id', validate='m:1'))

    # to_csv does not create missing folders, so make sure the per-country
    # output directory exists before writing into it.
    out_dir = os.path.join(root_dir, country)
    os.makedirs(out_dir, exist_ok=True)

    # The country mask does not depend on the visit, so compute it once.
    in_country = df['country'] == country
    for v in ['precise_visit_1_arm_1', 'precise_visit_2_arm_1', 'birth_mother_arm_1', 'postpartum_mother_arm_1']:
        df[in_country & (df['redcap_event_name'] == v)].to_csv(os.path.join(out_dir, v + ".csv"))

In [10]:
# Write the per-visit CSV files for every configured country.
for country in countries:
    split_visits(country)