# ETL of GAL data for ITpS analyses

In [4]:
## Libs 
import pandas as pd
import hashlib
from epiweeks import Week, Year


In [12]:
# pwd

'/Users/bragatte/Documents/GitHub/arbo'

In [13]:
## Load the two inputs used throughout the notebook
### ITpS: combined arbovirus table (Excel workbook)
combined_arbo = pd.read_excel(io='EDA/data/combined/combined_arbo.xlsx')
### GAL: semicolon-delimited export
### NOTE(review): the path below carries no file extension although the file
### is referred to as gal_am_12_12_23.txt — confirm the name on disk.
gal_am_12_12_23 = pd.read_csv(
    filepath_or_buffer='EDA/data/cglab/gal_am_12_12_23',
    sep=';',
)

In [7]:
## Inspect the ITpS table: first rows, followed by the list of columns
print(combined_arbo.head(), combined_arbo.columns, sep='\n')

     lab_id                         sample_id             test_id  \
0  EINSTEIN  00007d978f767d3e5e78646ff0214a97  000002022356010666   
1  EINSTEIN  0000fc550dc879819f58d15bff2dce85  000002023280006519   
2  EINSTEIN  000288fa9a10907547856f9169a35a93  000002023325009566   
3  EINSTEIN  0004593b7fd083f1e9fd5d3f489300ad  000002022157012714   
4  EINSTEIN  0006cc6607014fd4b1f36342ce593b67  000002023029005148   

      test_kit gender   age   location date_testing      state patient_id  \
0    igm_serum      M  44.0  SAO PAULO   2022-12-22  SAO PAULO        NaN   
1    igg_serum      F   5.0  SAO PAULO   2023-10-07  SAO PAULO        NaN   
2    igg_serum      F  49.0  SAO PAULO   2023-11-21  SAO PAULO        NaN   
3    igm_serum      F  15.0  SAO PAULO   2022-06-06  SAO PAULO        NaN   
4  ns1_antigen      M  19.0  SAO PAULO   2023-01-29  SAO PAULO        NaN   

   ... month country   region macroregion macroregion_code state_code  \
0  ...    12  BRASIL  SUDESTE       RRAS6        

In [14]:
## Inspect the GAL export: first rows, followed by the list of columns
print(gal_am_12_12_23.head(), gal_am_12_12_23.columns, sep='\n')

   Unnamed: 0    requisicao                          setor  \
0           1  230049002412  GDI - Diagnóstico Imunológico   
1           2  230049002413  GDI - Diagnóstico Imunológico   
2           3  230049002414  GDI - Diagnóstico Imunológico   
3           4  230049002415  GDI - Diagnóstico Imunológico   
4           5  230049002416  GDI - Diagnóstico Imunológico   

              bancada mun_residencia uf_residencia       requisitante  \
0  Dengue - Sorologia         MANAUS            AM  UPA CAMPOS SALLES   
1  Dengue - Sorologia         MANAUS            AM  UPA CAMPOS SALLES   
2  Dengue - Sorologia         MANAUS            AM  UPA CAMPOS SALLES   
3  Dengue - Sorologia         MANAUS            AM  UPA CAMPOS SALLES   
4  Dengue - Sorologia         MANAUS            AM  UPA CAMPOS SALLES   

  mun_requisitante                             exame             metodo  ...  \
0           MANAUS  Dengue, Detecção de Antígeno NS1  Enzimaimunoensaio  ...   
1           MANAUS  Dengue, 

## MAP columns and variables

In [15]:
## Create a new DataFrame with the same columns as combined_arbo.
## The frame is built on the GAL index so it starts with one (empty) row per
## GAL record. Starting from a frame with zero rows would make the scalar
## assignment to 'lab_id' fill nothing, and the rows added by the later Series
## assignments would leave 'lab_id' as NaN in every row.
combined_structure_columns = [
    'lab_id', 'sample_id', 'test_id', 'test_kit', 'gender', 'age',
    'location', 'date_testing', 'state', 'patient_id', 'file_name',
    'denv_test_result', 'zikv_test_result', 'chikv_test_result',
    'yfv_test_result', 'mayv_test_result', 'orov_test_result',
    'wnv_test_result', 'qty_original_lines', 'created_at', 'updated_at',
    'age_group', 'epiweek', 'month', 'country', 'region', 'macroregion',
    'macroregion_code', 'state_code', 'state_ibge_code',
    'location_ibge_code', 'lat', 'long'
]
new_df = pd.DataFrame(index=gal_am_12_12_23.index, columns=combined_structure_columns)

## Populate columns according to instructions
new_df['lab_id'] = 'CGLAB'  # Fixed value, broadcast to every row
new_df['test_id'] = gal_am_12_12_23['cod_amostra']
## NOTE(review): only the NS1 antigen exam name is translated; any other exam
## name is copied through unchanged — confirm whether more mappings are needed.
new_df['test_kit'] = gal_am_12_12_23['exame'].replace({'Dengue, Detecção de Antígeno NS1': 'ns1_antigen'})
new_df['location'] = gal_am_12_12_23['mun_residencia']
new_df['state_code'] = gal_am_12_12_23['uf_residencia']
## The explicit format makes pd.to_datetime raise on values that do not match it
new_df['date_testing'] = pd.to_datetime(gal_am_12_12_23['dt_cadastro'], format='%d/%m/%Y %H:%M:%S')

## GAL result text -> result code; anything not listed here becomes 'NT'
_DENV_RESULT_LABELS = {'Reagente': 'Pos', 'Não Reagente': 'Neg'}


def _denv_result_label(raw_result):
    """
    Translate a GAL 'resultado' value into a dengue result code.

    :param raw_result: Raw cell value from the 'resultado' column.
    :return: 'Pos' for 'Reagente', 'Neg' for 'Não Reagente' (surrounding
        whitespace ignored), 'NT' for anything else, including missing or
        non-string values.
    """
    # Missing cells arrive as float NaN, which has no .strip(); they fall
    # into the same catch-all as any other unrecognised value.
    if not isinstance(raw_result, str):
        return 'NT'
    return _DENV_RESULT_LABELS.get(raw_result.strip(), 'NT')


new_df['denv_test_result'] = gal_am_12_12_23['resultado'].apply(_denv_result_label)

def date_to_epiweek(date):
    """
    Convert a date to an epidemiological week.

    The value is parsed with ``pd.to_datetime`` and handed to
    ``Week.fromdate`` using the "cdc" system.

    :param date: Date to be converted (anything ``pd.to_datetime`` accepts).
    :return: A string representing the epidemiological week, or None when the
        date is missing (None/NaN/NaT) or cannot be converted.
    """
    try:
        parsed = pd.to_datetime(date)
        # Missing dates have no week; answer explicitly instead of relying on
        # an exception raised further down.
        if pd.isna(parsed):
            return None
        return str(Week.fromdate(parsed, system="cdc"))
    except Exception:
        # Best-effort conversion: unparseable input yields None. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate so a long .apply() can still be interrupted.
        return None

# Derive the epidemiological week of each row from its date_testing value
epiweek_labels = new_df['date_testing'].apply(date_to_epiweek)
new_df['epiweek'] = epiweek_labels

## Build the sample_id digest from the identifying columns of gal_am_12_12_23
def generate_hash(row):
    """
    Compute the SHA-1 hex digest identifying one GAL record.

    The 'requisicao' and 'cod_amostra' values are converted to text and
    concatenated, in that order and with no separator, before hashing.

    :param row: Mapping-like row exposing 'requisicao' and 'cod_amostra'.
    :return: 40-character hexadecimal SHA-1 digest.
    """
    # NOTE(review): the sample_id values displayed for combined_arbo are
    # 32 hex characters long, whereas SHA-1 yields 40 — confirm the format.
    key_parts = (row['requisicao'], row['cod_amostra'])
    digest_source = ''.join(str(part) for part in key_parts)
    return hashlib.sha1(digest_source.encode('utf-8')).hexdigest()

## Compute one digest per GAL row and store it as the sample_id column
sample_ids = gal_am_12_12_23.apply(generate_hash, axis=1)
new_df['sample_id'] = sample_ids

## Write the transformed DataFrame as a tab-separated file, without the index
tsv_file_path = 'EDA/results/combined_arbo_gal.tsv'
new_df.to_csv(
    tsv_file_path,
    sep='\t',
    index=False,
)