## ETL: GAL data into the combined format

In [None]:
## Libs 
import pandas as pd
import hashlib

In [None]:
# NOTE(review): a bare `pwd` is not plain Python; presumably this relies on IPython automagic to show the working directory the relative paths resolve against — confirm
pwd

In [None]:
## Load both input datasets into DataFrames
### ITpS input (Excel workbook)
combined_arbo = pd.read_excel(io='combined_arbo.xlsx')
### GAL input (semicolon-delimited text)
### NOTE(review): the path below has no extension although the file is referred to as gal_am_12_12_23.txt — confirm the name on disk
gal_am_12_12_23 = pd.read_csv(filepath_or_buffer='gal_am_12_12_23', sep=';')

In [None]:
## Inspect the ITpS table: first rows, followed by the column index
print(combined_arbo.head(), combined_arbo.columns, sep='\n')

In [None]:
## Inspect the GAL table: first rows, followed by the column index
print(gal_am_12_12_23.head(), gal_am_12_12_23.columns, sep='\n')

## Map columns

In [None]:
## Create a new DataFrame with the same columns as combined_arbo.
## Columns that are not populated below stay empty.
combined_structure_columns = [
    'lab_id', 'sample_id', 'test_id', 'test_kit', 'gender', 'age',
    'location', 'date_testing', 'state', 'patient_id', 'file_name',
    'denv_test_result', 'zikv_test_result', 'chikv_test_result',
    'yfv_test_result', 'mayv_test_result', 'orov_test_result',
    'wnv_test_result', 'qty_original_lines', 'created_at', 'updated_at',
    'age_group', 'epiweek', 'month', 'country', 'region', 'macroregion',
    'macroregion_code', 'state_code', 'state_ibge_code',
    'location_ibge_code', 'lat', 'long'
]
## Give the frame the GAL row index up front. On a frame with no rows the
## scalar assignment to 'lab_id' only creates an empty column, and the first
## Series assignment afterwards reindexes the frame and fills 'lab_id' with
## NaN instead of 'CGLAB'.
new_df = pd.DataFrame(
    index=gal_am_12_12_23.index,
    columns=combined_structure_columns,
)

## Populate the columns that have a GAL counterpart
new_df['lab_id'] = 'CGLAB'  # fixed value, broadcast to every GAL row
new_df['test_id'] = gal_am_12_12_23['cod_amostra']
## Only the NS1 antigen exam name is translated; any other value passes through unchanged
new_df['test_kit'] = gal_am_12_12_23['exame'].replace({'Dengue, Detecção de Antígeno NS1': 'NS1_antigen'})
new_df['location'] = gal_am_12_12_23['mun_residencia']
new_df['state_code'] = gal_am_12_12_23['uf_residencia']
## The explicit format makes the parse raise on values that do not match it
new_df['date_testing'] = pd.to_datetime(gal_am_12_12_23['dt_cadastro'], format='%d/%m/%Y %H:%M:%S')
## 'Reagente' / 'Não Reagente' become 'Pos' / 'Neg'; other values pass through unchanged
new_df['denv_test_result'] = gal_am_12_12_23['resultado'].replace({'Reagente': 'Pos', 'Não Reagente': 'Neg'})

## Generate hash for sample_id based on specific columns of gal_am_12_12_23
def generate_hash(row):
    """Return the SHA-256 hex digest identifying one GAL row.

    The digest is computed over the string forms of ``row['requisicao']``
    and ``row['cod_amostra']`` joined in that order with no separator.

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) that supports
            lookup by the keys ``'requisicao'`` and ``'cod_amostra'``.

    Returns:
        The 64-character hexadecimal digest as a string.
    """
    key_parts = (row['requisicao'], row['cod_amostra'])
    digest_source = ''.join(str(part) for part in key_parts)
    return hashlib.sha256(digest_source.encode()).hexdigest()

## Fill sample_id with one hash per GAL row (generate_hash is applied row-wise)
new_df['sample_id'] = gal_am_12_12_23.apply(generate_hash, axis='columns')

## Write the result as a tab-separated file, leaving the index out
tsv_file_path = 'combined_arbo_gal.tsv'
new_df.to_csv(path_or_buf=tsv_file_path, index=False, sep='\t')
