# Data Integration 2022

## Import libraries

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

## Load Database from Discharges

In [3]:
# The 2022 discharge records are read from a pipe-delimited text file.
file = '../../Data/Databases/Discharges/EGRESOS_2022.txt'
egresos2022 = pd.read_csv(file, sep='|')
# Show the first rows as a quick check of the load.
egresos2022.head()

  egresos2022 = pd.read_table(file, delimiter = '|')


Unnamed: 0,ID,CLUES,EGRESO,INGRE,DIAS_ESTA,TUHPSIQ,SERVHC,SERVHP,CVEEDAD,EDAD,...,PARTOS,ABORTOS,HAYPROD,TIPATEN,GESTAC,PRODUCTO,TIPNACI,PLANFAM,CESAREAS,TOTAL_PRODUCTOS
0,1,TLSSA001376,2022-06-22,2022-06-22,0,-1,-1,-1,5,33,...,,,8,8,88,8,8,88,,-1
1,2,MSSSA003003,2022-12-26,2022-12-26,0,-1,-1,-1,5,45,...,,,8,8,88,8,8,88,,-1
2,3,MSSSA003003,2022-12-01,2022-12-01,0,-1,-1,-1,5,42,...,0.0,0.0,8,8,88,8,8,88,0.0,-1
3,4,MSSSA002344,2022-12-06,2022-12-05,1,-1,-1,-1,5,21,...,0.0,0.0,2,8,25,8,8,88,1.0,-1
4,5,TLSSA002356,2022-11-19,2022-11-19,0,-1,-1,-1,5,20,...,,,8,8,88,8,8,88,,-1


## Keep only records from Aguascalientes

In [4]:
# Row filter 1: keep records where 'ENTIDAD' = Aguascalientes (1)
in_ags = egresos2022['ENTIDAD'] == 1
# Row filter 2: keep records for hospitals in Ags
# (excluding 'Hospital de la Mujer' and 'Hospital Psiquiátrico')
ags = ['ASSSA000404', 'ASSSA000030', 'ASSSA000614', 'ASSSA000655', 'ASSSA000025', 'ASSSA001331']
in_hospital_list = egresos2022['CLUES'].isin(ags)
# Apply both filters in one pass. The explicit .copy() detaches the result
# from the full table, so the column assignments made in later cells act on
# an independent frame instead of a slice of the original
# (avoids SettingWithCopyWarning / ambiguous view-vs-copy writes).
egresos2022 = egresos2022.loc[in_ags & in_hospital_list].copy()

In [5]:
# Cast the original record ID column to a 64-bit integer
egresos2022 = egresos2022.astype({'ID': 'int64'})

## Reassign discharge IDs

In [6]:
# 2021 ended with ID 298042, so the 2022 numbering starts at 298043
first_id_2022 = 298043
last_id_2022_exclusive = first_id_2022 + len(egresos2022)
# One consecutive ID per row, in the frame's current row order
egresos2022['ID2'] = np.arange(first_id_2022, last_id_2022_exclusive)

In [7]:
# Lookup table pairing each original ID with its reassigned ID2
# (merged into the affections and procedures tables in later cells)
IDs_2022 = egresos2022.loc[:, ['ID', 'ID2']]

## Replace CVEEDAD and PROCED keys

In [8]:
# Update keys to match previous years (2010-2019).
# NOTE: source and target codes overlap (e.g. 2 is both replaced and produced),
# so re-running this cell on already-recoded data would remap values again.
cveedad_recode = {2:0, 3:1, 4:2, 5:3}
proced_recode = {4:6,5:4}
egresos2022['CVEEDAD'] = egresos2022['CVEEDAD'].replace(cveedad_recode)
egresos2022['PROCED'] = egresos2022['PROCED'].replace(proced_recode)

## Load Database from Affections

In [9]:
# The 2022 affections records are read from a pipe-delimited text file.
file = '../../Data/Databases/Affections/AFECCIONES_2022.txt'
afecciones2022 = pd.read_csv(file, sep='|')
# Show the first rows as a quick check of the load.
afecciones2022.head()

Unnamed: 0,ID,AFEC,NUMAFEC
0,1,R69X,1
1,2,I10X,1
2,3,I10X,1
3,3,O11X,2
4,3,Q612,3


## Associate affections with the new ID column

In [10]:
# Inner join on the original ID: only affections whose ID appears in
# IDs_2022 are kept, and each kept row gains the matching ID2.
afec2022 = pd.merge(afecciones2022, IDs_2022, on='ID', how='inner')
afec2022.head()

Unnamed: 0,ID,AFEC,NUMAFEC,ID2
0,1028842,U071,1,298043
1,1028843,R69X,1,298044
2,1028844,R69X,1,298045
3,1028845,R69X,1,298046
4,1028846,C910,1,298047


## Load Database from Procedures

In [11]:
# The 2022 procedures records are read from a pipe-delimited text file.
file = '../../Data/Databases/Procedures/PROCEDIMIENTOS_2022.txt'
procedimientos2022 = pd.read_csv(file, sep='|')
# Show the first rows as a quick check of the load.
procedimientos2022.head()

Unnamed: 0,ID,NUMPROMED,PROMED,TIPO,ANEST,QUIROF,tiempo_quirofano
0,1,1,3995,D,6,2,
1,2,1,3995,D,6,2,
2,3,1,3995,D,6,2,
3,5,1,6374,Q,4,1,00:30
4,6,1,3995,D,6,2,


In [12]:
# Split 'tiempo_quirofano' on ':' into two new columns:
# 'QH' (the part before the colon, hours) and 'QM' (the part after, minutes).
quirofano = procedimientos2022['tiempo_quirofano'].str.split(':', expand=True)
quirofano.columns = ['QH', 'QM']
# Replace the original text column with the two split columns; rows are
# matched on the shared index and QH/QM end up as the last columns.
procedimientos2022 = (
    procedimientos2022
    .drop(columns=['tiempo_quirofano'])
    .join(quirofano)
)
procedimientos2022.head()

Unnamed: 0,ID,NUMPROMED,PROMED,TIPO,ANEST,QUIROF,QH,QM
0,1,1,3995,D,6,2,,
1,2,1,3995,D,6,2,,
2,3,1,3995,D,6,2,,
3,5,1,6374,Q,4,1,0.0,30.0
4,6,1,3995,D,6,2,,


## Associate procedures with the new ID column

In [13]:
# Inner join on the original ID: only procedures whose ID appears in
# IDs_2022 are kept, and each kept row gains the matching ID2.
proc2022 = pd.merge(procedimientos2022, IDs_2022, on='ID', how='inner')
proc2022.head()

Unnamed: 0,ID,NUMPROMED,PROMED,TIPO,ANEST,QUIROF,QH,QM,ID2
0,1028842,1,9199,D,6,2,,,298043
1,1028842,2,8741,D,6,2,,,298043
2,1028842,3,9394,T,6,2,,,298043
3,1028842,4,9032,D,6,2,,,298043
4,1028843,1,9925,T,6,2,,,298044


## Keep variables of interest

In [14]:
# Columns retained in the discharges table; ID2 replaces the original ID.
v_keep = [
    'ID2', 'CLUES', 'EGRESO', 'INGRE', 'DIAS_ESTA', 'CVEEDAD',
    'EDAD', 'NACIOEN', 'SEXO', 'PESO', 'TALLA', 'DERHAB',
    'MUNIC', 'LOC', 'TIPSERV', 'PROCED', 'MOTEGRE', 'DIAG_INI',
    'AFECPRIN', 'VEZ', 'INFEC', 'CAUSAEXT',
]
# Select those columns, in the order listed above.
egresos2022 = egresos2022.loc[:, v_keep]

In [15]:
# Remove the original ID column from Affections; ID2 remains as the key.
afec2022 = afec2022.drop(columns=['ID'])

In [16]:
# Remove the original ID column from Procedures; ID2 remains as the key.
proc2022 = proc2022.drop(columns=['ID'])

## Export data to CSV

In [17]:
# Discharges: write the egresos2022 table to CSV.
filepath = Path('../../Data/Databases/Data Integration/egresos2022.csv')
# Create the output folder (and any missing parent folders) if needed.
filepath.parent.mkdir(parents=True, exist_ok=True)
# index=False leaves the DataFrame index out of the file.
egresos2022.to_csv(filepath, index=False)

In [18]:
# Affections: write the afec2022 table to CSV.
filepath = Path('../../Data/Databases/Data Integration/afecciones2022.csv')
output_dir = filepath.parent
# Create the output folder (and any missing parent folders) if needed.
output_dir.mkdir(parents=True, exist_ok=True)
# index=False leaves the DataFrame index out of the file.
afec2022.to_csv(path_or_buf=filepath, index=False)

In [19]:
# Procedures: write the proc2022 table to CSV.
filepath = Path('../../Data/Databases/Data Integration/procedimientos2022.csv')
output_dir = filepath.parent
# Create the output folder (and any missing parent folders) if needed.
output_dir.mkdir(parents=True, exist_ok=True)
# index=False leaves the DataFrame index out of the file.
proc2022.to_csv(path_or_buf=filepath, index=False)