# Table data

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
import os, csv

## 1. Generate `all.csv`

The `all.csv` dataset contains all the tables from the `full_1634313571272.xlsx` excel file joined together. Some of tables in this excel file have the same name, but different meanings. To distinguish them, every column of the `all.csv` has a number next to it, represeting the number of its corresponding table.

In [6]:
def get_all():
    all_tables = pd.DataFrame({})
    i = 1
    for file in sorted(os.listdir("../table-data/data/CSVs")):
        if file != "data-merged.csv":
            table = pd.read_csv("../table-data/data/CSVs/"+file)
            for col in table.columns:
                all_tables[f"{col}-{i}"] = table[col]
        i += 1
    return all_tables

all_tables = get_all()
# all_tables.to_csv("all.csv", index = False)
all_tables

Unnamed: 0,idEstudo-1,idProcessoLocal-1,numRegistoGeral-1,numRegistoHospital-1,iniciaisNome-1,dataRegInicial-1,dataNascimento-1,altura-1,peso-1,profissao-1,...,ret-32,atCal-32,evo-32,ligA-32,ligB-32,ligC-32,ligD-32,ligE-32,hab-32,total-32
0,1.10,1876470,1,1,CHSP,2017-10-11 14:16:08.014000,1938-01-30 00:00:00,167,74,Reformado - Guarda prisional,...,3,5,3,2,1,3,1,1,1,30
1,1.20,2415798,2,2,NLMMF,2017-10-16 17:38:28.873000,1959-04-27 22:59:59.997000,185,86,Professor universitário,...,3,5,3,2,1,3,1,1,1,30
2,6.10,17044593,3,1,MFSMF,2017-11-06 11:14:46.824000,1938-05-25 22:59:59.997000,150,45,,...,0,0,0,0,0,0,0,0,0,0
3,6.20,17044640,4,2,NRM,2017-11-06 14:43:07.634000,1932-10-13 22:59:59.997000,160,60,reformada,...,,,,,,,,,,
4,6.30,17044934,5,3,JVDN,2017-11-08 11:54:00.602000,1946-11-04 00:00:00,165,80,psicólogo,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,6.16,5032101,836,16,FMFL,2021-02-09 15:10:33.390000,1952-06-13 22:59:59.997000,160,62,médico,...,0,0,0,0,0,0,0,0,0,0
836,6.17,18027449,837,17,MJBPFG,2021-02-09 15:28:57.858000,1938-11-22 00:00:00,161,63,reformada,...,,,,,,,,,,
837,6.18,18027709,838,18,BJE,2021-02-09 16:04:47.076000,1968-04-15 22:59:59.997000,170,95,biologo marinho,...,0,0,0,0,0,0,0,0,0,0
838,6.19,18027953,839,19,LAPHS,2021-02-10 12:25:16,1934-03-09 00:00:00,178,90,reformado,...,,,,,,,,,,


## 2. Filter `all.csv`

We are only interested in some of the variables from `all.csv`. Since the target variable is the 3 month rankin score, all the variables collected after this moment are discarded. Additionally, variables that directly identify the patient (like its name initials) are discarded. 

The following is the list of variables we want to keep for each of the tables whose content was recorded before the 3 month mark.

In [12]:
tables = {}
tables["01-GERAL"]                                 = ["numRegistoGeral-1", "idProcessoLocal-1", "dataNascimento-1", 
                                                      "altura-1", "peso-1", "genero-1", "etnia-1", "viveSozinho-1", 
                                                      "escola-1"]
tables["02-ANTECEDENTES-PESSOAIS"]                 = ["hiperTen-2", "diabetes-2", "dislip-2", "avcIsqPre-2", 
                                                      "avcHemPre-2", "aitPre-2", "apnSono-2", "insCard-2", 
                                                      "cardIsq-2", "doeArt-2", "infecao-2", "histFam-2", 
                                                      "estMenP-2", "tabag-2", "alcool-2", "rankin-2"]
tables["04-CARACTERIZACAO-AVC"]                    = ["dataAVC-4", "avcAcordar-4", "instAVCpre-4", "defMaxIn-4", 
                                                      "melSintIn-4", "cefaleia-4", "criseEpi-4", "hemoAd-4", 
                                                      "hemat-4", "inrAd-4", "gliceAd-4"]
tables["05-CARACTERIZACAO-AVC - NIHSS"]            = ["afasia-5", "loc-5", "com-5", "ling-5", "paresia-5", "face-5", 
                                                      "memSE-5", "memSD-5", "memIE-5", "memID-5", "altCons-5", 
                                                      "altMovOc-5", "altVis-5", "atax-5", "hipos-5", "disartia-5", 
                                                      "inatencao-5", "totalNIHSS-5", "disf-5", "incUri-5", 
                                                      "preArtSis-5", "preArtDia-5", "ritmoCardECG-5"]
tables["07-IMAGEM INICIAL - TCCE"]                 = ["aspects-7", "ouTerrIsq-7", "ouTerrIsqL-7", "sinCorda-7", 
                                                      "lacAnt-7", "lacAntL-7", "leucoa-7", "enfAnt-7", "enfAntL-7"]
tables["08-IMAGEM INICIAL - AngioTCCE"]            = ["colaCTA1-8", "colaCTA2a-8", "colaCTA2b-8"]
tables["09-IMAGEM INICIAL - AngioTCCE - Estenose"] = ["ocEst-9", "localiz-9", "lado-9"]
tables["10-IMAGEM INICIAL - AngioTCCE - Oclusao"]  = ["ocEst-10", "localiz-10", "lado-10"]
tables["11-TRATAMENTO AGUDO AVC"]                  = ["rtPA-11", "tromb-11", "recaTIC-11"]
tables["15-AVALIACAO CLINICA 24H - NIHSS"]         = ["afasia-15", "loc-15", "com-15", "ling-15", "paresia-15", 
                                                      "face-15", "memSE-15", "memSD-15", "memIE-15", "memID-15", 
                                                      "altCons-15", "altMovOc-15", "altVis-15", "atax-15", 
                                                      "hipos-15", "disartia-15", "inatencao-15", "totalNIHSS-15", 
                                                      "disf-15", "incUri-15", "preArtSis-15", "preArtDia-15", 
                                                      "ritmoCardECG-15"]
tables["18-ALTA"]                                  = ["destino-18", "rankin-18"]
tables["19-ALTA - NIHSS"]                          = ["afasia-19", "loc-19", "com-19", "ling-19", "paresia-19", 
                                                      "face-19", "memSE-19", "memSD-19", "memIE-19", "memID-19", 
                                                      "altCons-19", "altMovOc-19", "altVis-19", "atax-19", 
                                                      "hipos-19", "disartia-19", "inatencao-19", "totalNIHSS-19", 
                                                      "disf-19", "incUri-19", "preArtSis-19", "preArtDia-19", 
                                                      "ritmoCardECG-19"]
tables["20-ALTA - MOCA"]                           = ["capVis-20", "nomea-20", "atencao-20", "ling-20", "abst-20", 
                                                      "evoDif-20", "orien-20", "total-20"]
tables["21-ALTA - HADS"]                           = ["valorP1-21", "valorP2-21", "valorP3-21", "valorP4-21", 
                                                      "valorP5-21", "valorP6-21", "valorP7-21", "valorP8-21", 
                                                      "valorP9-21", "valorP10-21", "valorP11-21", "valorP12-21", 
                                                      "valorP13-21", "valorP14-21", "totalA-21", "totalD-21"]
tables["22-ALTA - MINI MENTAL STATE"]              = ["ori1-22", "ori2-22", "ret-22", "atCal-22", "evo-22", 
                                                      "ligA-22", "ligB-22", "ligC-22", "ligD-22", "ligE-22", 
                                                      "hab-22", "total-22"]
tables["23-AVALIACAO 3 MESES"]                     = ["rankin-23"]

Now we can filter the `all.csv` using these variables list:

In [13]:
variables       = [v for t in tables for v in tables[t]]
tables_filtered = all_tables[variables]
print(f"Using {len(variables)} variables")
tables_filtered

Using 165 variables


Unnamed: 0,numRegistoGeral-1,idProcessoLocal-1,dataNascimento-1,altura-1,peso-1,genero-1,etnia-1,viveSozinho-1,escola-1,hiperTen-2,...,atCal-22,evo-22,ligA-22,ligB-22,ligC-22,ligD-22,ligE-22,hab-22,total-22,rankin-23
0,1,1876470,1938-01-30 00:00:00,167,74,2,1,1,2,1,...,5,2,2,1,0,1,0,0,24,2
1,2,2415798,1959-04-27 22:59:59.997000,185,86,2,1,0,4,1,...,5,3,2,1,3,1,1,1,30,0
2,3,17044593,1938-05-25 22:59:59.997000,150,45,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,6
3,4,17044640,1932-10-13 22:59:59.997000,160,60,1,1,0,2,1,...,0,0,0,0,0,0,0,0,0,
4,5,17044934,1946-11-04 00:00:00,165,80,2,1,0,4,1,...,0,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,836,5032101,1952-06-13 22:59:59.997000,160,62,2,1,1,4,0,...,0,0,0,0,0,0,0,0,0,0
836,837,18027449,1938-11-22 00:00:00,161,63,1,1,0,2,0,...,0,0,0,0,0,0,0,0,0,0
837,838,18027709,1968-04-15 22:59:59.997000,170,95,2,1,0,4,1,...,0,0,0,0,0,0,0,0,0,
838,839,18027953,1934-03-09 00:00:00,178,90,2,1,1,3,1,...,0,0,0,0,0,0,0,0,0,2


## 3. Add `NCCT` and `CTA` columns

Add these two columns that when they are 1, its corresponding patient has the corresponding CT scan and when they are 0, they don't.

In [22]:
nccts = [file[:-4] for file in os.listdir("../../data/gravo/NCCT") if file.endswith(".nii")]
ctas  = [file[:-4] for file in os.listdir("../../data/gravo/CTA") if file.endswith(".nii")]
has_ncct = []
has_cta  = []

for _, row in tables_filtered.iterrows():
    patient_id = row["idProcessoLocal-1"]
    has_ncct.append( int(patient_id in nccts) )
    has_cta. append( int(patient_id in ctas) )
    
tables_filtered["NCCT"] = has_ncct
tables_filtered["CTA"]  = has_cta
tables_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,numRegistoGeral-1,idProcessoLocal-1,dataNascimento-1,altura-1,peso-1,genero-1,etnia-1,viveSozinho-1,escola-1,hiperTen-2,...,ligA-22,ligB-22,ligC-22,ligD-22,ligE-22,hab-22,total-22,rankin-23,NCCT,CTA
0,1,1876470,1938-01-30 00:00:00,167,74,2,1,1,2,1,...,2,1,0,1,0,0,24,2,1,1
1,2,2415798,1959-04-27 22:59:59.997000,185,86,2,1,0,4,1,...,2,1,3,1,1,1,30,0,1,1
2,3,17044593,1938-05-25 22:59:59.997000,150,45,1,1,0,1,0,...,0,0,0,0,0,0,0,6,0,0
3,4,17044640,1932-10-13 22:59:59.997000,160,60,1,1,0,2,1,...,0,0,0,0,0,0,0,,0,0
4,5,17044934,1946-11-04 00:00:00,165,80,2,1,0,4,1,...,0,0,0,0,0,0,0,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,836,5032101,1952-06-13 22:59:59.997000,160,62,2,1,1,4,0,...,0,0,0,0,0,0,0,0,0,0
836,837,18027449,1938-11-22 00:00:00,161,63,1,1,0,2,0,...,0,0,0,0,0,0,0,0,0,0
837,838,18027709,1968-04-15 22:59:59.997000,170,95,2,1,0,4,1,...,0,0,0,0,0,0,0,,0,0
838,839,18027953,1934-03-09 00:00:00,178,90,2,1,1,3,1,...,0,0,0,0,0,0,0,2,0,0
