"Este trabalho utilizou dados disponibilizados pelo repositório COVID-19 Data Sharing/BR, disponível em: “ https://repositoriodatasharingfapesp.uspdigital.usp.br/".



# Pre-processing

In [198]:
import numpy as np
import pandas as pd

In [199]:
# (file path, column delimiter) for each raw exam export to load.
data_sources = [
    ("dados/raw/added/einstein/dataset_exames.csv", '|'),
    ("dados/raw/added/hsl/hsl_lab_result_1.csv", '|'),
    ("dados/raw/added/fleury/dataset_exames.csv", '|'),
]

# The delimiter is passed by keyword: pandas 2.0+ accepts only the path
# positionally, so `pd.read_csv(path, '|', ...)` raises TypeError there.
dfs_raw = [
    pd.read_csv(df_file, sep=separator, encoding="latin-1")
    for df_file, separator in data_sources
]

In [200]:
# Print the (rows, columns) of every loaded source to see which one is largest.
for df in dfs_raw:
    source_shape = df.shape
    print(source_shape)

(1853695, 8)
(371357, 9)
(2496591, 8)


In [201]:
# Load the Fleury exam and patient tables (pipe-delimited, latin-1 encoded).
# The delimiter is passed by keyword: pandas 2.0+ accepts only the path
# positionally, so `pd.read_csv(path, '|', ...)` raises TypeError there.
df_exame_raw = pd.read_csv("dados/raw/added/fleury/dataset_exames.csv", sep='|', encoding='latin-1')
df_paciente_raw = pd.read_csv("dados/raw/added/fleury/dataset_pacientes.csv", sep='|', encoding='latin-1')

In [202]:
# Report both table sizes, drop fully duplicated rows in place, then report again.
fleury_frames = [df_exame_raw, df_paciente_raw]

print("Shape before removing duplicates:")
for df in fleury_frames:
    print(df.shape)

for df in fleury_frames:
    df.drop_duplicates(inplace=True)

print("Shape after removing duplicates:")
for df in fleury_frames:
    print(df.shape)

Shape before removing duplicates:
(2496591, 8)
(129596, 7)
Shape after removing duplicates:
(2494613, 8)
(129596, 7)


In [203]:
def map_values(x):
    """Normalize a qualitative exam result to ``'0'`` or ``'1'``.

    The value is upper-cased before comparison, so matching is
    case-insensitive.

    Args:
        x: Raw result value. Expected to be a string; any non-string value
            (e.g. NaN/None for a missing result) is returned unchanged
            instead of raising ``AttributeError`` on ``.upper()``.

    Returns:
        ``'0'`` for the known negative labels, ``'1'`` for the known positive
        labels, otherwise the upper-cased input string (or the untouched
        input when it is not a string).
    """
    if not isinstance(x, str):
        return x

    x = x.upper()
    if x in {"NÃO DETECTADO", "NÃO DETECTADO (NEGATIVO)", "NÃO REAGENTE", "NEGATIVA"}:
        return '0'
    if x in {"DETECTADO", "DETECTADO (POSITIVO)", "REAGENTE", "POSITIVA"}:
        return '1'
    return x

# Overwrite the result column with its normalized form (see map_values).
df_exame_raw["DE_RESULTADO"] = df_exame_raw["DE_RESULTADO"].apply(map_values)

In [204]:
def is_real(x):
    """Tell whether ``x`` can be converted by ``float()``.

    Args:
        x: Any value (typically a result string using '.' as decimal mark).

    Returns:
        ``True`` when ``float(x)`` succeeds, ``False`` when it fails with a
        conversion error. Note that strings such as ``"nan"`` or ``"inf"``
        are accepted by ``float()`` and therefore yield ``True``.
    """
    try:
        float(x)
    # Only conversion failures mean "not a number"; a bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit during a long `.apply`.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Swap decimal commas for dots so float() can parse the values, then keep
# only the rows whose result is parseable as a number.
df_exame_raw["DE_RESULTADO"] = df_exame_raw["DE_RESULTADO"].str.replace(",", ".")
is_numeric_result = df_exame_raw['DE_RESULTADO'].apply(is_real)
df_exame_raw = df_exame_raw[is_numeric_result]

In [205]:
# print(' ------ Before Removal ------')
# print("---- Paciente ----")
# for col in df_paciente_raw.columns: print(col)
# print("---- Exame ----")
# for col in df_exame_raw.columns: print(col)
    
# Remove four columns from the patient table and DE_ORIGEM from the exam table.
for unused_col in ('CD_PAIS', 'CD_UF', 'CD_MUNICIPIO', 'CD_CEP'):
    del df_paciente_raw[unused_col]
del df_exame_raw['DE_ORIGEM']

# print(' ------ After Removal ------')
# print("---- Paciente ----")
# for col in df_paciente_raw.columns: print(col)
# print("---- Exame ----")
# for col in df_exame_raw.columns: print(col)

In [206]:
# Keep only (exam, analyte) groups that have more rows than
# (1 - LIMIAR) times the number of rows in the patient table.
LIMIAR = 0.98
amt_people = df_paciente_raw.shape[0]
min_group_rows = (1 - LIMIAR) * amt_people

exam_groups = df_exame_raw.groupby(['DE_EXAME', 'DE_ANALITO'])
df_exame_aux = exam_groups.filter(lambda group: len(group) > min_group_rows)

print(df_exame_aux.shape)
for col in df_exame_aux.columns:
    print(col)

(2060853, 7)
ID_PACIENTE
DT_COLETA
DE_EXAME
DE_ANALITO
DE_RESULTADO
CD_UNIDADE
DE_VALOR_REFERENCIA


In [207]:
# Auxiliary CSV: number of rows per (exam, analyte, result value),
# sorted by ascending count.
exam_count = (
    df_exame_aux
    .groupby(['DE_EXAME', 'DE_ANALITO', 'DE_RESULTADO'])
    .size()
    .reset_index(name="COUNT")
    .sort_values(by="COUNT")
)
count_columns = ['DE_EXAME', 'DE_ANALITO', 'DE_RESULTADO', 'COUNT']
# reset_index() renumbers the rows in sorted order; the old index column is
# discarded by the column selection.
exam_count.reset_index()[count_columns].to_csv('teste.csv')

In [1]:
# Inner-join patient rows with the filtered exam rows on the patient id and
# persist the joined table.
# NOTE(review): the recorded output for this cell is a NameError and its
# execution count is 1 -- it appears to have been run on a fresh kernel
# before the cells above; re-run the notebook top to bottom.
df = pd.merge(df_paciente_raw, df_exame_aux, on='ID_PACIENTE', how='inner')
df.to_csv('training_data')
print(df.shape)
for col in df.columns:
    print(col)

NameError: name 'df_paciente_raw' is not defined

# Neural Network

In [209]:
from sklearn.model_selection import train_test_split

def to_features(data):
    """Concatenate the rows of each sample into one array.

    Args:
        data: Iterable of samples; each sample is an iterable of array-like
            rows accepted by ``np.concatenate``.

    Returns:
        A list with one concatenated array per sample, in input order.
    """
    features = []
    for sample in data:
        features.append(np.concatenate(list(sample)))
    return features

# Seed NumPy's global generator so the permutations drawn below are repeatable.
np.random.seed(56789)
def shuffle(X, y):
    """Shuffle ``X`` and ``y`` with one shared random permutation.

    Both inputs are indexed with the same integer permutation array, so they
    must support that kind of indexing (e.g. NumPy arrays); pairs
    ``(X[i], y[i])`` stay aligned.

    Args:
        X: Inputs; its length defines the permutation size.
        y: Outputs, indexed with the same permutation.

    Returns:
        Tuple ``(X_shuffled, y_shuffled)``.
    """
    order = np.random.permutation(len(X))
    shuffled_X = X[order]
    shuffled_y = y[order]
    return shuffled_X, shuffled_y

In [211]:
#
# Shapes the data to be fed into the NN
#

# One independent copy of the merged table per exam type.
PCR_df, IGG_df, IGM_df = (df.copy() for _ in range(3))

# Exam type -> list of (DE_EXAME, DE_ANALITO)-style name pairs.
# NOTE(review): "IGM" maps to the bare string "a" while "PCR" maps to a list
# of tuples -- this looks like an unfinished placeholder; confirm.
reference = {
    "PCR": [
        ("NOVO CORONAVÍRUS 2019 (SARS-CoV-2), DETECÇÃO POR PCR", "Covid 19, Detecção por PCR"),
    ],
    "IGM": "a",
}