# Importação das bibliotecas usadas

In [33]:
import pandas as pd

# Leitura dos dados

In [34]:
# Path of the CSV export, relative to the notebook's working directory.
COVID_CSV_PATH = 'covid19-data.csv'

# Load the whole file into a DataFrame using pandas' default parsing options.
covidData = pd.read_csv(COVID_CSV_PATH)

# Exploração dos dados

In [35]:
# Preview the first five rows to check that the file was parsed as expected.
covidData.head(n=5)

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,...,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,...,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,...,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,...,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,...,2,2,1,2,2,2,2,2,3,97


## Colunas do dataset
- sex: 1 for female and 2 for male.
- age: of the patient.
- classification: covid test findings. Values 1-3 mean that the patient was diagnosed with covid in different
  degrees. 4 or higher means that the patient is not a carrier of covid or that the test is inconclusive.
- patient type: type of care the patient received in the unit. 1 for returned home and 2 for hospitalization.
- pneumonia: whether the patient already has air sac inflammation (pneumonia) or not.
- pregnancy: whether the patient is pregnant or not.
- diabetes: whether the patient has diabetes or not.
- copd: Indicates whether the patient has Chronic obstructive pulmonary disease or not.
- asthma: whether the patient has asthma or not.
- inmsupr: whether the patient is immunosuppressed or not.
- hypertension: whether the patient has hypertension or not.
- cardiovascular: whether the patient has a heart or blood vessel related disease or not.
- renal chronic: whether the patient has chronic renal disease or not.
- other disease: whether the patient has other disease or not.
- obesity: whether the patient is obese or not.
- tobacco: whether the patient is a tobacco user.
- usmer: Indicates whether the patient was treated in medical units of the first, second or third level.
- medical unit: type of institution of the National Health System that provided the care.
- intubed: whether the patient was connected to the ventilator.
- icu: Indicates whether the patient had been admitted to an Intensive Care Unit.
- date died: the date of death if the patient died, and 9999-99-99 otherwise.

In [36]:
# Show the column labels of the loaded frame.
covidData.columns

Index(['USMER', 'MEDICAL_UNIT', 'SEX', 'PATIENT_TYPE', 'DATE_DIED', 'INTUBED',
       'PNEUMONIA', 'AGE', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR',
       'HIPERTENSION', 'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY',
       'RENAL_CHRONIC', 'TOBACCO', 'CLASIFFICATION_FINAL', 'ICU'],
      dtype='object')

## Descrição das colunas 

In [37]:
# Summary statistics for the numeric columns (DATE_DIED is non-numeric, so it is left out).
# NOTE(review): maxima of 97-99 on yes/no-style columns look like special codes rather
# than real measurements, so the means/stds below may be distorted — TODO confirm the
# meaning of those codes against the dataset documentation.
covidData.describe()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,1.632194,8.980565,1.499259,1.190765,79.52288,3.346831,41.7941,49.76558,2.186404,2.260569,2.242626,2.298132,2.128989,2.435143,2.26181,2.125176,2.25718,2.214333,5.305653,79.55397
std,0.4822084,3.723278,0.4999997,0.3929041,36.86889,11.91288,16.90739,47.51073,5.424242,5.132258,5.114089,5.462843,5.236397,6.646676,5.19485,5.175445,5.135354,5.323097,1.881165,36.82307
min,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,4.0,1.0,1.0,97.0,2.0,30.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,97.0
50%,2.0,12.0,1.0,1.0,97.0,2.0,40.0,97.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,6.0,97.0
75%,2.0,12.0,2.0,1.0,97.0,2.0,53.0,97.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7.0,97.0
max,2.0,13.0,2.0,2.0,99.0,99.0,121.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,7.0,99.0


# 4 - Mostre a distribuição de frequência de todas as variáveis (frequência e porcentagem)

In [38]:
def _frequency_table(values):
    """Return the absolute and percentage frequency of each distinct value.

    The result is indexed by the distinct values of ``values`` and has the
    columns 'Frequência' (occurrence count) and 'Porcentagem' (share of the
    counted occurrences, in percent).
    """
    counts = values.value_counts()
    shares = counts.div(counts.sum()).mul(100)
    return pd.DataFrame({'Frequência': counts, 'Porcentagem': shares})


# One frequency table per column, keyed by column name.
distributions = {name: _frequency_table(covidData[name]) for name in covidData.columns}

# Place the per-column tables side by side. The rows are the union of every
# value seen in any column, so a cell is NaN where that value does not occur
# in the corresponding column.
distributionsData = pd.concat(distributions, axis=1)
distributionsData

Unnamed: 0_level_0,USMER,USMER,MEDICAL_UNIT,MEDICAL_UNIT,SEX,SEX,PATIENT_TYPE,PATIENT_TYPE,DATE_DIED,DATE_DIED,...,OBESITY,OBESITY,RENAL_CHRONIC,RENAL_CHRONIC,TOBACCO,TOBACCO,CLASIFFICATION_FINAL,CLASIFFICATION_FINAL,ICU,ICU
Unnamed: 0_level_1,Frequência,Porcentagem,Frequência,Porcentagem,Frequência,Porcentagem,Frequência,Porcentagem,Frequência,Porcentagem,...,Frequência,Porcentagem,Frequência,Porcentagem,Frequência,Porcentagem,Frequência,Porcentagem,Frequência,Porcentagem
2,662903.0,63.219417,169.0,0.016117,523511.0,49.925947,200031.0,19.076461,,,...,885727.0,84.469590,1026665.0,97.910498,960979.0,91.646186,1851.0,0.176525,175685.0,16.754643
1,385672.0,36.780583,151.0,0.014400,525064.0,50.074053,848544.0,80.923539,,,...,159816.0,15.241256,18904.0,1.802828,84376.0,8.046730,8601.0,0.820256,16858.0,1.607706
12,,,602995.0,57.506139,,,,,,,...,,,,,,,,,,
4,,,314405.0,29.984026,,,,,,,...,,,,,,,3122.0,0.297737,,
6,,,40584.0,3.870396,,,,,,,...,,,,,,,128133.0,12.219727,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,,,,,,,,,,,...,,,,,,,,,,
116,,,,,,,,,,,...,,,,,,,,,,
111,,,,,,,,,,,...,,,,,,,,,,
121,,,,,,,,,,,...,,,,,,,,,,


# 5 - Identifique os tipos de dados para cada variável (categórico, numérico...)

In [39]:
# Table with the storage dtype pandas inferred for each column.
# `DataFrame.dtypes` already holds one dtype per column, so it is reshaped
# directly instead of being rebuilt row by row; pandas is imported in the
# first cell and is not re-imported here.
# NOTE: these are storage dtypes, not measurement levels — a column such as
# SEX is stored as int64 but holds category codes (1 = female, 2 = male).
typesData = (
    covidData.dtypes
    .rename_axis('Variável')           # column labels become the 'Variável' column
    .reset_index(name='Tipo de dado')  # dtypes become the 'Tipo de dado' column
)
typesData

Unnamed: 0,Variável,Tipo de dado
0,USMER,int64
1,MEDICAL_UNIT,int64
2,SEX,int64
3,PATIENT_TYPE,int64
4,DATE_DIED,object
5,INTUBED,int64
6,PNEUMONIA,int64
7,AGE,int64
8,PREGNANT,int64
9,DIABETES,int64
