# Data Preprocessing 

- Combine the two datasets (symptom & comorbidity)

- Delete patient records that contain missing values

- Transform AGE_YRS to AGE_GRP

In [1]:
import pandas as pd
import numpy as np
import re

In [4]:
# Load the symptom table and the comorbidity table, in that order.
covid_adv_sym, covid_adv_co = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dataset/covid_adv_symptom.csv',
        'dataset/covid_adv_comorbidity.csv',
    )
)

### Combine datasets

In [5]:
# Left-join the comorbidity columns onto the symptom table: every symptom row
# is kept, matched on the patient-level columns the two tables share.
merge_keys = ['VAERS_ID', 'AGE_YRS', 'SEX', 'DIED', 'VAX_MANU']
covid_adv = covid_adv_sym.merge(covid_adv_co, how='left', on=merge_keys)

In [10]:
# Preview the merged frame (rendered as the cell output below).
covid_adv

Unnamed: 0,VAERS_ID,AGE_YRS,SEX,DIED,VAX_MANU,Headache,Pyrexia,Fatigue,Chills,Swelling,...,Depression_c,Hypothyroidsm_c,High_Cholesterol_c,Heart_Disease_c,GERD_c,Cancer_c,Obesity_c,Migraine_c,Kidney_Disease_c,COVID19_Positive
0,1410490,54.0,F,No,PFIZER\BIONTECH,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
1,1413866,67.0,M,No,MODERNA,No,No,No,No,Yes,...,No,No,No,No,No,No,No,No,No,No
2,896636,47.0,F,No,MODERNA,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
3,902418,56.0,F,No,PFIZER\BIONTECH,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
4,902440,35.0,F,No,PFIZER\BIONTECH,No,No,Yes,No,No,...,No,No,No,No,No,No,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404844,1427468,17.0,F,No,PFIZER\BIONTECH,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
404845,1427471,18.0,M,No,MODERNA,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
404846,1427472,54.0,F,No,MODERNA,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
404847,1427475,87.0,F,Yes,MODERNA,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No


### Missing Value 

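- Age -> delete rows with a missing age (row count: 404849 -> 378693)

- Gender -> delete rows with unknown sex (`SEX == 'U'`) (row count: 378693 -> 376566)

- Vaccine manufacturer -> delete rows with `VAX_MANU == 'UNKNOWN MANUFACTURER'` (row count: 376566 -> 375804)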

In [12]:
# Keep only the rows that have an age recorded.
has_age = covid_adv['AGE_YRS'].notna()
covid_adv = covid_adv[has_age]

In [13]:
# Drop the rows whose SEX code is 'U' (unknown).
sex_is_known = covid_adv['SEX'].ne('U')
covid_adv = covid_adv[sex_is_known]

In [14]:
# Drop the rows whose manufacturer is recorded as 'UNKNOWN MANUFACTURER'.
manufacturer_is_known = covid_adv['VAX_MANU'].ne('UNKNOWN MANUFACTURER')
covid_adv = covid_adv[manufacturer_is_known]

### Transform Variables 

- AGE_YRS (age in years) to AGE_GRP (age group)

In [15]:
# Age: bucket AGE_YRS into fixed age bands.
#
# Bins are left-closed / right-open: [0, 18), [18, 35), [35, 50), [50, 65),
# [65, 80), [80, inf). Comparing against bin edges (instead of testing
# membership in range(), which only matches whole numbers) puts fractional
# ages such as 0.5 or 17.5 in the correct band rather than in the '>= 80'
# fallback.
#
# NOTE: an age below 0 falls outside every bin and gets a missing AGE_GRP
# instead of being labelled '>= 80'.
age_bins = [0, 18, 35, 50, 65, 80, np.inf]
age_labels = ['0-17', '18-34', '35-49', '50-64', '65-79', '>= 80']
age_group = pd.cut(covid_adv['AGE_YRS'], bins=age_bins,
                   labels=age_labels, right=False)

# create new variable
# assign() builds a new frame, so no SettingWithCopyWarning is raised even
# though covid_adv was produced by boolean filtering; astype(object) stores
# plain string labels rather than a categorical column.
covid_adv = covid_adv.assign(AGE_GRP=age_group.astype(object))

In [16]:
# Preview the frame after adding AGE_GRP (rendered as the cell output below).
covid_adv

Unnamed: 0,VAERS_ID,AGE_YRS,SEX,DIED,VAX_MANU,Headache,Pyrexia,Fatigue,Chills,Swelling,...,Hypothyroidsm_c,High_Cholesterol_c,Heart_Disease_c,GERD_c,Cancer_c,Obesity_c,Migraine_c,Kidney_Disease_c,COVID19_Positive,AGE_GRP
0,1410490,54.0,F,No,PFIZER\BIONTECH,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,50-59
1,1413866,67.0,M,No,MODERNA,No,No,No,No,Yes,...,No,No,No,No,No,No,No,No,No,60-69
2,896636,47.0,F,No,MODERNA,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,40-49
3,902418,56.0,F,No,PFIZER\BIONTECH,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,50-59
4,902440,35.0,F,No,PFIZER\BIONTECH,No,No,Yes,No,No,...,No,No,No,No,No,No,No,No,No,30-39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404844,1427468,17.0,F,No,PFIZER\BIONTECH,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,10-19
404845,1427471,18.0,M,No,MODERNA,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,10-19
404846,1427472,54.0,F,No,MODERNA,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,50-59
404847,1427475,87.0,F,Yes,MODERNA,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,80-89


In [17]:
# Write the preprocessed frame to disk; the row index is not written.
output_path = 'covid_adv.csv'
covid_adv.to_csv(output_path, index=False)