<a href="https://colab.research.google.com/github/Gyeong-Hyeon/AI_Project/blob/main/Section2/AI_01_%EC%97%BC%EA%B2%BD%ED%98%84.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade category_encoders
!pip install eli5
!pip install pdpbox
!pip install shap

In [None]:
from IPython.display import display
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from category_encoders import TargetEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score, roc_curve, roc_auc_score
import eli5
from eli5.sklearn import PermutationImportance
from pdpbox.pdp import pdp_isolate, pdp_plot, pdp_interact, pdp_interact_plot
import shap
import warnings

#**1. 데이터 전처리 및 랭글링** 

##**1) 데이터 준비하기**

In [None]:
#데이터를 불러옵니다.
patient_21 = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vQDLQ5Bdj7692BU0e1EwsEyG2h5cpyt8pcrXOQ3uOQixIQ4Qbrn5d6jX5WnxeNpWr3mrECiXgPWfkWf/pub?output=csv')
vacc_21 = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vTr60lC-MNfPA_PgY7NjtFPuV_uEgN5uETLUSJkeoruTZyluDhxzY8D0JVF4bijkbFJv6E2QNC2zZku/pub?output=csv')
patient_20 = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vSaR-9rRkFWSJYTRsI6KLXOBANnOpt5oNyJLd8rhnBE7dWslQ2QP69E5bfUO-cfXkdn2Elpvi549RcF/pub?output=csv')
vacc_20 = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vSbKt4I1BVWLqB6FyWAfcWr7lCFctvWd4b4C7YHNtcA9znBT-cs15Q1NM7ToBs5aczuX2_hK28KO6wA/pub?output=csv')

#2020년 환자 데이터에 'Unnamed'라는 공란 컬럼을 삭제하겠습니다.
patient_20 = patient_20.drop(columns=[col for col in patient_20.columns if 'Unnamed' in col])

#2020년과 2021년 데이터를 합쳐줍니다.
patient = pd.concat([patient_20,patient_21])
vacc = pd.concat([vacc_20,vacc_21])

dfs = [patient, vacc]

for df in dfs:
      print(df.shape)
      display(df.head())
      print('\n')

* Patient: 환자의 정보가 담긴 데이터 셋
* Vacc: 백신 정보가 담긴 데이터 셋

💊VAERS_ID (고유 번호)를 기준으로 두 데이터를 합칠 수 있습니다.
각 데이터별로 중복된 ID가 있는지 먼저 확인 해보겠습니다.

In [62]:
#백신 데이터를 Covid 백신으로만 소팅합니다

mask = vacc['VAX_TYPE'].isin(['COVID19'])
covid_vac = vacc[mask]
print('Length of Covid vaccine dataset:',len(covid_vac))

#Covid 백신 데이터 셋의 VAERS_ID열에 중복된 데이터가 있는지 확인합니다.
print('Duplicated VAERS ID in Covid vaccine dataset:',covid_vac.duplicated(['VAERS_ID']).sum())
mask = covid_vac.duplicated(['VAERS_ID'])
covid_duplicated = covid_vac[mask]
covid_duplicated['VAERS_ID'].unique()

Length of Covid vaccine dataset: 14337
Duplicated VAERS ID in Covid vaccine dataset: 78


array([ 905340,  906428,  907330,  907837,  909370,  909520,  911085,
        912442,  912896,  913038,  913869,  914017,  914458,  937480,
        938126,  938576,  943614,  944595,  945504,  946663,  948418,
        949732,  950911,  957227,  959928,  962110,  963587,  964617,
        967274,  968195,  970198,  970515,  971567,  971939,  973816,
        974177,  975206,  978768,  983425,  984929,  988246,  989556,
        990109,  990118,  990694,  990702,  990718,  991686,  992082,
        992774,  994007,  995346,  995419,  996577, 1000418, 1000733,
       1000849, 1003553, 1005737, 1006745, 1007357, 1007628, 1007928,
       1008767, 1009424, 1011689, 1011707, 1011983, 1015253, 1015465,
       1015921, 1016770, 1016907, 1019670, 1020144, 1020227, 1022397,
       1024343])

* **백신 데이터 중복 VAERS_ID:** 46개
* **중복되는 경우**

1. 같은 환자에게 다른 종류의 백신 접종 ⭐

2. 같은 환자에게 다른 Lot의 백신 접종 → drop

3. 백신 루트나 백신 접종 지역이 다름 → drop

In [63]:
ids = [905340, 906428, 907330, 907837, 909370, 909520, 911085, 912442, 912896, 913038, 913869, 914017, 914458, 937480, 938126, 938576, 943614, 944595, 945504, 946663, 948418, 949732, 950911, 957227,
       959928, 962110, 963587, 964617, 967274, 968195, 970198, 970515, 971567, 971939, 973816, 974177, 975206, 978768, 983425, 984929, 988246, 989556, 991686, 992082, 992774, 995419]

for id in ids:
  mask = covid_vac['VAERS_ID'].isin([id])
  mask = covid_vac[mask]
  if len(mask['VAX_NAME'].unique()) == 1:
    pass
  else:
    print(mask[['VAERS_ID', 'VAX_DOSE_SERIES','VAX_NAME']],'\n')

       VAERS_ID VAX_DOSE_SERIES                             VAX_NAME
51963    912896               2          COVID19 (COVID19 (MODERNA))
51964    912896               1  COVID19 (COVID19 (PFIZER-BIONTECH)) 

      VAERS_ID VAX_DOSE_SERIES                             VAX_NAME
2524    967274             UNK          COVID19 (COVID19 (MODERNA))
2525    967274             UNK  COVID19 (COVID19 (PFIZER-BIONTECH)) 



* 같은 백신을 투약한 VAERS_ID: 44개 (drop)

* 다른 백신을 투약한 VAERS_ID: 912896, 967274

In [64]:
duplicated = patient[patient['VAERS_ID'].isin([912896,967274])]
for id,symp in zip(duplicated['VAERS_ID'],duplicated['SYMPTOM_TEXT']):
  print('Patient#',id, symp)

Patient# 912896 patient received a dose of Moderna vaccine after receiving the Pfizer vaccine.
Patient# 967274 I was pregnant and my baby died two days after I took it and I got really sick


912896은 접종 후에 대한 증상이나 사망 여부 등 특별한 정보가 없으므로 삭제하겠습니다.

967274는 태아가 죽은 것으로 확인 되나, 모더나와 화이자 중 어느 백신의 영향인지 알 수 없으므로 삭제하겠습니다.

In [65]:
print('Duplicated VAERS ID in Patient dataset:',patient.duplicated(['VAERS_ID']).sum())

Duplicated VAERS ID in Patient dataset: 0


In [66]:
covid = covid_vac[['VAERS_ID','VAX_MANU']]
covid = covid.drop_duplicates(['VAERS_ID'])
covid = pd.merge(covid, patient, on = ['VAERS_ID'], how = 'inner')
print('기존 환자 수:',len(patient),'\nCovid백신을 접종한 환자 수:',len(covid))

기존 환자 수: 14885 
Covid백신을 접종한 환자 수: 14234


##**2) 데이터 전처리**


1. 필요 없는 column drop: VAERS_ID, RECVDATE, STATE, CAGE_YR, CAGE_MO, RPT_DATE, DATEDIED, ER_VISIT, HOSPITAL, HOSPDAYS, LAB_DATA, 

2. 결측치 정리



In [51]:
covid.isnull().sum()

VAERS_ID            0
VAX_MANU            0
RECVDATE           25
STATE            1537
AGE_YRS          1107
CAGE_YR          2897
CAGE_MO         12597
SEX                25
RPT_DATE        12462
SYMPTOM_TEXT       25
DIED            12012
DATEDIED        12068
L_THREAT        12160
ER_VISIT        12608
HOSPITAL        11257
HOSPDAYS        11743
X_STAY          12619
DISABLE         12427
RECOVD           1058
VAX_DATE          493
ONSET_DATE        590
NUMDAYS           932
LAB_DATA         6414
V_ADMINBY          37
V_FUNDBY        12443
OTHER_MEDS       4939
CUR_ILL          6383
HISTORY          4259
PRIOR_VAX       12126
SPLTTYPE         9419
FORM_VERS          41
TODAYS_DATE       298
BIRTH_DEFECT    12610
OFC_VISIT       10980
ER_ED_VISIT      9869
ALLERGIES        5235
dtype: int64

In [67]:
previous = covid['AGE_YRS'].isnull().sum()
covid['AGE_YRS'] = np.where(pd.notnull(covid['AGE_YRS']) == True, covid['AGE_YRS'], ['CAGE_YR'])
current = covid['AGE_YRS'].isnull().sum()
print('CAGE_YR을 통해', previous-current,'개의 결측값을 채웠으며, 현재 나이 특성의 결측치 갯수는',current,'개입니다.')

CAGE_YR을 통해 1372 개의 결측값을 채웠으며, 현재 나이 특성의 결측치 갯수는 0 개입니다.


In [116]:
covid.SEX.value_counts()

F    10434
M     3354
U      446
Name: SEX, dtype: int64

In [128]:
for symp,sex in zip(covid['SYMPTOM_TEXT'],covid['SEX']):
  if sex == None:
    print(symp)
  else: pass

In [129]:
covid['SEX'].isnull().sum()

0

In [126]:
previous = covid['SEX'].isnull().sum()
covid['split_symp'] = covid.SYMPTOM_TEXT.str.split(' ')
new_sex = []
for symptom, sex in zip(covid['split_symp'],covid['SEX']):
  if sex == None or sex == 'U':
    if 'she' in symptom or 'She' in symptom or 'woman' in symptom:
      new_sex.append('F')
    elif 'he' in symptom or 'He' in symptom or 'man' in symptom:
      new_sex.append('M')
    else:
      new_sex.append(sex)
  else: 
    new_sex.append(sex)

covid['NEW_SEX'] = new_sex
current = covid['NEW_SEX'].isnull().sum()
print(covid.NEW_SEX.value_counts(),'\n 증상 컬럼을 통해',previous-current,'개의 결측치를 채웠습니다.')   

F    10470
M     3378
U      386
Name: NEW_SEX, dtype: int64 
 증상 컬럼을 통해 0 개의 결측치를 채웠습니다.


In [115]:
b = []
for value,sex in zip(a[1],a['Sex']):
  if sex == None or sex == 'U':
    if 'she' in value or 'She' in value:
      b.append('W')
    elif 'he' in value or 'He' in value:
      b.append('M')
  else:
    b.append(sex)

a['sex'] = b
a   

Unnamed: 0,1,Sex,2,sex
0,he is a famer,,"[he, is, a, famer]",M
1,she is a farmer,,"[she, is, a, farmer]",W
2,She,W,[She],W
3,He,M,[He],M
4,She,U,[She],W
5,He,U,[He],M
