### 데이터 확인 및 전처리

In [1]:
# Load the hypertension dataset from CSV and preview its first rows.
import pandas as pd
df = pd.read_csv('./hypertension_data.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,57.0,1.0,3,145,233,1,0,150,0,2.3,0,0,1,1
1,64.0,0.0,2,130,250,0,1,187,0,3.5,0,0,2,1
2,52.0,1.0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56.0,0.0,1,120,236,0,1,178,0,0.8,2,0,2,1
4,66.0,0.0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [2]:
# Show the frame's dimensions as (rows, columns).
df.shape

(26083, 14)

In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) per numeric column.
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,26083.0,26058.0,26083.0,26083.0,26083.0,26083.0,26083.0,26083.0,26083.0,26083.0,26083.0,26083.0,26083.0,26083.0
mean,55.661389,0.5,0.958594,131.592992,246.246061,0.149753,0.526512,149.655024,0.326573,1.039512,1.400299,0.721849,2.318752,0.547253
std,15.189768,0.50001,1.023931,17.588809,51.643522,0.356836,0.525641,22.858109,0.468969,1.165138,0.616513,1.011608,0.604659,0.497772
min,11.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,44.0,0.0,0.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,0.5,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,67.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,98.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [4]:
# Show the dtype of every column.
df.dtypes

age         float64
sex         float64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [5]:
# Count the missing values in each column.
df.isna().sum()

age          0
sex         25
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
target       0
dtype: int64

In [6]:
# Discard every row that contains at least one missing value, then re-count
# the nulls per column to confirm none remain.
df = df.dropna(axis='index', how='any')
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [7]:
# Recode the numeric 'sex' flag as a text label: 0.0 -> 'female', 1.0 -> 'male'.
df['sex'] = df['sex'].replace({0.0: 'female', 1.0: 'male'})

In [8]:
# Frequency of each 'cp' code.
df['cp'].value_counts()

0    12314
2     7392
1     4456
3     1896
Name: cp, dtype: int64

In [9]:
# Collapse 'cp' into a presence flag (0: no abnormality, 1: abnormality):
# codes 1, 2 and 3 all map to 1. Then show the resulting distribution.
df['cp'] = df['cp'].replace({1: 1, 2: 1, 3: 1})
df['cp'].value_counts()

1    13744
0    12314
Name: cp, dtype: int64

In [10]:
# Frequency of each 'restecg' code.
df['restecg'].value_counts()

1    13004
0    12702
2      352
Name: restecg, dtype: int64

In [11]:
# Collapse 'restecg' into a presence flag (0: no abnormality, 1: abnormality)
# by folding code 2 into 1. Then show the resulting distribution.
df['restecg'] = df['restecg'].replace({2: 1})
df['restecg'].value_counts()

1    13356
0    12702
Name: restecg, dtype: int64

In [12]:
# Frequency of each 'slope' code.
df['slope'].value_counts()

2    12242
1    11990
0     1826
Name: slope, dtype: int64

In [13]:
# Reduce 'slope' to a change flag (0: no change, 1: change).
# Two passes: first fold code 0 into 2, then relabel 1 -> 0 and 2 -> 1,
# so original code 1 ends up as 0 and original codes 0 and 2 end up as 1.
df['slope'] = df['slope'].replace({0: 2}).replace({1: 0, 2: 1})
df['slope'].value_counts()

1    14068
0    11990
Name: slope, dtype: int64

In [14]:
# Frequency of each 'ca' code.
df['ca'].value_counts()

0    15146
1     5514
2     3298
3     1732
4      368
Name: ca, dtype: int64

In [15]:
# A 'ca' value of 4 stands for a missing value, so remove those rows.
df = df.drop(index=df.index[df['ca'] == 4])

In [16]:
# Collapse 'ca' into a presence flag (0: no abnormality, 1: abnormality):
# codes 1, 2 and 3 all map to 1. Then show the resulting distribution.
df['ca'] = df['ca'].replace(dict.fromkeys([1, 2, 3], 1))
df['ca'].value_counts()

0    15146
1    10544
Name: ca, dtype: int64

In [17]:
# Frequency of each 'thal' code.
df['thal'].value_counts()

2    14162
3     9900
1     1474
0      154
Name: thal, dtype: int64

In [18]:
# Remove the 'thal' column from the frame entirely.
# NOTE(review): the earlier comment on this cell claimed that rows with
# thal == 0 (a missing-value marker) were being removed, but the statement
# has always dropped the whole column, and the later cells (column listing,
# one-hot encoding) are written for a frame without 'thal'. The column drop
# is kept as-is; if row filtering was the real intent, confirm and use
# df[df['thal'] != 0] instead.
# The stray axis=0 argument was removed: it is ignored when columns= is given
# and only made the call look like a row operation.
df = df.drop(columns='thal')

In [19]:
# Renumber the row index from 0 so it is contiguous again after the row
# removals (dropna and the ca == 4 filter), then preview the frame.
# The original statement, df.set_index('age').reset_index(), built a new
# frame and discarded it, so df was never changed; the result must be
# assigned back. drop=True avoids adding the old index as a column.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,target
0,57.0,male,1,145,233,1,0,150,0,2.3,1,0,1
1,64.0,female,1,130,250,0,1,187,0,3.5,1,0,1
2,52.0,male,1,130,204,0,0,172,0,1.4,1,0,1
3,56.0,female,1,120,236,0,1,178,0,0.8,1,0,1
4,66.0,female,0,120,354,0,1,163,1,0.6,1,0,1


In [20]:
# Mean of each numeric feature within each 'target' class.
# numeric_only=True explicitly skips the string-valued 'sex' column. Older
# pandas dropped it silently, but pandas 2.x raises a TypeError when asked to
# average a non-numeric column.
df.groupby('target').mean(numeric_only=True)

Unnamed: 0_level_0,age,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,56.055023,0.244224,134.486394,251.041588,0.156255,0.428547,139.042957,0.549889,1.615044,0.345884,0.666781
1,55.322908,0.758355,129.138817,243.119537,0.141531,0.576835,158.309055,0.142388,0.58349,0.703656,0.196515


In [21]:
# List the current column names.
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'target'],
      dtype='object')

In [22]:
# One-hot encode the categorical features, then preview the result.
# dtype=int pins the indicator columns to 0/1 integers. pandas 2.0 changed the
# default dummy dtype to bool, which would otherwise be written to the CSV as
# True/False instead of the 0/1 values this notebook displays.
df = pd.get_dummies(
    data=df,
    columns=['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca'],
    dtype=int,
)
df.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,target,sex_female,sex_male,cp_0,cp_1,fbs_0,fbs_1,restecg_0,restecg_1,exang_0,exang_1,slope_0,slope_1,ca_0,ca_1
0,57.0,145,233,150,2.3,1,0,1,0,1,0,1,1,0,1,0,0,1,1,0
1,64.0,130,250,187,3.5,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0
2,52.0,130,204,172,1.4,1,0,1,0,1,1,0,1,0,1,0,0,1,1,0
3,56.0,120,236,178,0.8,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0
4,66.0,120,354,163,0.6,1,1,0,1,0,1,0,0,1,0,1,0,1,1,0


In [23]:
# Write the preprocessed frame to CSV without the row index.
df.to_csv('./preprocessing_data.csv', index=False)