# 환자의 의료 비용 예측하기

목표 : 학습할 데이터를 토앟여 예측할 데이터에서 환자별 의료 비용을 예측하려고 한다.


# 0.라이브러리 불러오기 및 데이터 준비하기

In [1]:
# 라이브러리 불러오기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format = 'retina'

In [3]:
# 학습할 데이터
path = '../data/medical_data.csv'
data1 = pd.read_csv(path)
# 예측할 데이터
path = '../data/medical_sample.csv'
data2 = pd.read_csv(path)

# 1. 데이터 파악하기

In [5]:
# 학습할 데이터
data1

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,41,female,31.600,0,no,southwest,6186.1270
1,30,male,25.460,0,no,northeast,3645.0894
2,18,female,30.115,0,no,northeast,21344.8467
3,61,female,29.920,3,yes,southeast,30942.1918
4,34,female,27.500,1,no,southwest,5003.8530
...,...,...,...,...,...,...,...
1233,50,male,30.970,3,no,northwest,10600.5483
1234,18,female,31.920,0,no,northeast,2205.9808
1235,18,female,36.850,0,no,southeast,1629.8335
1236,21,female,25.800,0,no,southwest,2007.9450


In [6]:
# 예측할 데이터
data2

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.900,0,yes,southwest
1,18,male,33.770,1,no,southeast
2,28,male,33.000,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...
95,28,female,37.620,1,no,southeast
96,54,female,30.800,3,no,southwest
97,55,male,38.280,0,no,southeast
98,56,male,19.950,0,yes,northeast


# 2. 데이터 전처리

**1) 가변수화**

In [7]:
# 가변수화
dumm_cols = ['sex','smoker','region']
data1 = pd.get_dummies(data1, columns=dumm_cols, drop_first=True)

# 데이터 확인
data1.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,41,31.6,0,6186.127,0,0,0,0,1
1,30,25.46,0,3645.0894,1,0,0,0,0
2,18,30.115,0,21344.8467,0,0,0,0,0
3,61,29.92,3,30942.1918,0,1,0,1,0
4,34,27.5,1,5003.853,0,0,0,0,1


**2) x, y 분리**

In [9]:
target = 'charges'

x = data1.drop(target, axis=1)
y = data1.loc[:,target]

**3) 학습용, 평가용 데이터 분리**

In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=1)

# 3. 모델링

- 의료비용이기 때문에 회귀 문제

In [12]:
# 불러오기
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [13]:
# 선언하기
model = DecisionTreeRegressor(max_depth=5, random_state=1)

In [14]:
# 학습하기
model.fit(x_train, y_train)

In [17]:
# 예측하기
y_pred = model.predict(x_test)

In [18]:
# 평가하기
print('MAE:', mean_absolute_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

MAE: 2832.8122955549356
R2: 0.8402002578889749


# 4. 일반화된 성능

In [19]:
# 불러오기
from sklearn.model_selection import cross_val_score

# 성능 예측
cv_score = cross_val_score(model, x_train, y_train, cv=5)

# 결과
print(cv_score)
print(f"평균 : {cv_score.mean()}")

[0.809966   0.85992378 0.75384297 0.82846604 0.82511732]
평균 : 0.8154632251864801


# 5. 성능 튜닝
- Grid Search로 최적의 하이퍼파라미터 찾기

In [20]:
# 불러오기
from sklearn.model_selection import GridSearchCV

# 기본 모델 선언
model_tree = DecisionTreeRegressor(random_state=1)

# 파라미터 선언
param = {'max_depth':range(1,51)}

# 모델 선언
model = GridSearchCV(model_tree,
                    param,
                    cv=5,)

In [22]:
# 학습하기
model.fit(x_train, y_train)

In [26]:
# 예측하기
y_val_pred = model.predict(x_test)
y_val_pred[:5]

array([47041.18743111,  9994.46544896, 12239.51234896, 19604.57335733,
       14807.50808773])

In [27]:
# 성능 검증하기
print(model.best_params_) # 최고의 파라미트
print(model.best_score_) # 최고 점수
print(r2_score(y_test, y_pred)) # r2 점수

{'max_depth': 4}
0.8317266890075053
0.8402002578889749


# 6. 최종 예측 및 평가

In [28]:
# 예측할 데이터 확인
data2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [29]:
# 예측할 데이터 가변수화
dumm_cols = ['sex', 'smoker', 'region']
data2 = pd.get_dummies(data2, columns=dumm_cols, drop_first=True)

# 확인
data2.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,0,1,0,0,1
1,18,33.77,1,1,0,0,1,0
2,28,33.0,3,1,0,0,1,0
3,33,22.705,0,1,0,1,0,0
4,32,28.88,0,1,0,1,0,0


In [31]:
# 예측하기
x_test = data2
y_pred = model.predict(x_test)

# 확인
y_pred[:10]

array([16055.05542222,  5692.67983655,  5692.67983655,  3509.62049904,
        3509.62049904,  3509.62049904,  9994.46544896,  7165.0598284 ,
        7165.0598284 , 14807.50808773])

In [34]:
# 예측할 데이터불러오기
path = '../data/medical_sample.csv'
final = pd.read_csv(path)

# 예측된 값 넣어주기
final['charges']=y_pred.round(2)

# 확인
final

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16055.06
1,18,male,33.770,1,no,southeast,5692.68
2,28,male,33.000,3,no,southeast,5692.68
3,33,male,22.705,0,no,northwest,3509.62
4,32,male,28.880,0,no,northwest,3509.62
...,...,...,...,...,...,...,...
95,28,female,37.620,1,no,southeast,5692.68
96,54,female,30.800,3,no,southwest,12239.51
97,55,male,38.280,0,no,southeast,12239.51
98,56,male,19.950,0,yes,northeast,28117.46
