### 제 2유형 연습하기. 당뇨 진척 정도 (회귀)

In [280]:
import pandas as pd 
import numpy as np 

#표준화, 정규화
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# 데이터 분리 
from sklearn.model_selection import train_test_split

# 모델 선정 
from sklearn.ensemble import RandomForestRegressor

# 모델 성능평가 관련 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, mean_squared_error, r2_score

from sklearn.datasets import load_diabetes

# Load the diabetes dataset bundled with scikit-learn.
diabets = load_diabetes()

# Features become a DataFrame labelled with the dataset's feature names; the
# target is wrapped in a single-column DataFrame.
x = pd.DataFrame(diabets.data, columns=diabets.feature_names)
y = pd.DataFrame(diabets.target)

# Mimic the practical-exam setup: hold out 20% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# y_test would not be provided in the real exam, so it is ignored from here on.

# Move the original row index into a regular column so it can serve as an
# identifier. reset_index() already returns a new DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
x_train = x_train.reset_index()
x_test = x_test.reset_index()
y_train = y_train.reset_index()

x_train.rename(columns={'index': 'cust_id'}, inplace=True)
x_test.rename(columns={'index': 'cust_id'}, inplace=True)
# Use an ordered list here: a set has no guaranteed iteration order, so the
# 'cust_id' and 'target' labels could end up attached to the wrong columns.
y_train.columns = ['cust_id', 'target']

In [281]:
# Show the dataset's bundled description text.
dataset_description = diabets.DESCR
print(dataset_description)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

#### 01. 데이터 탐색 (EDA)

In [282]:
# 1. Print transposed summary statistics for each frame, with a divider line
#    between consecutive reports.
for position, frame in enumerate((x_train, x_test, y_train)):
    if position > 0:
        print('--------------------------------------')
    print(frame.describe().T)

         count        mean         std       min         25%         50%  \
cust_id  353.0  225.745042  124.277972  1.000000  120.000000  231.000000   
age      353.0    0.001092    0.047456 -0.107226   -0.034575    0.005383   
sex      353.0   -0.001166    0.047544 -0.044642   -0.044642   -0.044642   
bmi      353.0    0.000227    0.048342 -0.090275   -0.035307   -0.007284   
bp       353.0    0.001111    0.047510 -0.112400   -0.033214   -0.005671   
s1       353.0   -0.000216    0.045262 -0.108893   -0.033216   -0.002945   
s2       353.0   -0.001689    0.044502 -0.115613   -0.030437   -0.003819   
s3       353.0    0.002213    0.048880 -0.102307   -0.032356   -0.006584   
s4       353.0   -0.001623    0.046891 -0.076395   -0.039493   -0.002592   
s5       353.0    0.000830    0.047997 -0.126097   -0.033249   -0.001499   
s6       353.0   -0.000198    0.047824 -0.129483   -0.034215   -0.001078   

                75%         max  
cust_id  330.000000  441.000000  
age        0.038076

#### 02. 데이터 전처리  

In [283]:
# Check for missing values: print the per-column null counts of each frame,
# with a divider line between consecutive reports.
for position, frame in enumerate((x_train, x_test, y_train)):
    if position > 0:
        print('-------')
    print(frame.isnull().sum())


cust_id    0
age        0
sex        0
bmi        0
bp         0
s1         0
s2         0
s3         0
s4         0
s5         0
s6         0
dtype: int64
-------
cust_id    0
age        0
sex        0
bmi        0
bp         0
s1         0
s2         0
s3         0
s4         0
s5         0
s6         0
dtype: int64
-------
cust_id    0
target     0
dtype: int64


In [284]:
# Inspect column dtypes and non-null counts with info().
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line per frame.
x_train.info()
x_test.info()
y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353 entries, 0 to 352
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  353 non-null    int64  
 1   age      353 non-null    float64
 2   sex      353 non-null    float64
 3   bmi      353 non-null    float64
 4   bp       353 non-null    float64
 5   s1       353 non-null    float64
 6   s2       353 non-null    float64
 7   s3       353 non-null    float64
 8   s4       353 non-null    float64
 9   s5       353 non-null    float64
 10  s6       353 non-null    float64
dtypes: float64(10), int64(1)
memory usage: 30.5 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  89 non-null     int64  
 1   age      89 non-null     float64
 2   sex      89 non-null     float64
 3   bmi      89 non-null     float64
 4   bp      

#### 03. 데이터 분리 

In [285]:
# Variable handling:
# cust_id is only a row identifier and must not be fed to the model.
# Keep a copy of the test-set identifiers before dropping the column, because
# the submission has to be a DataFrame with cust_id and target columns.
cust_id = x_test['cust_id'].copy()
x_train.drop(columns='cust_id', inplace=True)
x_test.drop(columns='cust_id', inplace=True)


In [286]:
# Split the training data again: 80% for fitting, 20% for validation.
# (stratify=y_train['target'] would only be appropriate for a classification model.)
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train['target'],
    test_size=0.2,
)
for split in (x_train, x_val, y_train, y_val):
    print(split.shape)

(282, 10)
(71, 10)
(282,)
(71,)


#### 04. 모델링 및 성능평가

##### 당뇨병 환자의 질병 진행 정도 예측
- 데이터의 결측치, 이상치, 변수들에 대해 전처리
- 회귀모델을 사용하여 Rsq, MSE 값 산출
- 제출은 cust_id, target 변수를 가진 DataFrame 형태로 제출  

In [287]:
# Fit a random-forest regressor with default settings on the training split.
model = RandomForestRegressor()
model.fit(X=x_train, y=y_train)

RandomForestRegressor()

In [288]:
# Predict the target for the validation features with the fitted model.
y_pred = model.predict(X=x_val)


In [289]:
# Score the validation predictions: mean squared error and R^2.
mse, r2 = mean_squared_error(y_val, y_pred), r2_score(y_val, y_pred)
for label, score in (('MSE : ', mse), ('r2 : ', r2)):
    print(label, score)

MSE :  3788.393912676057
r2 :  0.38753655533689035


In [290]:
# Train a fresh random forest and score it on the validation split in one cell.
model = RandomForestRegressor()
model.fit(X=x_train, y=y_train)
y_pred = model.predict(X=x_val)

# Report mean squared error and R^2 for the validation predictions.
mse, r2 = mean_squared_error(y_val, y_pred), r2_score(y_val, y_pred)
print('MSE : ', mse)
print('r2 : ', r2)

MSE :  3980.993661971831
r2 :  0.35639926903193586
