> 문제 2) 당뇨 진척 정도 모델 생성 

In [164]:
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score,roc_auc_score, r2_score, mean_squared_error


In [165]:
from sklearn.datasets import load_diabetes

# diabetes 데이터셋 로드
diabets = load_diabetes()

x=pd.DataFrame(diabets.data, columns=diabets.feature_names)
y = pd.DataFrame(diabets.target)

# 실기시험 데이터 셋팅
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=2024)

# y_test는 실제로 주어지는 값 아니므로 무시

x_train = pd.DataFrame(x_train.reset_index())
x_test = pd.DataFrame(x_test.reset_index())
y_train = pd.DataFrame(y_train.reset_index())

x_train.rename(columns={'index':'cust_id'},inplace=True)
x_test.rename(columns={'index' : 'cust_id'},inplace=True)
y_train.columns = ({'cust_id', 'target'})

In [166]:
print(x_train.head())
print(x_test.head())

   cust_id       age       sex       bmi        bp        s1        s2  \
0      424  0.001751  0.050680  0.011039 -0.019442 -0.016704 -0.003819   
1      121  0.063504 -0.044642  0.017506  0.021872  0.008063  0.021546   
2       15 -0.052738  0.050680 -0.018062  0.080401  0.089244  0.107662   
3        5 -0.092695 -0.044642 -0.040696 -0.019442 -0.068991 -0.079288   
4       21 -0.085430  0.050680 -0.022373  0.001215 -0.037344 -0.026366   

         s3        s4        s5        s6  
0 -0.047082  0.034309  0.024053  0.023775  
1 -0.036038  0.034309  0.019908  0.011349  
2 -0.039719  0.108111  0.036056 -0.042499  
3  0.041277 -0.076395 -0.041180 -0.096346  
4  0.015505 -0.039493 -0.072128 -0.017646  
   cust_id       age       sex       bmi        bp        s1        s2  \
0      334 -0.060003  0.050680 -0.047163 -0.022885 -0.071743 -0.057681   
1      313  0.059871  0.050680  0.053074  0.052858  0.032830  0.019667   
2      133 -0.041840  0.050680 -0.053630 -0.040099 -0.084126 -0.07177

In [167]:
print(y_train.head())

   target  cust_id
0     424    111.0
1     121    173.0
2      15    171.0
3       5     97.0
4      21     49.0


In [168]:
x_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cust_id,353.0,219.444759,130.610642,0.0,105.0,220.0,337.0,441.0
age,353.0,-0.000729,0.047086,-0.107226,-0.038207,0.009016,0.034443,0.110727
sex,353.0,-0.001166,0.047544,-0.044642,-0.044642,-0.044642,0.05068,0.05068
bmi,353.0,-0.001373,0.048839,-0.089197,-0.036385,-0.008362,0.028284,0.170555
bp,353.0,-0.000485,0.048083,-0.1124,-0.036656,-0.005671,0.032201,0.132044
s1,353.0,-0.001284,0.048113,-0.126781,-0.035968,-0.004321,0.027326,0.153914
s2,353.0,-0.001635,0.048175,-0.112795,-0.032629,-0.005072,0.027496,0.198788
s3,353.0,0.002004,0.048396,-0.102307,-0.032356,-0.002903,0.030232,0.181179
s4,353.0,-0.002704,0.04726,-0.076395,-0.039493,-0.002592,0.034309,0.185234
s5,353.0,-0.001932,0.047925,-0.126097,-0.034524,-0.00608,0.029936,0.133599


In [169]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353 entries, 0 to 352
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  353 non-null    int64  
 1   age      353 non-null    float64
 2   sex      353 non-null    float64
 3   bmi      353 non-null    float64
 4   bp       353 non-null    float64
 5   s1       353 non-null    float64
 6   s2       353 non-null    float64
 7   s3       353 non-null    float64
 8   s4       353 non-null    float64
 9   s5       353 non-null    float64
 10  s6       353 non-null    float64
dtypes: float64(10), int64(1)
memory usage: 30.5 KB


In [170]:
# null 값 확인 
x_train.isnull().sum()

cust_id    0
age        0
sex        0
bmi        0
bp         0
s1         0
s2         0
s3         0
s4         0
s5         0
s6         0
dtype: int64

In [171]:
cust_id = x_train['cust_id'].copy()
x_train.drop(columns='cust_id',axis=1,inplace=True)
x_test.drop(columns='cust_id',inplace=True)

In [172]:
x_train.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.001751,0.05068,0.011039,-0.019442,-0.016704,-0.003819,-0.047082,0.034309,0.024053,0.023775
1,0.063504,-0.044642,0.017506,0.021872,0.008063,0.021546,-0.036038,0.034309,0.019908,0.011349
2,-0.052738,0.05068,-0.018062,0.080401,0.089244,0.107662,-0.039719,0.108111,0.036056,-0.042499
3,-0.092695,-0.044642,-0.040696,-0.019442,-0.068991,-0.079288,0.041277,-0.076395,-0.04118,-0.096346
4,-0.08543,0.05068,-0.022373,0.001215,-0.037344,-0.026366,0.015505,-0.039493,-0.072128,-0.017646


In [173]:
# 데이터 분리 
x_train, x_val, y_train, y_val = train_test_split(x_train,y_train['target'],test_size=0.2,random_state=2024)


In [174]:
y_train

315    119
75     407
216     68
132    418
0      424
      ... 
36     348
27     171
128    405
96     101
136     67
Name: target, Length: 282, dtype: int64

In [175]:
# 모델 생성 및 학습 
model = RandomForestRegressor()
model.fit(x_train,y_train)

RandomForestRegressor()

In [176]:
y_pred = model.predict(x_val)

In [177]:
mse = mean_squared_error(y_val,y_pred)
r2 = r2_score(y_val,y_pred)
print('MSE : ',mse )
print('r2 : ', r2)

MSE :  20611.803112676058
r2 :  -0.12611580495255925
