## Neural Networks를 이용한 당뇨병 예측

## 0. 라이브러리

In [None]:
import sys, os

import pandas as pd
import numpy as np

from mlxtend.preprocessing import minmax_scaling
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

## 1. 데이터

* https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database
* 이 데이터셋은 미국 국립당뇨병 및 소화기 및 신장 질환 연구소(National Institute of Diabetes and Digestive and Kidney Diseases)에서 제공한 데이터입니다.
* 이 데이터셋의 목적은 데이터셋에 포함된 특정 진단 측정값을 기반으로 환자가 당뇨병을 가지고 있는지 여부를 진단적으로 예측하는 것이며, 데이터셋의 모든 환자는 최소나이가 21세인 피마 인디언 유산의 여성들입니다.

### 1) 다운로드/로드

In [None]:
import kagglehub

# Download latest version
# path = kagglehub.dataset_download("uciml/pima-indians-diabetes-database")

# print("Path to dataset files:", path)

In [None]:
DATASET_PATH = 'dataset/diabetes.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### 2) 탐색

#### 컬럼 정보

* Pregnancies: 임신 횟수
* Glucose: 혈장 내 포도당 농도
* BloodPressure: 이완기 혈압 (mm Hg)
* SkinThickness: 삼두근 피부접힘 두께 (mm)
* Insulin: 혈청 인슐린 농도 (mu U/ml)
* BMI: 체질량 지수
* DiabetesPedigreeFunction: 유전적 관계 정보
* Age: 나이
* Outcome: 클래스 (0은 정상, 1은 당뇨병)

In [9]:
# 기초 통계 확인
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [10]:
# 결측치 확인
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [None]:
# 최솟값 확인 -> 이상 데이터 (Pregnancies, Outcome을 제외하고는 결측값이 있을 수 있음)
def highlight_min(s):
    is_max = s==s.min()
    return ['background-color: limegreen' if v else '' for v in is_max]

df.describe().T.style.apply(highlight_min, subset=['min'])

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [19]:
# 데이터 정보 확인
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


### 3) 전처리

* 결측치 처리
> 1. 0 값을 nan으로 대체
> 2. nan을 중앙값으로 대체

In [3]:
# 결측치 처리 (값이 0일 수 없는 컬럼)
df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].replace(0, np.nan)

In [4]:
# 결측값 비율 확인
percent_missing = df.isnull().mean().round(4)*100
percent_missing

Pregnancies                  0.00
Glucose                      0.65
BloodPressure                4.56
SkinThickness               29.56
Insulin                     48.70
BMI                          1.43
DiabetesPedigreeFunction     0.00
Age                          0.00
Outcome                      0.00
dtype: float64

In [5]:
# 시각화
trace = go.Bar(x=percent_missing.index, y=percent_missing.values,
                opacity=0.8, text=percent_missing.values.round(4), textposition='auto',
                marker=dict(color='#90EE90', line=dict(color='#000000', width=1.25)))

layout = dict(title='Missing Values (count & %)')

fig = dict(data=[trace], layout=layout)
py.iplot(fig)

In [6]:
# 결측치 -> 중앙값으로 대체
def median_target(data, col):
    temp = data[data[col].notnull()]
    temp = temp[[col, 'Outcome']].groupby(['Outcome'])[[col]].median().reset_index() # Outcome이 0인 애들은 0인 애들까리, 1인 애들은 1인 애들끼리

    return temp

In [7]:
def replace_median(data, cols):
    for col in cols:
        target = median_target(data, col)
        display(target)
        
        data.loc[(data['Outcome']==0)&(data[col].isnull()), col] = target[[col]].values[0][0]
        data.loc[(data['Outcome']==1)&(data[col].isnull()), col] = target[[col]].values[1][0]

In [8]:
# temp = df[df['BloodPressure'].notnull()]
# temp = temp[['BloodPressure', 'Outcome']].groupby(['Outcome'])[['BloodPressure']].median().reset_index() # Outcome이 0인 애들은 0인 애들까리, 1인 애들은 1인 애들끼리
# df.loc[(df['Outcome']==0)&(df['BloodPressure'].isnull()), 'BloodPressure']
# temp[['BloodPressure']].values[0][0]

In [9]:
col_with_null = ['Glucose', 'BloodPressure','SkinThickness','Insulin', 'BMI']

replace_median(df, col_with_null)

Unnamed: 0,Outcome,Glucose
0,0,107.0
1,1,140.0


Unnamed: 0,Outcome,BloodPressure
0,0,70.0
1,1,74.5


Unnamed: 0,Outcome,SkinThickness
0,0,27.0
1,1,32.0


Unnamed: 0,Outcome,Insulin
0,0,102.5
1,1,169.5


Unnamed: 0,Outcome,BMI
0,0,30.1
1,1,34.3


In [10]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

### 4) 시각화

In [17]:
features = list(df.columns)
features

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [12]:
# 당뇨병(D), 정상(H) 구분
D = df[df['Outcome']!=0]
H = df[df['Outcome']==0]

D.sample(5)
# H.sample(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
546,5,187.0,76.0,27.0,207.0,43.6,1.034,53,1
397,0,131.0,66.0,40.0,169.5,34.3,0.196,22,1
280,0,146.0,70.0,32.0,169.5,37.9,0.334,28,1
114,7,160.0,54.0,32.0,175.0,30.5,0.588,39,1
468,8,120.0,74.5,32.0,169.5,30.0,0.183,38,1


In [13]:
# 컬럼별 분포도
def plot_distribution(feature, bin_size):
    temp_d = D[feature]
    temp_h = H[feature]
    hist_data = [temp_d, temp_h]

    group_labels = ['diabetic', 'healty']
    colors = ['#00FA9A', '#2F4F4F']

    # ff: figure_factory
    fig = ff.create_distplot(hist_data, group_labels, colors=colors,
                                show_hist=True, bin_size=bin_size, curve_type='kde')

    fig['layout'].update(title=feature)
    
    # py: offline
    py.iplot(fig)


In [14]:
# 이상치
def plot_outliers(df, feature):
    trace0 = go.Box(
        y=df[feature],
        name='All Points',
        jitter=0.3,
        pointpos=-1.8,
        boxpoints='all',
        marker=dict(color='rgb(32,178,170)'),
        line=dict(color='rgb(32,178,170)')
    )

    trace1 = go.Box(
        y=df[feature],
        name='Only Whiskers',
        boxpoints=False,
        marker=dict(color='rgb(0,128,128)'),
        line = dict(color='rgb(0,128,128)')
    )

    trace2 = go.Box(
        y=df[feature],
        name='Suspected Outliers',
        boxpoints='suspectedoutliers',
        marker=dict(
            color='rgb(0,250,154)',
            outliercolor='#FF69B4',
            line=dict(
                outliercolor='#FF69B4',
                outlierwidth=2)),
        line=dict(color='rgb(0,250,154)')
    )

    trace3 = go.Box(
        y=df[feature],
        name='Whiskers and Outliers',
        boxpoints='outliers',
        marker=dict(color='rgb(47,79,79)'),
        line = dict(color='rgb(47,79,79)')
    )

    data = [trace0, trace1, trace2, trace3]

    layout = go.Layout(
        title='{} Outliers'.format(feature)
    )

    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)

In [18]:
# 모든 컬럼 시각화
def plot_all_feature():
    for feature in features[:-1]:
        plot_distribution(feature, 0) # 분포도
        plot_outliers(df, feature) # 이상치
    
    plot_outliers(df, features[-1])

In [19]:
plot_all_feature()

### 5) 전처리

* 이상치 처리
>* IQR(사분위간 범위): 이상치를 찾는데 가장 일반적으로 사용되는 방법
>* 데이터셋의 상한값과 하한값은 각각 IQR(Q3-Q1)의 1.5배로 정의 -> 해당 범위를 벗어나면 이상치로 간주

* 스케일링
>* Min Max Scaling

#### 이상치 제거

In [20]:
def remove_outliers(df_with_out, feature, drop=False):

    value_of_feature = df_with_out[feature]

    Q1 = np.percentile(value_of_feature, 25.) # Q1 (25th percentile) for the given feature
    Q3 = np.percentile(value_of_feature, 75.) # Q3 (75th percentile) for the given feature

    step = 1.5*(Q3-Q1)

    index_outliers = value_of_feature[~((value_of_feature>=Q1-step)&(value_of_feature<=Q3+step))].index.tolist()
    value_outliers = value_of_feature[~((value_of_feature>=Q1-step)&(value_of_feature<=Q3+step))].values

    # Remove the outliers, if specified
    print ('Number of outliers (inc duplicates): {} and outliers: {}'.format(len(index_outliers), value_outliers))
    if drop:
        good_data = df_with_out.drop(df_with_out.index[index_outliers]).reset_index(drop=True)
        print ('New dataset with removed outliers has {} samples with {} features each.'.format(*good_data.shape))
        return good_data
    else:
        print ('Nothing happens, df.shape:', df_with_out.shape)
        return df_with_out

In [21]:
features

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [22]:
# Pregnancies 이상치 제거
df_clean = remove_outliers(df, features[0], True)

Number of outliers (inc duplicates): 4 and outliers: [15 17 14 14]
New dataset with removed outliers has 764 samples with 9 features each.


In [None]:
# Glucose 이상치 제거 -> 이상치 없음
df_clean = remove_outliers(df_clean, features[1], True)
plot_outliers(df_clean, features[1])

Number of outliers (inc duplicates): 0 and outliers: []
New dataset with removed outliers has 764 samples with 9 features each.


In [None]:
# BloodPressure 이상치 제거
plot_outliers(df_clean, features[2])
df_clean = remove_outliers(df_clean, features[2], True)
plot_outliers(df_clean, features[2])

Number of outliers (inc duplicates): 14 and outliers: [ 30. 110. 108. 122.  30. 110. 108. 110.  24.  38. 106. 106. 106. 114.]
New dataset with removed outliers has 750 samples with 9 features each.


In [None]:
# SkinThickness 이상치 제거
plot_outliers(df_clean, features[3])
df_clean = remove_outliers(df_clean, features[3], True)
plot_outliers(df_clean, features[3])

Number of outliers (inc duplicates): 85 and outliers: [45. 47. 11. 47. 11. 10. 60. 13. 13. 54. 51. 56. 14. 13. 50. 44. 12. 44.
 13. 44. 54. 14.  7. 50. 52. 10. 44. 43. 45. 14. 10. 11. 12. 43. 13. 12.
 48. 43. 43.  8. 13. 14. 12. 49. 46. 46. 11.  8. 12. 63. 12. 45. 13. 48.
 13. 10. 45.  7. 52. 49. 43. 14. 47. 99. 46. 11. 50. 45. 14. 13. 13. 47.
 12. 48. 43. 46. 45. 10. 46. 49. 11. 13. 46. 44. 48.]
New dataset with removed outliers has 665 samples with 9 features each.


In [27]:
# Insulin 이상치 제거
df_clean = remove_outliers(df_clean, features[4], True)
plot_outliers(df_clean, features[4])

Number of outliers (inc duplicates): 41 and outliers: [846. 300. 342. 304. 495. 325. 284. 485. 285. 495. 318. 280. 271. 478.
 744. 370. 680. 402. 375. 360. 325. 293. 285. 275. 310. 474. 277. 328.
 480. 274. 330. 600. 293. 272. 321. 440. 540. 480. 335. 291. 392.]
New dataset with removed outliers has 624 samples with 9 features each.


In [29]:
# BMI 이상치 제거
df_clean = remove_outliers(df_clean, features[5], True)
plot_outliers(df_clean, features[5])

Number of outliers (inc duplicates): 6 and outliers: [52.3 52.9 48.3 57.3 49.6 49.3]
New dataset with removed outliers has 618 samples with 9 features each.


In [None]:
# DiabetesPedigreeFunction 이상치 제거
df_clean = remove_outliers(df_clean, features[6], True)
plot_outliers(df_clean, features[6])

Number of outliers (inc duplicates): 27 and outliers: [2.288 1.441 1.893 1.781 1.222 1.4   1.189 1.321 1.224 1.318 1.213 1.353
 1.224 1.391 1.476 1.731 1.268 1.191 1.159 1.251 1.699 1.282 1.698 1.461
 1.292 1.174 1.182]
New dataset with removed outliers has 591 samples with 9 features each.


In [32]:
# Age 이상치 제거
df_clean = remove_outliers(df_clean, features[7], True)
plot_outliers(df_clean, features[7])

Number of outliers (inc duplicates): 9 and outliers: [69 67 72 81 67 67 70 68 69]
New dataset with removed outliers has 582 samples with 9 features each.


In [33]:
# Outcome 이상치 제거
df_clean = remove_outliers(df_clean, features[8], True)
plot_outliers(df_clean, features[8])

Number of outliers (inc duplicates): 0 and outliers: []
New dataset with removed outliers has 582 samples with 9 features each.


In [34]:
print(
'''Original df.shape: {},

-- New df.shape: {},
-- {} rows removed
-- which is {}% of our data'''.format(df.shape[0],df_clean.shape[0],
                                        df.shape[0]-df_clean.shape[0],
                                        (df.shape[0]-df_clean.shape[0])/df.shape[0]*100))

Original df.shape: 768,

-- New df.shape: 582,
-- 186 rows removed
-- which is 24.21875% of our data


#### MinMaxScaling

In [39]:
scaled_data = minmax_scaling(df_clean, columns=list(df.columns)[:-1])
scaled_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.461538,0.684211,0.466667,0.740741,0.604331,0.518519,0.510223,0.644444
1,0.076923,0.269737,0.366667,0.518519,0.340551,0.282828,0.253717,0.222222
2,0.615385,0.914474,0.333333,0.629630,0.604331,0.171717,0.552045,0.244444
3,0.076923,0.296053,0.366667,0.296296,0.307087,0.333333,0.082714,0.000000
4,0.384615,0.473684,0.500000,0.444444,0.340551,0.249158,0.114312,0.200000
...,...,...,...,...,...,...,...,...
577,0.692308,0.296053,0.300000,0.444444,0.340551,0.144781,0.059480,0.266667
578,0.153846,0.513158,0.433333,0.444444,0.340551,0.626263,0.243494,0.133333
579,0.384615,0.506579,0.466667,0.296296,0.377953,0.269360,0.155204,0.200000
580,0.076923,0.539474,0.266667,0.629630,0.604331,0.400673,0.251859,0.577778


## 2. 모델링

### 1) 모델 로드

In [43]:
# 이진 분류(Binary Classification)
class CustomModel(nn.Module):

    def __init__(self, input_dim):
        super().__init__() # 부모 클래스(nn.Module) 초기화
        self.layer1 = nn.Linear(input_dim, 8)
        self.layer2 = nn.Linear(8, 4)
        self.layer3 = nn.Linear(4, 1) # (4, 2)로 한다면 Sigmoid 대신 Softmax로!
        self.leaky_relu = nn.LeakyReLU(negative_slope=0.01)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.layer1(x)
        x = self.leaky_relu(x)
        x = self.layer2(x)
        x = self.leaky_relu(x)
        x = self.layer3(x)
        x = self.sigmoid(x)
        return x

In [44]:
# 모델 생성
def build_model(input_dim):
    model = CustomModel(input_dim)
    optimizer = optim.Adam(model.parameters(), lr=1e-3, eps=1e-7)
    criterion = nn.BCELoss() # 이진 분류 손실 함수(BCE: Binary Cross Entropy)

    return model, optimizer, criterion

In [48]:
# scaled_data.keys()
scaled_data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [50]:
input_dim = len(scaled_data.columns)
model, optimizer, criterion = build_model(input_dim)

In [51]:
model.parameters

<bound method Module.parameters of CustomModel(
  (layer1): Linear(in_features=8, out_features=8, bias=True)
  (layer2): Linear(in_features=8, out_features=4, bias=True)
  (layer3): Linear(in_features=4, out_features=1, bias=True)
  (leaky_relu): LeakyReLU(negative_slope=0.01)
  (sigmoid): Sigmoid()
)>

### 2) 데이터셋 분할

In [53]:
# 훈련+검증/테스트
x_temp, x_test, y_temp, y_test = train_test_split(scaled_data, df_clean['Outcome'], test_size=0.2, random_state=42)

# 훈련/검증
x_train, x_val, y_train, y_val = train_test_split(x_temp, y_temp, test_size=0.25, random_state=42)

In [55]:
# 데이터 변환 (pandas numpy -> tensor -> dataloader)
train_data = torch.tensor(x_train.values, dtype=torch.float32)
train_target = torch.tensor(y_train.values, dtype=torch.float32)
train_dataset = TensorDataset(train_data, train_target)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_data = torch.tensor(x_val.values, dtype=torch.float32)
val_target = torch.tensor(y_val.values, dtype=torch.float32)
val_dataset = TensorDataset(val_data, val_target)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

test_data = torch.tensor(x_test.values, dtype=torch.float32)
test_target = torch.tensor(y_test.values, dtype=torch.float32)
test_dataset = TensorDataset(test_data, test_target)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

### 3) 모델 학습

In [None]:
# Checkpoint: 원하는 경로에 val_loss 기준으로 모델 저장
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

# 체크포인트 저장
def save_checkpoint(epoch, model, optimizer):
    model_save_path = os.path.join(model_save_dir, f'NN-{epoch:04d}.pth')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, model_save_path)

In [None]:
# 모델 평가
def evaluate_model(model, dataloader, criterion):
    
    # 모델을 평가 모드로 설정
    model.eval()
    
    total_loss = 0.0
    total_accuracy = 0.0
    total_precision = 0.0
    total_recall = 0.0
    total_f1 = 0.0

    with torch.no_grad(): # 가중치 업데이트 X
        for inputs, labels in dataloader:
            outputs = model(inputs) # 모델 예측값
            loss = criterion(outputs, labels.view(-1, 1)) # 손실 함수 계산

            predictions = (outputs>=0.5).float() # if 0.7 -> True -> 1.0 / 0.3 -> False -> 0.0
            total_accuracy += torch.mean((predictions==labels.view(-1, 1)).float()).item()

            tp = torch.sum(predictions*labels.view(-1, 1)).float() # True Positive
            fp = torch.sum(predictions)-tp # False Positive
            fn = torch.sum(labels.view(-1, 1))-tp # False Negative

            precision = tp/(tp + fp + 1e-8) # 정밀도 (TP/(TP+FP))
            recall = tp/(tp + fn + 1e-8) # 재현율 (TP/(TP+FN))
            f1 = 2*(precision*recall)/(precision+recall+1e-8) # F1-score

            total_precision += precision.item()
            total_recall += recall.item()
            total_f1 += f1.item()
            total_loss += loss.item()

    # 누적된 값들의 평균 계산
    average_loss = total_loss/len(dataloader)
    average_accuracy = total_accuracy/len(dataloader)
    average_precision = total_precision/len(dataloader)
    average_f1 = total_f1/len(dataloader)
    average_recall = total_recall/len(dataloader)

    return average_loss, average_accuracy, average_f1, average_precision, average_recall

In [58]:
# 모델 학습
train_history = {'val_loss': [], 'val_accuracy': [], 'val_f1': [], 'val_precision': [], 'val_recall': []}
best_val_accuracy = 0.0
best_val_loss = float('inf')
current_epoch = 0

for epoch in range(current_epoch, 1000):

    # 모델을 학습 모드로 설정
    model.train()

    # 배치 단위로 학습
    for inputs, labels in train_loader:
        optimizer.zero_grad() # 기울기 초기화
        outputs = model(inputs) # 모델 예측
        loss = criterion(outputs, labels.view(-1, 1)) # 손실 함수 계산
        loss.backward() # 역전파
        optimizer.step() # 가중치 업데이트

    # 검증 (학습 결과 기록)
    with torch.no_grad():
        # 평가 모드
        model.eval()

        # 검증 데이터로 모델 평가 및 검증 지표 계산
        val_loss, val_accuracy, val_f1, val_precision, val_recall = evaluate_model(model, val_loader, criterion)

        # train_history에 값 기록
        train_history['val_loss'].append(val_loss)
        train_history['val_accuracy'].append(val_accuracy)
        train_history['val_f1'].append(val_f1)
        train_history['val_precision'].append(val_precision)
        train_history['val_recall'].append(val_recall)

    # loss 가 개선될때 마다 체크포인트 저장
    if val_loss<best_val_loss:
        best_val_loss = val_loss
        save_checkpoint(epoch, model, optimizer)

    # 최고 검증 정확도 추적
    if val_accuracy>best_val_accuracy:
        best_val_accuracy = val_accuracy

In [61]:
# 최고 점수
def max_metric(history):
    max_val_accuracy = max(history['val_accuracy'])
    min_val_loss = min(history['val_loss'])
    max_val_loss = max(history['val_loss']) # 최대 손실(가장 나쁜 성능) 확인

    print('Maximum Validation Accuracy:', max_val_accuracy)
    print('Minimum Validation Binary CrossEntropy Loss:', min_val_loss)
    
    print('\nMaximum Validation Binary CrossEntropy Loss:', max_val_loss)

max_metric(train_history)

Maximum Validation Accuracy: 0.9095982164144516
Minimum Validation Binary CrossEntropy Loss: 0.30409842543303967

Maximum Validation Binary CrossEntropy Loss: 0.6216293722391129


### 4) 성능 평가

In [62]:
# 테스트 데이터셋 성능 평가
def evaluate(model, dataloader, criterion):

    model.eval()
    total_loss = 0.0
    total_accuracy = 0.0

    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs)
            loss = criterion(outputs, labels.view(-1, 1))

            predictions = (outputs>=0.5).float()
            total_accuracy += torch.mean((predictions==labels.view(-1, 1)).float()).item()
            total_loss += loss.item()

    average_loss = total_loss/len(dataloader)
    average_accuracy = total_accuracy/len(dataloader)

    return average_loss, average_accuracy

In [67]:
best_path = sorted(os.listdir(model_save_dir))[-1]
model.load_state_dict(torch.load(f'{model_save_dir}/{best_path}')['model_state_dict'])
test_loss, test_acc = evaluate(model, test_loader, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.417 | Test Acc: 83.89%
