# **딥 러닝 기반 이진 분류 모델**

## NVIDIA GPU 세팅 확인

In [None]:
!nvidia-smi

## **데이터셋 불러오기**

In [None]:
# 사용할 라이브러리 불러오기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv("/content/smoking.csv")

### 데이터셋 살펴보기

In [None]:
df.head(10)

In [None]:
df.describe()

In [None]:
df.shape

## **입력 및 목표 (Target) 데이터 분리**

In [None]:
# Split the frame into the feature columns and the "smoking" label column.
inputData = df.drop(columns=["smoking"])
targetData = df["smoking"]

### 입력 변수 명칭 확인

In [None]:
inputData.columns

### 입력 변수 데이터 타입 확인

In [None]:
inputData.info()

### 목표 (Target) 변수 확인

In [None]:
targetData

## **데이터셋 전처리**

### One-Hot Encoding

In [None]:
inputDataOHE = inputData.loc[:, ["gender","oral","tartar"]]

In [None]:
inputDataOHE

In [None]:
inputDataOHE = pd.get_dummies(inputDataOHE)

In [None]:
inputDataOHE

### Min-Max Normalization

In [None]:
def MinMaxNorm(dataFrame) :
  """Rescale values to the [0, 1] range using min-max normalization.

  Computes ``(x - min) / (max - min)`` with ``min``/``max`` taken from
  ``dataFrame.min()`` and ``dataFrame.max()`` (per column for a pandas
  DataFrame, which is how the notebook calls it).

  A column whose values are all identical has a zero range; instead of
  producing NaN from 0/0, such a column is mapped to 0.

  Args:
    dataFrame: numeric pandas DataFrame (a Series or ndarray also works,
      in which case a single global min/max is used).

  Returns:
    An object of the same shape as ``dataFrame`` holding the rescaled values.
  """
  lower = dataFrame.min()
  span = dataFrame.max() - lower

  # Guard against division by zero for constant data: use a divisor of 1 so
  # the numerator (already 0 everywhere) yields 0 rather than NaN.
  if hasattr(span, "mask") :
    # DataFrame input: `span` is a Series of per-column ranges.
    span = span.mask(span == 0, 1)
  elif span == 0 :
    # Series / ndarray input: `span` is a single scalar range.
    span = 1

  return (dataFrame - lower) / span

In [None]:
inputDataMMN = MinMaxNorm(inputData.drop(["gender","oral","tartar"], axis=1))

In [None]:
inputDataMMN

### 전처리가 적용된 데이터셋 합치기

In [None]:
inputData = pd.concat([inputDataOHE, inputDataMMN], axis=1)

In [None]:
inputData

### 훈련 및 시험 데이터셋 분리

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
inputData, targetData = np.array(inputData, dtype=np.float64), np.array(targetData) # Boolean → Numerical Value (0 or 1)

In [None]:
xTrain, xTest, yTrain, yTest = train_test_split(inputData, targetData, test_size=0.1, random_state=42)

In [None]:
xTrain, xValid, yTrain, yValid = train_test_split(xTrain, yTrain, test_size=0.1, random_state=42)

In [None]:
print(xTrain.shape, yTrain.shape)
print(xValid.shape, yValid.shape)
print(xTest.shape, yTest.shape)

In [None]:
xTrain

## **PyTorch Dataset Class 생성**

In [None]:
import torch
from torch.utils.data import Dataset

In [None]:
class myDataLoader(Dataset) :
  """Map-style dataset that serves (feature row, label) pairs as float tensors.

  Despite its name this is a ``torch.utils.data.Dataset`` subclass; it is meant
  to be wrapped by a ``DataLoader`` for batching.
  """

  def __init__(self, inputData:np.array, targetData:np.array) :
    """Store the feature matrix and the label vector.

    Args:
      inputData: 2-D array indexed as ``inputData[index, :]``.
      targetData: 1-D array indexed as ``targetData[index]``.
    """
    super().__init__()
    self.inputData, self.targetData = inputData, targetData

  def __getitem__(self, index) :
    """Return one sample as ``{"input": float tensor, "target": float tensor}``.

    The label gets a leading axis (``unsqueeze(0)``) so that a batch of labels
    lines up with the model's single-column output.
    """
    features = torch.as_tensor(self.inputData[index, :]).float()
    label = torch.as_tensor(self.targetData[index]).unsqueeze(0).float()

    return {"input":features, "target":label}

  def __len__(self) :
    """Number of samples, taken from the feature matrix."""
    return len(self.inputData)

## **PyTorch 분류 모델 생성**

In [None]:
from torch import nn
import torch.nn.functional as F

In [None]:
class myModel(nn.Module) :
  """Four-layer MLP: inputDim -> channels -> 2*channels -> channels -> targetDim.

  ReLU follows every layer except the last, so the output is a raw score
  (no activation is applied to it here).
  """

  def __init__(self, inputDim:int, targetDim:int, channels:int) :
    """Build the linear layers.

    Args:
      inputDim: number of input features.
      targetDim: number of output units.
      channels: base hidden width; the middle layer is twice as wide.
    """
    super().__init__()

    wide = channels*2

    # Attribute names and creation order are kept stable: they define the
    # state_dict keys and the order in which random initial weights are drawn.
    self.layer0 = nn.Linear(inputDim, channels)
    self.layer1 = nn.Linear(channels, wide)
    self.layer2 = nn.Linear(wide, channels)
    self.layer3 = nn.Linear(channels, targetDim)

  def forward(self, input) :
    """Run the forward pass and return the un-activated output of the last layer."""
    hidden = input
    for layer in (self.layer0, self.layer1, self.layer2) :
      hidden = F.relu(layer(hidden))

    return self.layer3(hidden)

## **훈련 하이퍼파라미터 생성**

In [None]:
# Training configuration shared by train() and inference().
opt = {"seed":42,  # passed to fixSeed() for reproducibility
       "batchSize":128,  # mini-batch size for the train/valid DataLoaders
       "lr":1e-3,  # learning rate given to the Adam optimizer
       "epochs":50,  # number of passes over the training set
       "isCUDA":torch.cuda.is_available()}  # move model/tensors to GPU when one is available

## **딥 러닝 모델 훈련**

In [None]:
# 사용할 라이브러리 불러오기
from torch.utils.data import DataLoader
from torch import optim

from torchsummary import summary

from tqdm import tqdm

### Seed 고정

In [None]:
import random

In [None]:
def fixSeed(seed) :
  """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with the same value.

  Also switches cuDNN to deterministic mode and turns off its benchmark
  auto-tuner.

  Args:
    seed: integer seed applied to every generator.
  """
  seeders = (random.seed,
             np.random.seed,
             torch.manual_seed,
             torch.cuda.manual_seed,
             torch.cuda.manual_seed_all)
  for applySeed in seeders :
    applySeed(seed)

  cudnn = torch.backends.cudnn
  cudnn.deterministic = True
  cudnn.benchmark = False

### **훈련 과정 요약을 위한 Average Meter 클래스 생성**

In [None]:
class AverageMeter(object):
    """Keeps the most recent value and the running weighted mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the last value, the mean, the weighted sum and the weight count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record a new observation.

        Args:
            val: the observed value (e.g. a batch-mean loss).
            n: weight of the observation, typically the number of samples
                that ``val`` was averaged over.
        """
        self.val = val
        self.count = self.count + n
        self.sum = self.sum + val * n
        self.avg = self.sum / self.count

### **정확도 계산을 위한 Function 생성**

In [None]:
def computeAcc(pred, target) :
  """Fraction of predictions that match the binary targets.

  Scores greater than 0 count as class 1, everything else as class 0.

  Args:
    pred: tensor of raw scores; its first dimension is used as the sample count.
    target: tensor of 0/1 labels, expected to have the same shape as ``pred``.

  Returns:
    A 0-dim tensor: number of matching elements divided by ``pred.size(0)``.
  """
  hardPred = (pred > 0).long()
  numCorrect = (hardPred == target).sum()

  return numCorrect / hardPred.size(0)

## **추상화된 딥 러닝 모델 훈련 코드 작성**

In [None]:
def train(opt, dataset, criterion) :
  """Train a ``myModel`` classifier and track loss/accuracy per epoch.

  Args:
    opt: dict with the keys ``"seed"``, ``"batchSize"``, ``"lr"``, ``"epochs"``
      and ``"isCUDA"``.
    dataset: dict with the arrays ``"xTrain"``, ``"yTrain"``, ``"xValid"`` and
      ``"yValid"``.
    criterion: loss callable invoked as ``criterion(pred, target)`` on the raw
      model output.

  Returns:
    ``((trainLossList, validLossList), (trainAccList, validAccList))`` where
    every list holds one averaged value per epoch.

  Side effects:
    Prints a model summary, shows tqdm progress bars, and writes
    ``bestModel.pth`` (highest validation accuracy so far) and
    ``latestModel.pth`` (after every epoch) to the working directory.
  """
  fixSeed(opt["seed"])

  trainDataLoader = DataLoader(myDataLoader(dataset["xTrain"], dataset["yTrain"]), batch_size=opt["batchSize"], shuffle=True, drop_last=True)
  validDataLoader = DataLoader(myDataLoader(dataset["xValid"], dataset["yValid"]), batch_size=opt["batchSize"], shuffle=False, drop_last=False)

  # Seed again immediately before the model is built.
  fixSeed(opt["seed"])
  # Size the input layer from the data that was passed in, not from a
  # notebook-global variable.
  inputDim = dataset["xTrain"].shape[1]
  model = myModel(inputDim, 1, 64)
  if opt["isCUDA"] :
    model = model.cuda()

  summary(model, (1, inputDim))

  optimizer = optim.Adam(model.parameters(), lr=opt["lr"])

  trainLoss, validLoss = AverageMeter(), AverageMeter()
  trainAcc, validAcc = AverageMeter(), AverageMeter()
  trainLossList, validLossList = [], []
  trainAccList, validAccList = [], []
  # Start below any reachable accuracy so a best checkpoint is always written.
  bestAcc = float("-inf")

  for epoch in range(1, opt["epochs"]+1) :
    # ---- Training phase ----
    # Switch back to training mode; the validation phase below leaves the
    # model in eval mode.
    model.train()
    trainLoss.reset()
    trainAcc.reset()
    trainBar = tqdm(trainDataLoader)

    for data in trainBar :
      inputs, targets = data["input"], data["target"]
      if opt["isCUDA"] :
        inputs, targets = inputs.cuda(), targets.cuda()

      optimizer.zero_grad()
      pred = model(inputs)
      loss = criterion(pred, targets)
      loss.backward()
      optimizer.step()

      # Weight the running means by the real number of samples in the batch.
      numSamples = inputs.size(0)
      trainLoss.update(loss.item(), numSamples)
      trainAcc.update(computeAcc(pred, targets).item(), numSamples)
      trainBar.set_description(desc=f"[{epoch}/{opt['epochs']}] [Train] < Accuracy:{trainAcc.avg:.6f} | Loss:{trainLoss.avg:.6f} >")

    trainLossList.append(trainLoss.avg)
    trainAccList.append(trainAcc.avg)

    # ---- Validation phase ----
    model.eval()
    validLoss.reset()
    validAcc.reset()
    validBar = tqdm(validDataLoader)

    with torch.no_grad() :
      for data in validBar :
        inputs, targets = data["input"], data["target"]
        if opt["isCUDA"] :
          inputs, targets = inputs.cuda(), targets.cuda()

        pred = model(inputs)
        loss = criterion(pred, targets)

        # The last validation batch may be smaller than batchSize
        # (drop_last=False), so use its actual size as the weight.
        numSamples = inputs.size(0)
        validLoss.update(loss.item(), numSamples)
        validAcc.update(computeAcc(pred, targets).item(), numSamples)
        validBar.set_description(desc=f"[{epoch}/{opt['epochs']}] [Valid] < Accuracy:{validAcc.avg:.6f} | Loss:{validLoss.avg:.6f} >")

    validLossList.append(validLoss.avg)
    validAccList.append(validAcc.avg)

    # Keep the weights of the epoch with the best validation accuracy.
    if validAcc.avg > bestAcc :
      bestAcc = validAcc.avg
      torch.save(model.state_dict(), "bestModel.pth")

    torch.save(model.state_dict(), "latestModel.pth")

  return (trainLossList, validLossList), (trainAccList, validAccList)

## **딥 러닝 모델 훈련 진행**

In [None]:
lossList, accList = train(opt,
                          {"xTrain":xTrain, "yTrain":yTrain, "xValid":xValid, "yValid":yValid},
                          nn.BCEWithLogitsLoss())

### 훈련 및 검증 손실 함수 시각화

In [None]:
plt.figure(figsize=(20,10))

plt.plot(np.arange(0, opt["epochs"], 1), lossList[0], label="Training Loss")
plt.plot(np.arange(0, opt["epochs"], 1), lossList[1], label="Validation Loss")

plt.xlabel("Epoch")
plt.ylabel("BCE Loss")
plt.legend(loc="best")

plt.show()

### 훈련 및 검증 정확도 시각화

In [None]:
plt.figure(figsize=(20,10))

plt.plot(np.arange(0, opt["epochs"], 1), accList[0], label="Training Accuracy")
plt.plot(np.arange(0, opt["epochs"], 1), accList[1], label="Validation Accuracy")

plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(loc="best")

plt.show()

## **추상화된 딥 러닝 모델 추론 코드 작성**

In [None]:
def inference(opt, inputData, modelPath) :
  """Run a saved ``myModel`` over every row of ``inputData``.

  Args:
    opt: dict; only ``opt["isCUDA"]`` is read, to decide whether to run on GPU.
    inputData: 2-D array of samples (rows) by features (columns). The feature
      count must match the one the checkpoint was trained with.
    modelPath: path to a ``state_dict`` file written by ``torch.save``.

  Returns:
    A list with one raw model output (Python float) per input row. The values
    are un-thresholded scores; the caller decides the class from them.
  """
  # Load onto CPU first so a checkpoint saved from a GPU run can still be
  # opened on a CPU-only runtime; the model is moved to GPU below if requested.
  weights = torch.load(modelPath, map_location="cpu")

  # Size the input layer from the data being scored instead of relying on a
  # notebook-global training array.
  model = myModel(inputData.shape[1], 1, 64)
  model.load_state_dict(weights)
  if opt["isCUDA"] :
    model = model.cuda()
  model.eval()

  inputDataTensor = torch.as_tensor(inputData).float()
  if opt["isCUDA"] :
    # One transfer for the whole array instead of one per sample.
    inputDataTensor = inputDataTensor.cuda()

  predList = []

  with torch.no_grad() :
    with tqdm(total=inputData.shape[0]) as pBar :
      for sample in inputDataTensor :
        pred = model(sample)
        predList.append(pred.cpu().item())

        pBar.update()

  return predList

### 시험 데이터셋 추론 진행

In [None]:
predList = inference(opt, xTest, "/content/bestModel.pth")

In [None]:
predList

In [None]:
predList = np.where(np.array(predList)>0, 1, 0) # Threshold

In [None]:
predList

### **시험 데이터셋 추론 결과의 정량적 분석 진행**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
accScore = accuracy_score(yTest, predList)
print(accScore)

In [None]:
cm = confusion_matrix(yTest, predList)
print(cm)

In [None]:
clsRp = classification_report(yTest, predList)
print(clsRp)