### 선형 분류 - 로지스틱 회귀 (Logistic Regression)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

from adamp import AdamP
from adamp import SGDP

from torch.utils.data import DataLoader, Dataset

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

#### 데이터셋을 만들기 - make_classification 함수를 사용하여 무작위로 데이터셋을 만들기

In [2]:
x, y  = make_classification(
    n_samples=3000,         # 생성할 데이터 수 
    n_features=5,           # 독립변수 수
    n_informative=2,        # 독립변수 수 중에서 실제로 유의미한 의미가 있는 변수 개수
    n_redundant=0,          # 독립변수 중에서 다른 독립 변수로부터 파생된 불필요한 독립변수 계수
    n_clusters_per_class=1, # 클래스당 클러스트 계수
    random_state=42         # 난수 생성 발생기의 시드값
)

print(x,y)

[[-0.02439857 -0.57540077  1.26796049 -1.42222965 -0.9629849 ]
 [-1.07638119  0.3872175   1.08299994 -0.67379011 -2.65098736]
 [-1.12984986 -0.26922916  1.12735009 -0.82383687 -1.70574586]
 ...
 [-0.53797853  0.26401859 -0.48915618  0.4664446  -1.57451325]
 [ 0.01920342  0.9761859  -0.14717165 -1.51725386  2.31873002]
 [-0.37051336  0.93603022 -0.62133172 -0.23084897  1.66473405]] [0 0 0 ... 0 1 1]


#### 데이터셋을 Train, Test set으로 나누기

In [3]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(len(X_train), len(X_test), len(y_train), len(y_test))

2400 600 2400 600


#### Pytorch의 Dataset과 Dataloader를 사용하기 위한 Custom Dataset 구현

In [4]:
class MyCustomDataset(Dataset) :
    def __init__(self, x, y):
        self.x = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self) :
        return len(self.x)

# 데이터셋 정의
train_dataset = MyCustomDataset(X_train, y_train)
test_dataset = MyCustomDataset(X_test, y_test) 

# 데이터 로드 정의
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

# 디버깅 코드
# test = MyCustomDataset(X_test, y_test)
# for i in test : 
#     print(i)

for index, (datas, labels) in enumerate(test_loader) :
    print(index, datas, labels)

0 tensor([[ 0.0929,  0.5871, -0.0648, -0.5499,  0.2601],
        [ 1.6688,  1.2611,  0.2186, -1.5286,  1.3228],
        [-1.1512,  1.3830,  0.6312,  1.1050, -0.9276],
        [ 0.3363,  1.3278, -1.4711, -0.7370, -1.0695],
        [ 0.1424,  1.2814,  1.3887,  0.3814, -0.4880],
        [ 0.9459,  0.5252, -0.3688,  0.1981,  1.1448],
        [ 2.3017,  0.6260, -0.5517, -0.7631,  1.2242],
        [ 0.1415,  0.2922,  0.4945, -0.1258,  0.2997],
        [ 0.1739,  1.9702, -0.6585, -0.8484,  0.8031],
        [ 0.2991,  0.0198, -0.1113,  0.8332, -1.5018],
        [-0.7635,  0.0621, -0.2997,  0.3835, -1.0306],
        [-1.9838,  0.7791,  0.2252, -0.5202,  0.8284],
        [ 1.5942,  1.4567, -1.5435,  0.1035, -0.5955],
        [ 0.0543,  0.5541,  0.3620,  0.7630, -0.4857],
        [-0.9241,  2.1055, -0.7200, -1.2535, -0.4082],
        [-1.0153,  0.3670,  0.4214,  0.6232,  0.4880],
        [-0.8963,  0.9129,  1.3385, -0.4856,  1.9088],
        [ 1.4714,  1.5626,  0.6962,  0.9075,  1.5130],
        

#### 모델 정의

In [5]:
class LogisticRegression(nn.Module) :
    def __init__(self, input_dim) :
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
    
    def forward(self, x) :
        out = self.linear(x)
        out = torch.sigmoid(out)
        return out

model = LogisticRegression(input_dim=5)
print(model)

LogisticRegression(
  (linear): Linear(in_features=5, out_features=1, bias=True)
)


#### 모델을 학습시키기 전에 학습에 필요한 Loss function, optimizer 선언

In [6]:
criterion = nn.BCELoss() # 사용한 이유는 0과 1 분류 이진분류하기 때문
optimizer = SGDP(model.parameters(), lr=0.01, weight_decay=1e-5, momentum=0.9, nesterov=True)
print(optimizer)

SGDP (
Parameter Group 0
    dampening: 0
    delta: 0.1
    eps: 1e-08
    lr: 0.01
    momentum: 0.9
    nesterov: True
    wd_ratio: 0.1
    weight_decay: 1e-05
)


#### Train 코드

In [7]:
num_epochs = 100
for epoch in range(num_epochs) :
    for inputs, targets in train_loader :
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets.unsqueeze(1))
        loss.backward()
        optimizer.step()
    
    if epoch % 10 == 0 :
        print(f"Epoch : [{epoch+1}/{num_epochs}], Loss : [{loss.item():.4f}]")

Epoch : [1/100], Loss : [0.2898]
Epoch : [11/100], Loss : [0.1417]
Epoch : [21/100], Loss : [0.3312]
Epoch : [31/100], Loss : [0.2389]
Epoch : [41/100], Loss : [0.1977]
Epoch : [51/100], Loss : [0.2509]
Epoch : [61/100], Loss : [0.1538]
Epoch : [71/100], Loss : [0.2276]
Epoch : [81/100], Loss : [0.2548]
Epoch : [91/100], Loss : [0.1858]


#### Evaluation 코드

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using Device : {device}")

model.eval()
with torch.no_grad() :
    correct = 0 
    total = 0
    for test_inputs, test_targets in test_loader : 
        test_input, test_target = test_inputs.to(device), test_targets.to(device)
        outputs_test = model(test_input)
        _, pred_test = torch.max(outputs_test, 1)
        total += test_targets.size(0)
        correct += (pred_test == test_targets).sum().item()
    print("Acc : %d%%"%(100*correct/total))

Using Device : cpu
Acc : 48%
