# 9. Neural Networks for Gender Classification

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

## 9.1 Prepare Data

In [3]:
# Dataset source: https://www.data.go.kr/dataset/15007122/fileData.do
# The CSV file is decoded as EUC-KR.
data = pd.read_csv(
    "data/NHIS_OPEN_GJ_2017.csv",
    encoding="euc-kr",
)

In [4]:
# Column notes:
#   Age group: 14 groups in total; ages 20-84 are grouped in 5-year bands, 85 and over is grouped as 85+
#   Smoking status: 1 (does not smoke), 2 (smoked before but quit), 3 (currently smokes)
#   Drinking: 0 (does not drink), 1 (drinks)
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,기준년도,가입자일련번호,성별코드,연령대코드(5세단위),시도코드,신장(5Cm단위),체중(5Kg 단위),허리둘레,시력(좌),시력(우),...,감마지티피,흡연상태,음주여부,구강검진 수검여부,치아우식증유무,결손치유무,치아마모증유무,제3대구치(사랑니)이상,치석,데이터공개일자
0,2017,1,1,13,46,170.0,65.0,91.0,1.0,1.2,...,25.0,3.0,0.0,1,,,,,1.0,20181126
1,2017,2,2,8,41,150.0,45.0,73.4,1.2,1.0,...,10.0,1.0,0.0,1,,,,,1.0,20181126
2,2017,3,1,8,45,175.0,75.0,94.0,1.0,0.8,...,136.0,1.0,0.0,1,,,,,0.0,20181126
3,2017,4,2,12,11,155.0,55.0,67.5,0.9,1.0,...,30.0,1.0,1.0,0,,,,,,20181126
4,2017,5,1,8,41,175.0,75.0,93.0,1.5,1.5,...,68.0,3.0,0.0,0,,,,,,20181126


In [5]:
# Keep six columns, picked by position: sex code, age-group code, height,
# weight, smoking status, drinking status.
selected_positions = [2, 3, 5, 6, 25, 26]
df = data.iloc[:, selected_positions]

In [6]:
# Check for missing values: print the row count, then show which columns
# contain at least one missing entry.
row_count = len(df)
print("The number of data :", row_count)
df.isna().any()

The number of data : 1000000


성별코드           False
연령대코드(5세단위)    False
신장(5Cm단위)       True
체중(5Kg 단위)      True
흡연상태            True
음주여부            True
dtype: bool

In [7]:
# Remove every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

In [8]:
# Re-check after dropping rows: remaining row count and per-column
# missing-value flags.
remaining_rows = len(df)
print("The number of data :", remaining_rows)
df.isna().any()

The number of data : 999102


성별코드           False
연령대코드(5세단위)    False
신장(5Cm단위)      False
체중(5Kg 단위)     False
흡연상태           False
음주여부           False
dtype: bool

## 9.2 Train/Test Split

In [9]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the
# split repeatable across runs.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=0,
)

In [10]:
# Column 0 is the sex code and the remaining columns are the input features.
# Shifting the sex code down by one gives y = 0 (male) or 1 (female).
train_values = df_train.values
test_values = df_test.values

x_train = torch.from_numpy(train_values[:, 1:]).float()
y_train = torch.from_numpy(train_values[:, 0]).float().sub(1)

x_test = torch.from_numpy(test_values[:, 1:]).float()
y_test = torch.from_numpy(test_values[:, 0]).float().sub(1)

In [11]:
# Pair each feature tensor with its label tensor.
train_data, test_data = (
    TensorDataset(features, labels)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)

In [12]:
batch_size = 200

# DataLoader arguments used here:
# dataset (Dataset) – dataset from which to load the data.
# batch_size (int, optional) – how many samples per batch to load (default: 1).
# shuffle (bool, optional) – set to True to have the data reshuffled at every epoch (default: False).
# num_workers (int, optional) – how many subprocesses to use for data loading. 0 means that the data will be loaded in the main process. (default: 0)
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=1)

train_loader = DataLoader(train_data, **loader_options)

## 9.3 Define Model

In [13]:
# Fully connected binary classifier: 5 inputs -> 100 -> 10 -> 1, with ReLU
# between the linear layers and a sigmoid on the single output unit.
hidden_stack = [nn.Linear(5, 100), nn.ReLU(), nn.Linear(100, 10), nn.ReLU()]
output_head = [nn.Linear(10, 1), nn.Sigmoid()]

model = nn.Sequential(*hidden_stack, *output_head)

print(model)

Sequential(
  (0): Linear(in_features=5, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=10, bias=True)
  (3): ReLU()
  (4): Linear(in_features=10, out_features=1, bias=True)
  (5): Sigmoid()
)


In [14]:
# Binary cross-entropy loss, minimised with SGD at a fixed learning rate.
learning_rate = 0.001

loss = nn.BCELoss()
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate)

## 9.4 Train Model

In [15]:
# Number of full passes over the training loader performed by the training loop.
num_epochs = 10

In [16]:
# Train the model for `num_epochs` passes over `train_loader`, printing the
# current batch loss every 2000 iterations.

# The loader keeps the final partial batch, so the number of iterations per
# epoch is the loader's own length, not len(train_data) // batch_size (which
# undercounts by one whenever the dataset size is not a multiple of the batch
# size). It does not change between epochs, so compute it once.
total_batch = len(train_loader)

# Put the model in training mode explicitly so that re-running this cell after
# the evaluation cell (which calls model.eval()) still trains in the right mode.
model.train()

for epoch in range(num_epochs):

    for i, (batch_data, batch_labels) in enumerate(train_loader):

        # Forward pass; labels are reshaped to a column so they line up with
        # the model's single-unit output.
        pre = model(batch_data)
        cost = loss(pre, batch_labels.view(-1, 1))

        # Backward pass and parameter update.
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        if (i+1) % 2000 == 0:
            print('Epoch [%d/%d], lter [%d/%d], Loss: %.4f'
                 %(epoch+1, num_epochs, i+1, total_batch, cost.item()))

print("Learning Finished!")

Epoch [1/10], lter [2000/4495], Loss: 0.3997
Epoch [1/10], lter [4000/4495], Loss: 0.3698
Epoch [2/10], lter [2000/4495], Loss: 0.3682
Epoch [2/10], lter [4000/4495], Loss: 0.3063
Epoch [3/10], lter [2000/4495], Loss: 0.2660
Epoch [3/10], lter [4000/4495], Loss: 0.2794
Epoch [4/10], lter [2000/4495], Loss: 0.2600
Epoch [4/10], lter [4000/4495], Loss: 0.2266
Epoch [5/10], lter [2000/4495], Loss: 0.2045
Epoch [5/10], lter [4000/4495], Loss: 0.2386
Epoch [6/10], lter [2000/4495], Loss: 0.2629
Epoch [6/10], lter [4000/4495], Loss: 0.2071
Epoch [7/10], lter [2000/4495], Loss: 0.2505
Epoch [7/10], lter [4000/4495], Loss: 0.2185
Epoch [8/10], lter [2000/4495], Loss: 0.1664
Epoch [8/10], lter [4000/4495], Loss: 0.1975
Epoch [9/10], lter [2000/4495], Loss: 0.2486
Epoch [9/10], lter [4000/4495], Loss: 0.2886
Epoch [10/10], lter [2000/4495], Loss: 0.2028
Epoch [10/10], lter [4000/4495], Loss: 0.2536
Learning Finished!


## 9.5 Test Model

In [17]:
# Measure classification accuracy on the held-out test set.
model.eval()

# Inference only: disable autograd so no graph is recorded.
with torch.no_grad():
    # Score the whole test set in one forward pass instead of looping over
    # individual samples. The local names deliberately avoid `data`, which
    # would overwrite the DataFrame loaded at the top of the notebook.
    test_features, test_labels = test_data.tensors
    probabilities = model(test_features)

    # Threshold at 0.5, then flatten the (N, 1) predictions to (N,) so the
    # comparison with the labels is element-wise rather than broadcast.
    pre = (probabilities > 0.5).float().view(-1)

    total = len(test_data)
    correct = (pre == test_labels).sum().item()

print('Accuracy of test images: %f %%' % (100 * float(correct) / total))

Accuracy of test images: 91.873768 %
