In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random

In [2]:
# for reproducibility
random.seed(777)
torch.manual_seed(777)

<torch._C.Generator at 0x7fd984228730>

### Data Load


In [3]:
import pandas as pd
import torch

# 데이터 불러오기
data = pd.read_csv('./PyTorch/user_behavior_dataset.csv', delimiter=',', encoding='utf-8')
print(data['Operating System'].unique()) # 유일값만 추출
print(data['Gender'].unique())
print(data['User Behavior Class'].unique())
data

['Android' 'iOS']
['Male' 'Female']
[4 3 2 5 1]


Unnamed: 0,User ID,Device Model,Operating System,App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender,User Behavior Class
0,1,Google Pixel 5,Android,393,6.4,1872,67,1122,40,Male,4
1,2,OnePlus 9,Android,268,4.7,1331,42,944,47,Female,3
2,3,Xiaomi Mi 11,Android,154,4.0,761,32,322,42,Male,2
3,4,Google Pixel 5,Android,239,4.8,1676,56,871,20,Male,3
4,5,iPhone 12,iOS,187,4.3,1367,58,988,31,Female,3
5,6,Google Pixel 5,Android,99,2.0,940,35,564,31,Male,2
6,7,Samsung Galaxy S21,Android,350,7.3,1802,66,1054,21,Female,4
7,8,OnePlus 9,Android,543,11.4,2956,82,1702,31,Male,5
8,9,Samsung Galaxy S21,Android,340,7.7,2138,75,1053,42,Female,4
9,10,iPhone 12,iOS,424,6.6,1957,75,1301,42,Male,4


### Data 전처리

- Device Model 제거
- Operating System 0: 안드로이드, 1: IOS 로 처리
- 성별 0: 남자, 1: 여자
- 데이터 정규화(-1 ~ 1)

In [4]:
data = pd.read_csv('./PyTorch/user_behavior_dataset.csv', delimiter=',', encoding='utf-8')

OS_column = data['Operating System'].unique()
Gender_column = data['Gender'].unique()

# Device Model,User ID 제거
data = data.drop('Device Model', axis=1)
data = data.drop('User ID', axis=1)
# '안드로이드'는 0으로, 'ios'는 1로 변환
data['Operating System'] = data['Operating System'].replace({OS_column[0]: 0, OS_column[1]: 1})
# 남자는 0 여자는 1
data['Gender'] = data['Gender'].replace({Gender_column[0]: 0, Gender_column[1]: 1})
# MinMaxScaler
# 정규화 수식: (x - min) / (max - min)으로 데이터 값을 0과 1 사이로 변환
data.iloc[:,:-1] = (data.iloc[:,:-1] - data.iloc[:,:-1].min())/(data.iloc[:,:-1].max() - data.iloc[:,:-1].min())
print(data.shape)
data

(700, 9)


Unnamed: 0,Operating System,App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender,User Behavior Class
0,0.0,0.639085,0.490909,0.583426,0.640449,0.425887,0.536585,0.0,4
1,0.0,0.419014,0.336364,0.382386,0.359551,0.351566,0.707317,1.0,3
2,0.0,0.218310,0.272727,0.170569,0.247191,0.091858,0.585366,0.0,2
3,0.0,0.367958,0.345455,0.510591,0.516854,0.321086,0.048780,0.0,3
4,1.0,0.276408,0.300000,0.395764,0.539326,0.369937,0.317073,1.0,3
5,0.0,0.121479,0.090909,0.237087,0.280899,0.192902,0.317073,0.0,2
6,0.0,0.563380,0.572727,0.557414,0.629213,0.397495,0.073171,1.0,4
7,0.0,0.903169,0.945455,0.986250,0.808989,0.668058,0.317073,0.0,5
8,0.0,0.545775,0.609091,0.682274,0.730337,0.397077,0.585366,1.0,4
9,1.0,0.693662,0.509091,0.615013,0.730337,0.500626,0.585366,0.0,4


### 학습 데이터와 테스트 데이터로 나누기

In [5]:
train_cut = 500

# 학습용 데이터
x_train = data.iloc[0:train_cut, :-1].values  # .values로 numpy 배열로 변환
y_train = data.iloc[0:train_cut, -1].values - 1 # 1부터 시작하기에 모든 값에 -1

# 테스트용 데이터
x_test = data.iloc[train_cut:, :-1].values
y_test = data.iloc[train_cut:, -1].values

# 각 데이터셋의 크기 출력
print("data.shape: ",data.shape)
print("x_train.shape: ", x_train.shape)
print("y_train.shape: ", y_train.shape)
print("x_test.shape: ", x_test.shape)
print("y_test.shape: ", y_test.shape)

# numpy 배열을 PyTorch 텐서로 변환
x_train_tensor = torch.FloatTensor(x_train)
y_train_tensor = torch.LongTensor(y_train)
x_test_tensor = torch.FloatTensor(x_train)
y_test_tensor = torch.LongTensor(y_train)

# 텐서 크기 출력
print("x_train_tensor.shape: ", x_train_tensor.shape)
print("y_train_tensor.shape: ", y_train_tensor.shape)

print("x_test_tensor.shape: ", x_test_tensor.shape)
print("y_test_tensor.shape: ", y_test_tensor.shape)

data.shape:  (700, 9)
x_train.shape:  (500, 8)
y_train.shape:  (500,)
x_test.shape:  (200, 8)
y_test.shape:  (200,)
x_train_tensor.shape:  torch.Size([500, 8])
y_train_tensor.shape:  torch.Size([500])
x_test_tensor.shape:  torch.Size([500, 8])
y_test_tensor.shape:  torch.Size([500])


### Classfication Model 만들기

In [6]:
input_node_num = x_train.shape[1]
ouptu_node_num = len(data['User Behavior Class'].unique())
print(input_node_num,ouptu_node_num)

class SoftmaxClassifierModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(input_node_num, ouptu_node_num)
    def forward(self, x):
        return self.linear(x)
    
#hyper parameter
learning_rate = 0.1
epoch = 5000
model = SoftmaxClassifierModel()
# optimizer 설정
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

8 5


In [7]:
def train(model, optimizer, x_train, y_train, epoch):
    nb_epochs = epoch
    for epoch in range(nb_epochs):

        # H(x) 계산
        prediction = model(x_train)

        # cost 계산
        cost = F.cross_entropy(prediction, y_train)

        # cost로 H(x) 개선
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        if(epoch % 10 == 0):
            print('Epoch {:4d}/{} Cost: {:.6f}'.format(
                epoch, nb_epochs, cost.item()
            ))

In [8]:
def test(model, optimizer, x_test, y_test):
    prediction = model(x_test)
    predicted_classes = prediction.max(1)[1]
    correct_count = (predicted_classes == y_test).sum().item()
    cost = F.cross_entropy(prediction, y_test)

    print('Accuracy: {}% Cost: {:.6f}'.format(
         correct_count / len(y_test) * 100, cost.item()
    ))

In [9]:
train(model,optimizer,x_train_tensor,y_train_tensor,epoch)

Epoch    0/5000 Cost: 1.565584
Epoch   10/5000 Cost: 1.506466
Epoch   20/5000 Cost: 1.464320
Epoch   30/5000 Cost: 1.429555
Epoch   40/5000 Cost: 1.398813
Epoch   50/5000 Cost: 1.370916
Epoch   60/5000 Cost: 1.345345
Epoch   70/5000 Cost: 1.321783
Epoch   80/5000 Cost: 1.299986
Epoch   90/5000 Cost: 1.279754
Epoch  100/5000 Cost: 1.260917
Epoch  110/5000 Cost: 1.243323
Epoch  120/5000 Cost: 1.226842
Epoch  130/5000 Cost: 1.211363
Epoch  140/5000 Cost: 1.196785
Epoch  150/5000 Cost: 1.183021
Epoch  160/5000 Cost: 1.169995
Epoch  170/5000 Cost: 1.157641
Epoch  180/5000 Cost: 1.145898
Epoch  190/5000 Cost: 1.134715
Epoch  200/5000 Cost: 1.124045
Epoch  210/5000 Cost: 1.113847
Epoch  220/5000 Cost: 1.104083
Epoch  230/5000 Cost: 1.094721
Epoch  240/5000 Cost: 1.085731
Epoch  250/5000 Cost: 1.077085
Epoch  260/5000 Cost: 1.068761
Epoch  270/5000 Cost: 1.060735
Epoch  280/5000 Cost: 1.052987
Epoch  290/5000 Cost: 1.045502
Epoch  300/5000 Cost: 1.038261
Epoch  310/5000 Cost: 1.031250
Epoch  3

In [10]:
test(model,optimizer,x_test_tensor,y_test_tensor)

Accuracy: 99.0% Cost: 0.438952
