## 프로그래머스 머신러닝 과제 테스트

### 과제
6개의 column과 696timestamp를 가진 데이터들을 이용해  
10을 제외한 0~15까지의 클래스를 분류하는 문제.  

### 모델
시계열 분류에서 가장 기본이 되는 LSTM과 GRU를 학습한 뒤 더 나은 모델을 사용.  

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset

from pytorch_lightning.callbacks.early_stopping import EarlyStopping


from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import f1_score, classification_report

import pandas as pd
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


In [57]:
batch_size = 256
lr = 0.0001

epochs = 300
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

cuda:0


In [3]:
dataset0_classes = os.listdir('dataset0/train/')
dataset0_label_encoder = LabelEncoder()
dataset0_onehot_encoder = OneHotEncoder()

dataset0_onehot_encoder.fit(np.array(dataset0_classes).reshape(-1,1))
print(dataset0_onehot_encoder.categories_)

dataset1_classes = os.listdir('dataset1/train/')
dataset1_label_encoder = LabelEncoder()
dataset1_onehot_encoder = OneHotEncoder()

dataset1_onehot_encoder.fit(np.array(dataset1_classes).reshape(-1,1))
print(dataset1_onehot_encoder.categories_)

[array(['class0', 'class1', 'class11', 'class12', 'class13', 'class14',
       'class15', 'class2', 'class3', 'class4', 'class5', 'class6',
       'class7', 'class8', 'class9'], dtype='<U7')]
[array(['class0', 'class1', 'class10', 'class2', 'class3', 'class4',
       'class5', 'class6', 'class7', 'class8', 'class9'], dtype='<U7')]


In [4]:
dataset0_onehot_encoder.transform([['class0'], ['class11']]).toarray()

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [5]:
dataset0_dir = 'dataset0/'
dataset1_dir = 'dataset1/'

dataset0_train_dir = dataset0_dir + 'train/'
dataset0_test_dir = dataset0_dir + 'test/'

dataset1_train_dir = dataset1_dir + 'train/'
dataset1_test_dir = dataset1_dir + 'test/'

In [6]:
def create_dataset(dataset_dir):
  X, y = [], []
  labels = os.listdir(dataset_dir)
  for label in labels:
    file_list = os.listdir(dataset_dir + label + '/')
    for f in file_list:
      temp = pd.read_csv(dataset_dir + label + '/' + f)
      X.append(torch.from_numpy(temp.values))
      y.append([label])
  X = pad_sequence(X, batch_first=True)
  return X, y

In [7]:
# dataset0's training/test datasets & dataloader
dataset0_X_train, dataset0_y_train = create_dataset(dataset0_train_dir)
dataset0_y_train = dataset0_onehot_encoder.transform(dataset0_y_train).toarray()

dataset0_X_test, dataset0_y_test = create_dataset(dataset0_test_dir)
dataset0_y_test = dataset0_onehot_encoder.transform(dataset0_y_test).toarray()

dataset0_train_dataset = TensorDataset(torch.tensor(dataset0_X_train).float(), torch.from_numpy(dataset0_y_train))
dataset0_test_dataset = TensorDataset(torch.tensor(dataset0_X_test).float(), torch.from_numpy(dataset0_y_test))

dataset0_train_dataloader = DataLoader(dataset0_train_dataset,
                                       batch_size=batch_size, shuffle=True, num_workers=2)
dataset0_test_dataloader= DataLoader(dataset0_test_dataset,
                                     batch_size=batch_size, shuffle=True, num_workers=2)

  
  if __name__ == '__main__':


In [8]:
print('Training dataset size:', len(dataset0_train_dataset))
print('Test dataset size:', len(dataset0_test_dataset))


Training dataset size: 13961
Test dataset size: 6975


In [9]:
def train(model, optimizer, train_dataloader):
    model.train()
    for indexs, (values, labels) in enumerate(train_dataloader):
        values = values.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(values)

        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()

In [10]:
def evaluate(model, test_dataloader):
    model.eval()
    corrects, total_loss, f1score = 0, 0, 0
    for indexs, (values, labels) in enumerate(test_dataloader):
        values = values.to(device)
        labels = labels.to(device)

        outputs = model(values)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        _, targets = torch.max(labels, 1)
        corrects += preds.eq(targets).sum().item()
        f1score += f1_score(targets.cpu(), preds.cpu(), average='macro')
        
    size = len(test_dataloader.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    avg_f1score = 100.0 * f1score / size
    return avg_loss, avg_accuracy, avg_f1score

### LSTM
lstm 모델을 사용한 분류

In [24]:
# 모델 생성 작성
class model(nn.Module):
  def __init__(self, input_size, hidden_size, n_layers, num_classes):
    super(model, self).__init__()
    """
    코드 작성하세요
    """
    self.n_layers = n_layers
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_classes = num_classes
    
    self.LSTM = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, batch_first=True)
    self.fc = nn.Linear(self.hidden_size, self.num_classes)
    
  def forward(self, x):
    """
    코드 작성하세요
    """
    x, _ = self.LSTM(x)
    h_t = x[:, -1, :]
    output = self.fc(h_t)
    
    return output



In [25]:
import gc
gc.collect()
torch.cuda.empty_cache()

net = model(6, 16, 2, 15).to(device)
print(net)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

model(
  (LSTM): LSTM(6, 16, batch_first=True)
  (fc1): Linear(in_features=16, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=15, bias=True)
  (relu): ReLU()
)


In [26]:
best_val_loss = None
for e in range(1, epochs+1):
    train(net, optimizer, dataset0_train_dataloader)
    val_loss, val_corrects, val_f1 = evaluate(net, dataset0_test_dataloader)

    print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f | val_f1 : %5.2f" % (e, val_loss, val_corrects, val_f1))

    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("snapshot"):
            os.makedirs("snapshot")
        torch.save(net.state_dict(), './snapshot/lstmclassification.pt')
        best_val_loss = val_loss

[Epoch: 1] val loss :  0.01 | val accuracy : 10.85 | val_f1 :  0.02
[Epoch: 2] val loss :  0.01 | val accuracy : 17.98 | val_f1 :  0.02
[Epoch: 3] val loss :  0.01 | val accuracy : 24.33 | val_f1 :  0.03
[Epoch: 4] val loss :  0.01 | val accuracy : 26.41 | val_f1 :  0.03
[Epoch: 5] val loss :  0.01 | val accuracy : 26.70 | val_f1 :  0.03
[Epoch: 6] val loss :  0.01 | val accuracy : 27.15 | val_f1 :  0.03
[Epoch: 7] val loss :  0.01 | val accuracy : 27.53 | val_f1 :  0.03
[Epoch: 8] val loss :  0.01 | val accuracy : 28.24 | val_f1 :  0.03
[Epoch: 9] val loss :  0.01 | val accuracy : 29.92 | val_f1 :  0.03
[Epoch: 10] val loss :  0.01 | val accuracy : 29.18 | val_f1 :  0.03
[Epoch: 11] val loss :  0.01 | val accuracy : 30.02 | val_f1 :  0.03
[Epoch: 12] val loss :  0.01 | val accuracy : 31.34 | val_f1 :  0.03
[Epoch: 13] val loss :  0.01 | val accuracy : 32.57 | val_f1 :  0.04
[Epoch: 14] val loss :  0.01 | val accuracy : 33.13 | val_f1 :  0.04
[Epoch: 15] val loss :  0.01 | val accuracy

### GRU
GRU 모델을 사용한 분류

In [54]:
class GRU(nn.Module):
  def __init__(self, input_size, hidden_size, n_layers, num_classes):
    super(GRU, self).__init__()
    """
    코드 작성하세요
    """
    self.n_layers = n_layers
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_classes = num_classes
    
    self.GRU = nn.GRU(input_size=self.input_size, hidden_size=self.hidden_size, batch_first=True)
    self.fc = nn.Linear(self.hidden_size, self.num_classes)
    
  def forward(self, x):
    """
    코드 작성하세요
    """
    x, _ = self.GRU(x)
    h_t = x[:, -1, :]
    output = self.fc(h_t)
    
    return output

In [59]:
import gc
gc.collect()
torch.cuda.empty_cache()

gru_net = GRU(6, 16, 2, 15).to(device)
print(gru_net)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gru_net.parameters(), lr=lr)

GRU(
  (GRU): GRU(6, 16, batch_first=True)
  (fc): Linear(in_features=16, out_features=15, bias=True)
)


In [60]:
best_val_loss = None
for e in range(1, epochs+1):
    train(gru_net, optimizer, dataset0_train_dataloader)
    val_loss, val_corrects, val_f1 = evaluate(gru_net, dataset0_test_dataloader)

    print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f | val_f1 : %5.2f" % (e, val_loss, val_corrects, val_f1))

    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("snapshot"):
            os.makedirs("snapshot")
        torch.save(gru_net.state_dict(), './snapshot/gruclassification.pt')
        best_val_loss = val_loss

[Epoch: 1] val loss :  0.01 | val accuracy :  5.79 | val_f1 :  0.01
[Epoch: 2] val loss :  0.01 | val accuracy :  6.04 | val_f1 :  0.01
[Epoch: 3] val loss :  0.01 | val accuracy :  7.56 | val_f1 :  0.01
[Epoch: 4] val loss :  0.01 | val accuracy :  8.46 | val_f1 :  0.02
[Epoch: 5] val loss :  0.01 | val accuracy : 26.29 | val_f1 :  0.03
[Epoch: 6] val loss :  0.01 | val accuracy : 28.65 | val_f1 :  0.03
[Epoch: 7] val loss :  0.01 | val accuracy : 28.60 | val_f1 :  0.03
[Epoch: 8] val loss :  0.01 | val accuracy : 28.82 | val_f1 :  0.03
[Epoch: 9] val loss :  0.01 | val accuracy : 29.43 | val_f1 :  0.03
[Epoch: 10] val loss :  0.01 | val accuracy : 29.95 | val_f1 :  0.03
[Epoch: 11] val loss :  0.01 | val accuracy : 30.67 | val_f1 :  0.03
[Epoch: 12] val loss :  0.01 | val accuracy : 31.17 | val_f1 :  0.03
[Epoch: 13] val loss :  0.01 | val accuracy : 31.60 | val_f1 :  0.03
[Epoch: 14] val loss :  0.01 | val accuracy : 32.04 | val_f1 :  0.03
[Epoch: 15] val loss :  0.01 | val accuracy

In [35]:
# dataset1's training datasets & dataloader
dataset1_X_train, dataset1_y_train = create_dataset(dataset1_train_dir)
dataset1_y_train = dataset1_onehot_encoder.transform(dataset1_y_train).toarray()

dataset1_train_dataset = TensorDataset(torch.tensor(dataset1_X_train).float(), torch.from_numpy(dataset1_y_train))
dataset1_train_dataloader = DataLoader(dataset1_train_dataset,
                                       batch_size=batch_size)

  """


### 테스트 데이터 적용
200 epoch시 성능이 더 좋았던 GRU 사용.

In [61]:
net = GRU(6, 16, 2, 15).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

checkpoint = torch.load('./snapshot/gruclassification.pt')
net.load_state_dict(checkpoint)


<All keys matched successfully>

In [62]:
net.fc = nn.Linear(16, 11)
net = net.to(device)
net

GRU(
  (GRU): GRU(6, 16, batch_first=True)
  (fc): Linear(in_features=16, out_features=11, bias=True)
)

In [63]:
def train(model, optimizer, train_dataloader):
    model.train()
    for indexs, (values, labels) in enumerate(train_dataloader):
        values = values.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(values)

        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()

In [65]:
best_val_loss = None
for epoch in range(1, epochs//2 + 1):
    train(net, optimizer, dataset1_train_dataloader)
    

In [66]:
dataset1_test_X = []
test_list = os.listdir(dataset1_test_dir)
for f in test_list:
  temp = pd.read_csv(dataset1_test_dir + f)
  dataset1_test_X.append(torch.from_numpy(temp.values))

dataset1_test_X = pad_sequence(dataset1_test_X, batch_first=True)
print(dataset1_test_X.dtype)
dataset1_test_dataset = TensorDataset(dataset1_test_X)
dataset1_test_dataloader = DataLoader(dataset1_test_dataset, batch_size=batch_size, shuffle=False)

torch.float64


In [None]:
test_list

['103.csv',
 '119.csv',
 '162.csv',
 '172.csv',
 '178.csv',
 '179.csv',
 '188.csv',
 '191.csv',
 '201.csv',
 '230.csv',
 '245.csv',
 '265.csv',
 '276.csv',
 '281.csv',
 '298.csv',
 '300.csv',
 '308.csv',
 '328.csv',
 '342.csv',
 '344.csv',
 '345.csv',
 '346.csv',
 '347.csv',
 '348.csv',
 '349.csv',
 '350.csv',
 '351.csv',
 '352.csv',
 '363.csv',
 '376.csv',
 '38.csv',
 '400.csv',
 '405.csv',
 '406.csv',
 '414.csv',
 '422.csv',
 '423.csv',
 '425.csv',
 '427.csv',
 '430.csv',
 '433.csv',
 '441.csv',
 '445.csv',
 '449.csv',
 '46.csv',
 '463.csv',
 '470.csv',
 '474.csv',
 '475.csv',
 '478.csv',
 '487.csv',
 '489.csv',
 '491.csv',
 '496.csv',
 '507.csv',
 '508.csv',
 '509.csv',
 '510.csv',
 '511.csv',
 '512.csv',
 '513.csv',
 '514.csv',
 '515.csv',
 '516.csv',
 '517.csv',
 '518.csv',
 '519.csv',
 '520.csv',
 '521.csv',
 '522.csv',
 '523.csv',
 '524.csv',
 '525.csv',
 '526.csv',
 '527.csv',
 '528.csv',
 '529.csv',
 '530.csv',
 '534.csv',
 '536.csv',
 '539.csv',
 '545.csv',
 '550.csv',
 '559.

In [67]:
# dataset1의 test 데이터를 사용한 일반화 성능 확인

net.eval()
predicted = []
with torch.no_grad():
  for idx, x in enumerate(dataset1_test_dataloader):
    x = x[0]
    x = x.to(device)
    optimizer.zero_grad()
    output = net(x.float())

    _, preds = torch.max(output, 1)
    predicted.extend(preds.cpu().numpy())
  torch.cuda.empty_cache()

In [68]:
l = len(predicted)
answer = []
for p in predicted:
    temp = [0 for _ in range(11)]
    temp[p] = 1
    temp2 = dataset1_onehot_encoder.inverse_transform(np.array(temp).reshape(1,-1))
    temp2 = temp2.tolist()
    temp2 = temp2[0][0]
    answer.append(temp2)
print(answer)

['class7', 'class1', 'class8', 'class5', 'class7', 'class8', 'class5', 'class5', 'class5', 'class5', 'class4', 'class8', 'class1', 'class7', 'class1', 'class7', 'class8', 'class7', 'class7', 'class7', 'class5', 'class5', 'class7', 'class1', 'class1', 'class8', 'class8', 'class5', 'class8', 'class5', 'class8', 'class8', 'class8', 'class5', 'class4', 'class7', 'class5', 'class7', 'class7', 'class5', 'class5', 'class4', 'class5', 'class8', 'class7', 'class8', 'class8', 'class7', 'class4', 'class10', 'class8', 'class4', 'class4', 'class8', 'class8', 'class8', 'class8', 'class1', 'class7', 'class5', 'class7', 'class4', 'class7', 'class8', 'class8', 'class7', 'class1', 'class8', 'class8', 'class8', 'class7', 'class4', 'class1', 'class7', 'class7', 'class7', 'class4', 'class8', 'class8', 'class7', 'class7', 'class8', 'class7', 'class8', 'class1', 'class8', 'class8', 'class8', 'class5', 'class1', 'class7', 'class8', 'class5', 'class5', 'class7', 'class8', 'class8', 'class8', 'class8', 'class7'

In [69]:
df = {}
for i, tl in enumerate(test_list):
    df[int(tl[:-4])] = answer[i]

pd_preds = pd.DataFrame.from_dict(df, orient='index').rename(columns={0:'label'})
pd_preds = pd_preds.sort_index()
pd_preds.to_csv('submission.csv')
pd_preds.head()

Unnamed: 0,label
9,class8
38,class8
46,class7
65,class5
79,class5
