In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import random
import os

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
SEED = 5
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1c270a5f4b0>

In [3]:
lr = 0.0001
EPOCHS = 50
BATCH_SIZE = 64

## train data explore

In [4]:
df = pd.read_csv('./dataset/train.csv', usecols=['text','author'])
df.head()

Unnamed: 0,text,author
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [5]:
df.shape

(54879, 2)

In [6]:
n_classes = len(df['author'].unique())
print(n_classes)

5


## 전처리
### 텍스트 전처리
- 특수문자 제거

In [7]:
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

In [8]:
#df['text'] = df['text'].str.lower() # 전부다 소문자로 변경 / 나중에 torchtext에서 해주기때문에 안해도됨
df['text'] = df['text'].apply(alpha_num) # 특수문자 제거
df.head()

Unnamed: 0,text,author
0,He was almost choking There was so much so muc...,3
1,Your sister asked for it I suppose,2
2,She was engaged one day as she walked in peru...,1
3,The captain was in the porch keeping himself c...,4
4,Have mercy gentlemen odin flung up his hands D...,3


### 데이터셋 분리
sklearn의 train_test_split으로 분리하고 각각 csv 파일로 저장

In [9]:
train_df, val_df = train_test_split(df, test_size=0.2, shuffle=False)
#train_df, val_df = train_test_split(train_df, test_size=0.2, shuffle=False)

print(len(train_df))
print(len(val_df))
#print(len(test_df))

43903
10976


In [10]:
train_df.to_csv("train_data.csv", index=False)
val_df.to_csv("val_data.csv", index=False)
#test_df.to_csv("test_data.csv", index=False)

## dataset

In [11]:
from torchtext import data
from torchtext.data import TabularDataset

In [12]:
TEXT = data.Field(sequential=True,
                  use_vocab=True,
                  tokenize=str.split,
                  lower=True,
                  batch_first=True,
                  fix_length=20)

LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   batch_first=False,
                   is_target=True)



In [13]:
train_data, val_data= TabularDataset.splits(
    path = '.', 
    train='train_data.csv', 
    validation='val_data.csv', 
    #test='test_data.csv', 
    format='csv',
    fields=[('text', TEXT), ('label', LABEL)],
    skip_header=True)



In [14]:
print('훈련 데이터의 크기 : {}' .format(len(train_data)))
print('검증 데이터의 크기 : {}' .format(len(val_data)))
#print('테스트 데이터의 크기 : {}' .format(len(test_data)))

훈련 데이터의 크기 : 43903
검증 데이터의 크기 : 10976


In [15]:
print(vars(train_data[0]))

{'text': ['he', 'was', 'almost', 'choking', 'there', 'was', 'so', 'much', 'so', 'much', 'he', 'wanted', 'to', 'say', 'but', 'strange', 'exclamations', 'were', 'all', 'that', 'came', 'from', 'his', 'lips', 'the', 'pole', 'gazed', 'fixedly', 'at', 'him', 'at', 'the', 'bundle', 'of', 'notes', 'in', 'his', 'hand', 'looked', 'at', 'odin', 'and', 'was', 'in', 'evident', 'perplexity'], 'label': '3'}


In [16]:
print(vars(val_data[0]))

{'text': ['why', 'you', 'seem', 'to', 'take', 'me', 'for', 'little', 'odin', 'said', 'odin', 'with', 'a', 'grin', 'of', 'irritation', 'but', 'please', 'dont', 'suppose', 'i', 'am', 'such', 'a', 'revolutionist', 'i', 'often', 'disagree', 'with', 'mr', 'odin', 'though', 'i', 'mention', 'tatyana', 'i', 'am', 'not', 'at', 'all', 'for', 'the', 'emancipation', 'of', 'women', 'i', 'acknowledge', 'that', 'women', 'are', 'a', 'subject', 'race', 'and', 'must', 'obey', 'les', 'femmes', 'tricottent', 'as', 'napoleon', 'said', 'odin', 'for', 'some', 'reason', 'smiled', 'and', 'on', 'that', 'question', 'at', 'least', 'i', 'am', 'quite', 'of', 'one', 'mind', 'with', 'that', 'pseudogreat', 'man', 'i', 'think', 'too', 'that', 'to', 'leave', 'ones', 'own', 'country', 'and', 'fly', 'to', 'america', 'is', 'mean', 'worse', 'than', 'meansilly', 'why', 'go', 'to', 'america', 'when', 'one', 'may', 'be', 'of', 'great', 'service', 'to', 'humanity', 'here', 'now', 'especially', 'theres', 'a', 'perfect', 'mass', 

## Vocabulary 생성
## 직접 빌드

In [17]:
TEXT.build_vocab(train_data, min_freq=5, max_size=10000)
LABEL.build_vocab(train_data)

vocab_size = len(TEXT.vocab)
print('단어 집합의 크기 : {}'.format(vocab_size))
print('클래스의 개수 : {}'.format(n_classes))

단어 집합의 크기 : 10002
클래스의 개수 : 5


## Dataloader

In [18]:
train_iter, val_iter = data.BucketIterator.splits(
    (train_data, val_data), batch_size=BATCH_SIZE,
    shuffle=True, 
    repeat=False,
    sort=False)

print('훈련 데이터의 미니 배치의 개수 : {}'.format(len(train_iter)))
#print('테스트 데이터의 미니 배치의 개수 : {}'.format(len(test_iter)))
print('검증 데이터의 미니 배치의 개수 : {}'.format(len(val_iter)))

훈련 데이터의 미니 배치의 개수 : 686
검증 데이터의 미니 배치의 개수 : 172




## Model

### device 선택

In [19]:
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
#DEVICE = 'cpu'
print("cpu와 cuda 중 다음 기기로 학습함:", DEVICE)


cpu와 cuda 중 다음 기기로 학습함: cuda


### Model build

In [20]:
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers,
                          batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        x = self.embed(x)
        h_0 = self._init_state(batch_size=x.size(0)) # 첫번째 히든 스테이트를 0벡터로 초기화
        x, _ = self.gru(x, h_0)  # GRU의 리턴값은 (배치 크기, 시퀀스 길이, 은닉 상태의 크기)
        h_t = x[:,-1,:] # (배치 크기, 은닉 상태의 크기)의 텐서로 크기가 변경됨. 즉, 마지막 time-step의 은닉 상태만 가져온다.
        self.dropout(h_t)
        logit = self.out(h_t)  # (배치 크기, 은닉 상태의 크기) -> (배치 크기, 출력층의 크기)
        return logit

    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [21]:
model = GRU(3, 256, vocab_size, 128, n_classes, 0.5).to(DEVICE)


### optimizer

In [22]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## Train
### Train code

In [23]:
def train(model, optimizer, train_iter):
    model.train()
    for b, batch in enumerate(train_iter):
        x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
        #y.data.sub_(1)  # 레이블 값을 0과 1로 변환
        optimizer.zero_grad()

        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()

### evaluate code

In [24]:
def evaluate(model, val_iter):
    """evaluate model"""
    model.eval()
    corrects, total_loss = 0, 0
    for b, batch in enumerate(val_iter):
        x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
        #y.data.sub_(1) # 레이블 값을 0과 1로 변환
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction='sum')
        total_loss += loss.item()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    size = len(val_iter.dataset)
    #print(size)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

## Run

In [25]:
best_val_loss = None
for e in range(1, EPOCHS+1):
    train(model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)

    print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f" % (e, val_loss, val_accuracy))

    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("snapshot"):
            os.makedirs("snapshot")
        torch.save(model.state_dict(), './snapshot/dacon.pt')
        best_val_loss = val_loss



[Epoch: 1] val loss :  1.33 | val accuracy : 44.73
[Epoch: 2] val loss :  1.22 | val accuracy : 51.26
[Epoch: 3] val loss :  1.16 | val accuracy : 53.02
[Epoch: 4] val loss :  1.11 | val accuracy : 55.70
[Epoch: 5] val loss :  1.08 | val accuracy : 57.63
[Epoch: 6] val loss :  1.06 | val accuracy : 58.78
[Epoch: 7] val loss :  1.06 | val accuracy : 58.34
[Epoch: 8] val loss :  1.03 | val accuracy : 60.42
[Epoch: 9] val loss :  1.04 | val accuracy : 60.33
[Epoch: 10] val loss :  1.01 | val accuracy : 61.97
[Epoch: 11] val loss :  1.03 | val accuracy : 61.56
[Epoch: 12] val loss :  1.03 | val accuracy : 62.29
[Epoch: 13] val loss :  1.06 | val accuracy : 62.54
[Epoch: 14] val loss :  1.08 | val accuracy : 62.31
[Epoch: 15] val loss :  1.10 | val accuracy : 62.57
[Epoch: 16] val loss :  1.13 | val accuracy : 61.95
[Epoch: 17] val loss :  1.14 | val accuracy : 63.03
[Epoch: 18] val loss :  1.18 | val accuracy : 62.25
[Epoch: 19] val loss :  1.21 | val accuracy : 61.86
[Epoch: 20] val loss 