# 첫 시도에서는 RNN 대신 GRU를 사용했고 dropout rate를 조금 조정했습니다. *accuracy는 약 0.5정도로 증가한 것을 확인할 수 있었습니다*

In [1]:
import os
import torch 
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import random

import pandas as pd
import re

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [4]:
# 하이퍼파라미터
batch_size = 64
lr = 0.001
epochs = 30

In [6]:
#파일 불러오기
train = pd.read_csv('/content/train.csv', encoding = 'utf-8') #한글의 경우 encoding으로 utf-8, ms949,cp949
test = pd.read_csv('/content/test_x.csv', encoding = 'utf-8')
sample_submission = pd.read_csv('/content/sample_submission.csv', encoding = 'utf-8')

In [None]:
train.head(10)

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
5,5,"""It was well fought,"" he said, ""and, by my soo...",4
6,6,"Not to pay him was impossible, considering his...",3
7,7,"“A proper figure of a man at-arms,” said the l...",2
8,8,"'You were not here last Sunday night,' he said.",0
9,9,“You must not ask me that!” I cried. “Hell may...,4


In [None]:
train.iloc[0,1]

'He was almost choking. There was so much, so much he wanted to say, but strange exclamations were all that came from his lips. The Pole gazed fixedly at him, at the bundle of notes in his hand; looked at odin, and was in evident perplexity.'

In [None]:
pd.DataFrame(sample_submission.iloc[0,:]) # 5명의 작가(target var)

Unnamed: 0,0
index,0
0,0
1,0
2,0
3,0
4,0


In [7]:
#부호를 제거해주는 함수
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

train['text']=train['text'].apply(alpha_num)

In [None]:
train.iloc[0,1]# 문장 부호가 모두 없어진 것을 확인할 수 있음


'He was almost choking There was so much so much he wanted to say but strange exclamations were all that came from his lips The Pole gazed fixedly at him at the bundle of notes in his hand looked at odin and was in evident perplexity'

In [8]:
import nltk
from nltk.corpus import stopwords 

nltk.download('stopwords')
stopwords = stopwords.words('english')

print(stopwords[:10])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [None]:
!pip install konlpy
!sudo apt-get install curl git
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

In [10]:
from konlpy.tag import Mecab

text = "늦잠 자고 싶어요"
tokenizer = Mecab()
print(tokenizer.morphs(text))

['늦잠', '자', '고', '싶', '어요']


In [11]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

text = "늦잠 자고 싶어요"
tokenizer.tokenize(text)

['늦잠', '자고', '싶어요']

In [12]:
def remove_stopwords(text):
    final_text = []
    words = tokenizer.tokenize(text)
    for word in words:
        if word.strip().lower() not in stopwords:
          final_text.append(word.strip())
    return  " ".join(final_text)


train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()
train['text'] = train['text'].apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].apply(alpha_num).apply(remove_stopwords)

In [None]:
train.head(10)


Unnamed: 0,index,text,author
0,0,almost choking much much wanted say strange ex...,3
1,1,sister asked suppose,2
2,2,engaged one day walked perusing janes last let...,1
3,3,captain porch keeping carefully way treacherou...,4
4,4,mercy gentlemen odin flung hands dont write an...,3
5,5,well fought said sooth charge us twice,4
6,6,pay impossible considering character talk fell...,3
7,7,proper figure man atarms said little knight ma...,2
8,8,last sunday night said,0
9,9,must ask cried hell may noble flames known sco...,4


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

v = TfidfVectorizer(max_features = 200) # max_features를 통해 최대 몇 개의 단어를 벡터로 바꿀 것인지 결정합니다. 참고 max_features 설정 안 했더니 제 커널은 죽었어요ㅠㅠ
v.fit(train['text']) # test 때는 train 에서 학습된 tf-idf를 이용해야 하기 때문에 여기서는 fit_transform을 한꺼번에 쓰는 대신, 나눠서 이용합니다 ~ sklearn에서 여타 모델들과 마찬가지로 생각하시면 됩니다


TfidfVectorizer(max_features=200)

In [None]:
print(v.vocabulary_)

{'almost': 0, 'much': 113, 'say': 147, 'came': 19, 'hand': 63, 'looked': 92, 'odin': 121, 'asked': 9, 'one': 125, 'day': 26, 'last': 79, 'letter': 85, 'mr': 111, 'saw': 146, 'looking': 93, 'away': 10, 'said': 144, 'way': 185, 'turned': 178, 'us': 182, 'take': 164, 'side': 153, 'hands': 64, 'men': 103, 'dont': 31, 'heart': 68, 'oh': 123, 'god': 56, 'well': 186, 'another': 4, 'time': 174, 'man': 98, 'little': 89, 'yet': 198, 'see': 148, 'great': 61, 'upon': 181, 'would': 194, 'two': 179, 'could': 23, 'put': 134, 'though': 171, 'night': 119, 'must': 114, 'ask': 8, 'cried': 25, 'may': 101, 'years': 196, 'always': 2, 'house': 72, 'new': 117, 'began': 12, 'things': 169, 'knew': 76, 'want': 184, 'next': 118, 'morning': 109, 'moment': 107, 'think': 170, 'end': 34, 'thought': 172, 'quite': 136, 'still': 159, 'ever': 38, 'table': 163, 'passed': 128, 'already': 1, 'indeed': 75, 'come': 22, 'back': 11, 'took': 177, 'long': 90, 'returned': 140, 'many': 99, 'old': 124, 'really': 138, 'even': 36, 'fe

In [15]:
x = v.transform(train['text']).toarray()
print(x)

[[0.39448743 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.5468209  0.         0.        ]]


In [None]:
print(x.shape) # 200개의 max 단어를 설정했기 때문에 문장수, 단어수 이렇게 array를 만들어진 것을 확인할 수 있다
print(train.shape)

(54879, 200)
(54879, 3)


In [16]:
x = pd.DataFrame(x)
temp = []
for i in range(len(x)):
  temp.append(list(x.iloc[i,:]))
train['preprocessed_text'] = temp

In [None]:
train.head(10)

Unnamed: 0,index,text,author,preprocessed_text
0,0,almost choking much much wanted say strange ex...,3,"[0.3944874313763683, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,sister asked suppose,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,engaged one day walked perusing janes last let...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,captain porch keeping carefully way treacherou...,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,mercy gentlemen odin flung hands dont write an...,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,5,well fought said sooth charge us twice,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,6,pay impossible considering character talk fell...,3,"[0.0, 0.0, 0.0, 0.0, 0.7700429756857019, 0.0, ..."
7,7,proper figure man atarms said little knight ma...,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,8,last sunday night said,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,9,must ask cried hell may noble flames known sco...,4,"[0.0, 0.0, 0.8115642124736752, 0.0, 0.0, 0.0, ..."


In [17]:
#워드 임베딩
word_set = []
max_len = 0

for d in train['text']:
  word_set = word_set + d.split(' ') # 여기에 토큰화한 데이터가 들어가면 됩니다
  if len(d.split()) > max_len:
    max_len = len(d.split())
  
word_set = set(word_set)

In [18]:
word_to_idx = {word: i+1 for i, word in enumerate(word_set)}
print(len(word_set))
print(max_len)

47120
212


In [19]:
def word_to_key(text):
  final_text = []
  for word in text.split():
      final_text.append(word_to_idx[word])
  if len(final_text) < max_len:
    final_text = final_text + [0] * (max_len - len(final_text))
  return final_text


train['word_to_key'] = train['text'].apply(word_to_key)

In [None]:
train.head(10)

Unnamed: 0,index,text,author,preprocessed_text,word_to_key
0,0,almost choking much much wanted say strange ex...,3,"[0.3944874313763683, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[15338, 25849, 16568, 16568, 31676, 30858, 454..."
1,1,sister asked suppose,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[38764, 10325, 35428, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,engaged one day walked perusing janes last let...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[38333, 27529, 15813, 24009, 29468, 29450, 243..."
3,3,captain porch keeping carefully way treacherou...,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[33209, 40310, 42591, 7626, 43277, 36936, 1066..."
4,4,mercy gentlemen odin flung hands dont write an...,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[44528, 37540, 18686, 45728, 40166, 40959, 411..."
5,5,well fought said sooth charge us twice,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[27580, 12325, 36244, 26128, 18107, 41616, 138..."
6,6,pay impossible considering character talk fell...,3,"[0.0, 0.0, 0.0, 0.0, 0.7700429756857019, 0.0, ...","[34589, 23684, 44865, 8622, 16060, 5661, 4057,..."
7,7,proper figure man atarms said little knight ma...,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[44460, 40625, 26620, 42760, 36244, 41357, 806..."
8,8,last sunday night said,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[24367, 24549, 5475, 36244, 0, 0, 0, 0, 0, 0, ..."
9,9,must ask cried hell may noble flames known sco...,4,"[0.0, 0.0, 0.8115642124736752, 0.0, 0.0, 0.0, ...","[31246, 27592, 28391, 28136, 8466, 47110, 3521..."


In [20]:
X_train = train.iloc[:45000, 4]
X_test = train.iloc[45000:, 4].reset_index(drop=True)

y_train = train.iloc[:45000, 2]
y_test = train.iloc[45000:, 2].reset_index(drop=True)

In [21]:
class CustomDataset(Dataset):
  def __init__(self):
    
    self.x_data = X_train
    self.y_data = [[y] for y in y_train]

  def __len__(self):

    return len(self.x_data)

  def __getitem__(self, idx):

    x = torch.LongTensor(self.x_data[idx]).to(device)
    y = torch.LongTensor(self.y_data[idx]).to(device)

    return x,y

In [22]:
class CustomDataset_test(Dataset):
  def __init__(self):
    
    self.x_data = X_test
    self.y_data = [[y] for y in y_test]

  def __len__(self):

    return len(self.x_data)

  def __getitem__(self, idx):

    x = torch.LongTensor(self.x_data[idx])
    y = torch.LongTensor(self.y_data[idx])

    return x,y

In [23]:
dataset = CustomDataset()
dataloader = DataLoader(dataset, batch_size=batch_size) #sampler를 만들었다면, sampler를 파라미터로 넣어줄 수 있음 : https://hyelimkungkung.tistory.com/29?category=935193 참고 ㅎㅎ


In [27]:
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.3):
      super(GRU, self).__init__()
      self.n_layers = n_layers
      self.hidden_dim = hidden_dim

      self.embed = nn.Embedding(n_vocab, embed_dim)
      self.dropout = nn.Dropout(dropout_p)
      self.gru = nn.GRU(embed_dim, self.hidden_dim,
                        num_layers=self.n_layers,
                        batch_first=True)
     # self.rnn = nn.RNN(embed_dim, self.hidden_dim,batch_first = True)
      self.out = nn.Sequential(
          nn.Linear(self.hidden_dim, n_classes),
          nn.Softmax()
      )
    def forward(self, x):
      x = self.embed(x)
      h_0 = self._init_state(batch_size=x.size(0)) # 첫번째 히든 스테이트를 0벡터로 초기화
      x, _ = self.gru(x, h_0)  # GRU의 리턴값은 (배치 크기, 시퀀스 길이, 은닉 상태의 크기)
     # x, _ = self.rnn(x,h_0)
      h_t = x[:,-1,:] # (배치 크기, 은닉 상태의 크기)의 텐서로 크기가 변경됨. 즉, 마지막 time-step의 은닉 상태만 가져온다.
      self.dropout(h_t)
      logit = self.out(h_t)  # (배치 크기, 은닉 상태의 크기) -> (배치 크기, 출력층의 크기)
      return logit

    def _init_state(self, batch_size=1):
      weight = next(self.parameters()).data
      return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [25]:
n_vocab = 47120+1
embedd_size = 5
hidden_size = 100
output_size = 5

In [28]:
net = GRU(1, 256, n_vocab, embedd_size, output_size, 0.5).to(device)


In [29]:
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(net.parameters(), lr)

In [30]:
losses = []
for epoch in range(epochs):
  
  for x, y in dataloader:
    optimizer.zero_grad()
    y = y.to(device)

    # forward 연산
    hypothesis = net(x)

    # 비용 함수
    y = y.squeeze()
    cost = criterion(hypothesis, y)
    cost.backward()
    optimizer.step()
    losses.append(cost.item()) # 값만 가져오기 위해서 .item()

  # 10의 배수에 해당되는 에포크마다 비용을 출력
  if epoch % 10 == 0:
      print(epoch, cost.item())

  input = module(input)


0 1.5175724029541016
10 1.5093647241592407
20 1.3819279670715332


In [31]:
dataset = CustomDataset_test()
test_loader = DataLoader(dataset, batch_size=batch_size)

In [32]:
correct = 0

with torch.no_grad():
  net = net.to('cpu')
  net.eval()
  for data, target in test_loader:
    data, target = data, target
    output = net(data)
    
    pred = output.max(1, keepdim=True)[1]
    # eq() 함수는 값이 일치하면 1을, 아니면 0을 출력.
    correct += pred.eq(target.view_as(pred)).sum().item()

test_accuracy = correct / len(test_loader.dataset)
print('Accuracy:', test_accuracy)

  input = module(input)


Accuracy: 0.4968114181597328


# 두 번째 시도에서는 불용어를 제거하지 않으면 결과값이 얼마나 차이날 지 궁금해서 불용어를 제거하지 않은 버전 (train2, test2)를 이용하여 모델링을 진행해보았습니다. 

하지만 마지막 부분에서 알 수 없는 에러가 떴는데 이걸 어떻게 해결해야할 지 몇 시간을 고민해도 모르겠어서.. 일단 그냥 이렇게 제출하게 되었습니다 ㅜㅜ

In [33]:
#파일 불러오기
train2 = pd.read_csv('/content/train.csv', encoding = 'utf-8') #한글의 경우 encoding으로 utf-8, ms949,cp949
test2 = pd.read_csv('/content/test_x.csv', encoding = 'utf-8')
sample_submission2 = pd.read_csv('/content/sample_submission.csv', encoding = 'utf-8')

In [36]:
#부호를 제거해주는 함수
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

train2['text']=train2['text'].apply(alpha_num)

In [37]:
train2.iloc[0,1]# 문장 부호가 모두 없어진 것을 확인할 수 있음


'He was almost choking There was so much so much he wanted to say but strange exclamations were all that came from his lips The Pole gazed fixedly at him at the bundle of notes in his hand looked at odin and was in evident perplexity'

In [39]:
import nltk
from nltk.corpus import stopwords 

nltk.download('stopwords')
stopwords = stopwords.words('english')

print(stopwords[:50])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be']


In [42]:
train2['text'] = train2['text'].str.lower()
test2['text'] = test2['text'].str.lower()

In [43]:
train2.head(10)

Unnamed: 0,index,text,author
0,0,he was almost choking there was so much so muc...,3
1,1,your sister asked for it i suppose,2
2,2,she was engaged one day as she walked in peru...,1
3,3,the captain was in the porch keeping himself c...,4
4,4,have mercy gentlemen odin flung up his hands d...,3
5,5,it was well fought he said and by my sooth the...,4
6,6,not to pay him was impossible considering his ...,3
7,7,a proper figure of a man atarms said the littl...,2
8,8,you were not here last sunday night he said,0
9,9,you must not ask me that i cried hell may have...,4


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

v = TfidfVectorizer(max_features = 200) # max_features를 통해 최대 몇 개의 단어를 벡터로 바꿀 것인지 결정합니다. 참고 max_features 설정 안 했더니 제 커널은 죽었어요ㅠㅠ
v.fit(train2['text']) # test 때는 train 에서 학습된 tf-idf를 이용해야 하기 때문에 여기서는 fit_transform을 한꺼번에 쓰는 대신, 나눠서 이용합니다 ~ sklearn에서 여타 모델들과 마찬가지로 생각하시면 됩니다


TfidfVectorizer(max_features=200)

In [45]:
print(v.vocabulary_) # 벡터화된 단어들 사전


{'he': 56, 'was': 178, 'there': 158, 'so': 142, 'much': 100, 'to': 169, 'say': 135, 'but': 22, 'were': 183, 'all': 3, 'that': 152, 'came': 24, 'from': 44, 'his': 64, 'the': 154, 'at': 13, 'him': 62, 'of': 112, 'in': 69, 'hand': 52, 'looked': 84, 'odin': 110, 'and': 7, 'your': 199, 'asked': 12, 'for': 43, 'it': 72, 'she': 139, 'one': 118, 'day': 29, 'as': 11, 'last': 76, 'on': 116, 'some': 143, 'which': 187, 'had': 51, 'not': 107, 'when': 185, 'being': 20, 'again': 2, 'by': 23, 'mr': 98, 'saw': 134, 'looking': 85, 'up': 174, 'her': 60, 'away': 14, 'said': 132, 'himself': 63, 'out': 123, 'way': 179, 'should': 140, 'any': 9, 'be': 16, 'us': 176, 'take': 149, 'if': 67, 'you': 197, 'hands': 53, 'have': 55, 'dont': 34, 'here': 61, 'my': 102, 'heart': 59, 'before': 18, 'are': 10, 'oh': 114, 'well': 181, 'they': 160, 'will': 191, 'about': 0, 'another': 8, 'time': 168, 'man': 88, 'little': 81, 'why': 190, 'no': 106, 'yet': 196, 'see': 136, 'great': 50, 'upon': 175, 'this': 163, 'would': 194, 't

In [49]:
x = v.transform(train2['text']).toarray()
print(x)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.53170786]
 [0.         0.         0.17545124 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.80200717]
 [0.         0.         0.         ... 0.66142466 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [48]:
print(x.shape) # 200개의 max 단어를 설정했기 때문에 문장수, 단어수 이렇게 array를 만들어진 것을 확인할 수 있다
print(train2.shape)

(54879, 200)
(54879, 3)


In [50]:
x = pd.DataFrame(x)
temp = []
for i in range(len(x)):
  temp.append(list(x.iloc[i,:]))
train2['preprocessed_text'] = temp

In [51]:
train2

Unnamed: 0,index,text,author,preprocessed_text
0,0,he was almost choking there was so much so muc...,3,"[0.0, 0.0, 0.0, 0.14270587334255674, 0.0, 0.0,..."
1,1,your sister asked for it i suppose,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,she was engaged one day as she walked in peru...,1,"[0.0, 0.0, 0.1754512367608544, 0.0, 0.0, 0.0, ..."
3,3,the captain was in the porch keeping himself c...,4,"[0.0, 0.0, 0.0, 0.15558320197494702, 0.0, 0.0,..."
4,4,have mercy gentlemen odin flung up his hands d...,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.21064216..."
...,...,...,...,...
54874,54874,is that you mr smith odin whispered i hardly d...,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
54875,54875,i told my plan to the captain and between us w...,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.17034619..."
54876,54876,your sincere wellwisher friend and sister luc...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.40436669..."
54877,54877,then you wanted me to lend you money,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [52]:
word_set = []
max_len = 0

for d in train2['text']:
  word_set = word_set + d.split(' ') # 여기에 토큰화한 데이터가 들어가면 됩니다
  if len(d.split()) > max_len:
    max_len = len(d.split())
  
word_set = set(word_set)

In [53]:
word_to_idx = {word: i+1 for i, word in enumerate(word_set)}
print(len(word_set))
print(max_len)

47257
471


In [54]:
def word_to_key(text):
  final_text = []
  for word in text.split():
      final_text.append(word_to_idx[word])
  if len(final_text) < max_len:
    final_text = final_text + [0] * (max_len - len(final_text))
  return final_text


train2['word_to_key'] = train2['text'].apply(word_to_key)

In [55]:
X_train2 = train2.iloc[:45000, 4]
X_test2 = train2.iloc[45000:, 4].reset_index(drop=True)

y_train2 = train2.iloc[:45000, 2]
y_test2 = train2.iloc[45000:, 2].reset_index(drop=True)

In [63]:
class CustomDataset(Dataset):
  def __init__(self):
    
    self.x_data = X_train2
    self.y_data = [[y] for y in y_train2]

  def __len__(self):

    return len(self.x_data)

  def __getitem__(self, idx):

    x = torch.LongTensor(self.x_data[idx]).to(device)
    y = torch.LongTensor(self.y_data[idx]).to(device)

    return x,y

In [64]:
class CustomDataset_test(Dataset):
  def __init__(self):
    
    self.x_data = X_test2
    self.y_data = [[y] for y in y_test2]

  def __len__(self):

    return len(self.x_data)

  def __getitem__(self, idx):

    x = torch.LongTensor(self.x_data[idx])
    y = torch.LongTensor(self.y_data[idx])

    return x,y

In [65]:
dataset = CustomDataset()
dataloader = DataLoader(dataset, batch_size=batch_size)

In [77]:
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.3):
      super(GRU, self).__init__()
      self.n_layers = n_layers
      self.hidden_dim = hidden_dim

      self.embed = nn.Embedding(n_vocab, embed_dim)
      self.dropout = nn.Dropout(dropout_p)
      self.gru = nn.GRU(embed_dim, self.hidden_dim,
                        num_layers=self.n_layers,
                        batch_first=True)
     # self.rnn = nn.RNN(embed_dim, self.hidden_dim,batch_first = True)
      self.out = nn.Sequential(
          nn.Linear(self.hidden_dim, n_classes),
          nn.Softmax()
      )
    def forward(self, x):
      x = self.embed(x)
      h_0 = self._init_state(batch_size=x.size(0)) # 첫번째 히든 스테이트를 0벡터로 초기화
      x, _ = self.gru(x, h_0)  # GRU의 리턴값은 (배치 크기, 시퀀스 길이, 은닉 상태의 크기)
     # x, _ = self.rnn(x,h_0)
      h_t = x[:,-1,:] # (배치 크기, 은닉 상태의 크기)의 텐서로 크기가 변경됨. 즉, 마지막 time-step의 은닉 상태만 가져온다.
      self.dropout(h_t)
      logit = self.out(h_t)  # (배치 크기, 은닉 상태의 크기) -> (배치 크기, 출력층의 크기)
      return logit

    def _init_state(self, batch_size=1):
      weight = next(self.parameters()).data
      return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [73]:
n_vocab = 47257+1
embedd_size = 5
hidden_size = 100
output_size = 5

In [79]:
net = GRU(1, 256, n_vocab, embedd_size, output_size, 0.3).to(device)


RuntimeError: ignored

In [61]:
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(net.parameters(), lr)

In [62]:
losses = []
for epoch in range(epochs):
  
  for x, y in dataloader:
    optimizer.zero_grad()
    y = y.to(device)

    # forward 연산
    hypothesis = net(x)

    # 비용 함수
    y = y.squeeze()
    cost = criterion(hypothesis, y)
    cost.backward()
    optimizer.step()
    losses.append(cost.item()) # 값만 가져오기 위해서 .item()

  # 10의 배수에 해당되는 에포크마다 비용을 출력
  if epoch % 10 == 0:
      print(epoch, cost.item())

RuntimeError: ignored