In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 라이브러리 import/기본 설정

In [2]:
import os
import torch 
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import random

import pandas as pd
import re

In [3]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

# 데이터 불러오기

In [4]:
#파일 불러오기
train = pd.read_csv('/content/drive/MyDrive/235670_소설 작가 분류 AI 경진대회_data/train.csv', encoding = 'utf-8') #한글의 경우 encoding으로 utf-8, ms949,cp949
test = pd.read_csv('/content/drive/MyDrive/235670_소설 작가 분류 AI 경진대회_data/test_x.csv', encoding = 'utf-8')
sample_submission = pd.read_csv('/content/drive/MyDrive/235670_소설 작가 분류 AI 경진대회_data/sample_submission.csv', encoding = 'utf-8')

In [5]:
train

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
...,...,...,...
54874,54874,"“Is that you, Mr. Smith?” odin whispered. “I h...",2
54875,54875,"I told my plan to the captain, and between us ...",4
54876,54876,"""Your sincere well-wisher, friend, and sister...",1
54877,54877,“Then you wanted me to lend you money?”,3


In [6]:
pd.DataFrame(sample_submission.iloc[0,:]) # 5명의 작가(target var)

Unnamed: 0,0
index,0
0,0
1,0
2,0
3,0
4,0


#전처리

In [7]:
#부호를 제거해주는 함수
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

train['text']=train['text'].apply(alpha_num)
train

Unnamed: 0,index,text,author
0,0,He was almost choking There was so much so muc...,3
1,1,Your sister asked for it I suppose,2
2,2,She was engaged one day as she walked in peru...,1
3,3,The captain was in the porch keeping himself c...,4
4,4,Have mercy gentlemen odin flung up his hands D...,3
...,...,...,...
54874,54874,Is that you Mr Smith odin whispered I hardly d...,2
54875,54875,I told my plan to the captain and between us w...,4
54876,54876,Your sincere wellwisher friend and sister LUC...,1
54877,54877,Then you wanted me to lend you money,3


불용어 제거하기



In [8]:
import nltk
from nltk.corpus import stopwords 

nltk.download('stopwords')
stopwords = stopwords.words('english')

len(stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


179

In [9]:
!pip install konlpy
!sudo apt-get install curl git
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.3 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 39.6 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.3.0 konlpy-0.6.0
Reading package lists... Done
Building dependency tree       
Reading state information... Done
curl is already the newest version (7.58.0-2ubuntu3.16).
git is already the newest version (1:2.17.1-1ubuntu0.9).
The following package was automatically installed and is no longer required:
  libnvidia-common-470
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.
Installing automake (A dependency for mecab-ko)
Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bio

### 토큰화

In [10]:
from nltk.tokenize import WordPunctTokenizer   #다른 토크나이저 사용
tokenizer = WordPunctTokenizer ()


In [11]:
def remove_stopwords(text):
    final_text = []
    words = tokenizer.tokenize(text)
    for word in words:
        if word.strip().lower() not in stopwords:
          final_text.append(word.strip())
    return  " ".join(final_text)


train['text'] = train['text'].str.lower() #소문자로
test['text'] = test['text'].str.lower()   #소문자로
train['text'] = train['text'].apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].apply(alpha_num).apply(remove_stopwords)

In [12]:
train

Unnamed: 0,index,text,author
0,0,almost choking much much wanted say strange ex...,3
1,1,sister asked suppose,2
2,2,engaged one day walked perusing janes last let...,1
3,3,captain porch keeping carefully way treacherou...,4
4,4,mercy gentlemen odin flung hands dont write an...,3
...,...,...,...
54874,54874,mr smith odin whispered hardly dared hope woul...,2
54875,54875,told plan captain us settled details accomplis...,4
54876,54876,sincere wellwisher friend sister lucy odin,1
54877,54877,wanted lend money,3


In [13]:
test

Unnamed: 0,index,text
0,0,think one charming young ladies ever met might...
1,1,replied sudden consciousness find cannot ignor...
2,2,lady stated intention screaming course would s...
3,3,suddenly silence heard sound sent heart mouth ...
4,4,conviction remained unchanged far knowand beli...
...,...,...
19612,19612,end another day two odin growing visibly stron...
19613,19613,afternoon sat together mostly silence watching...
19614,19614,odin carried thanks odin proceeded happiness l...
19615,19615,soon upon odins leaving room mama said odin al...


4-1 . 카운터 기반 임베딩

컴퓨터가 알아들을 수 있도록 문자를 벡터로 바꾸어줌

- 원핫 인코딩
간단한 방법, 그러나 매우 sparse 한 행렬 -> inefficient

- 카운트 기반 인코딩(BoG 등): tf-idf

여기서는 tf-idf 이용

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

v = TfidfVectorizer(max_features = 200) # max_features를 통해 최대 몇 개의 단어를 벡터로 바꿀 것인지 결정합니다. 참고 max_features 설정 안 했더니 제 커널은 죽었어요ㅠㅠ
v.fit(train['text']) # test 때는 train 에서 학습된 tf-idf를 이용해야 하기 때문에 여기서는 fit_transform을 한꺼번에 쓰는 대신, 나눠서 이용합니다 ~ sklearn에서 여타 모델들과 마찬가지로 생각하시면 됩니다

TfidfVectorizer(max_features=200)

In [15]:
print(v.vocabulary_) # 벡터화된 단어들 사전

{'almost': 0, 'much': 113, 'say': 147, 'came': 18, 'hand': 63, 'looked': 92, 'odin': 121, 'asked': 8, 'one': 125, 'day': 26, 'last': 79, 'letter': 85, 'mr': 111, 'saw': 146, 'looking': 93, 'away': 9, 'said': 144, 'way': 185, 'turned': 178, 'us': 182, 'take': 164, 'side': 153, 'hands': 64, 'men': 103, 'dont': 31, 'heart': 68, 'oh': 123, 'god': 56, 'well': 186, 'another': 4, 'time': 174, 'man': 98, 'little': 89, 'yet': 198, 'see': 148, 'great': 61, 'upon': 181, 'would': 194, 'two': 179, 'could': 23, 'put': 134, 'though': 171, 'night': 119, 'must': 114, 'ask': 7, 'cried': 25, 'may': 101, 'years': 196, 'always': 2, 'house': 72, 'new': 117, 'began': 11, 'things': 169, 'knew': 76, 'want': 184, 'next': 118, 'morning': 109, 'moment': 107, 'think': 170, 'end': 34, 'thought': 172, 'quite': 136, 'still': 159, 'ever': 38, 'table': 163, 'passed': 128, 'already': 1, 'indeed': 75, 'come': 22, 'back': 10, 'took': 177, 'long': 90, 'returned': 140, 'many': 99, 'old': 124, 'really': 138, 'even': 36, 'fel

In [16]:
x = v.transform(train['text']).toarray()
print(x)

[[0.39448743 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.5468209  0.         0.        ]]


In [17]:
print(x.shape) # 200개의 max 단어를 설정했기 때문에 문장수, 단어수 이렇게 array를 만들어진 것을 확인할 수 있다
print(train.shape)

(54879, 200)
(54879, 3)


In [18]:
x = pd.DataFrame(x)
temp = []
for i in range(len(x)):
  temp.append(list(x.iloc[i,:]))
train['preprocessed_text'] = temp

In [19]:
len(temp[5])

200

In [20]:
train

Unnamed: 0,index,text,author,preprocessed_text
0,0,almost choking much much wanted say strange ex...,3,"[0.3944874313763683, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,sister asked suppose,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
2,2,engaged one day walked perusing janes last let...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,captain porch keeping carefully way treacherou...,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,mercy gentlemen odin flung hands dont write an...,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
54874,54874,mr smith odin whispered hardly dared hope woul...,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
54875,54875,told plan captain us settled details accomplis...,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
54876,54876,sincere wellwisher friend sister lucy odin,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
54877,54877,wanted lend money,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


워드 임베딩

In [21]:
train['text'][0].split()

['almost',
 'choking',
 'much',
 'much',
 'wanted',
 'say',
 'strange',
 'exclamations',
 'came',
 'lips',
 'pole',
 'gazed',
 'fixedly',
 'bundle',
 'notes',
 'hand',
 'looked',
 'odin',
 'evident',
 'perplexity']

In [22]:
word_set = []
max_len = 0

for d in train['text']:
  word_set = word_set + d.split(' ') # 여기에 토큰화한 데이터가 들어가면 됩니다
  if len(d.split()) > max_len:
    max_len = len(d.split())
  
word_set = set(word_set)



In [23]:
word_to_idx = {word: i+1 for i, word in enumerate(word_set)}
print(len(word_set))
print(max_len)

47121
212


In [24]:
def word_to_key(text):
  final_text = []
  for word in text.split():
      final_text.append(word_to_idx[word])
  if len(final_text) < max_len:
    final_text = final_text + [0] * (max_len - len(final_text))
  return final_text


train['word_to_key'] = train['text'].apply(word_to_key)

In [25]:
train

Unnamed: 0,index,text,author,preprocessed_text,word_to_key
0,0,almost choking much much wanted say strange ex...,3,"[0.3944874313763683, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[28230, 11428, 38330, 38330, 11299, 1882, 4185..."
1,1,sister asked suppose,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[19728, 25229, 44495, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,engaged one day walked perusing janes last let...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2660, 1344, 23079, 37070, 5034, 22636, 36822,..."
3,3,captain porch keeping carefully way treacherou...,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[5242, 8111, 1220, 12147, 37813, 44799, 37098,..."
4,4,mercy gentlemen odin flung hands dont write an...,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4911, 9076, 26493, 4853, 42184, 39580, 2325, ..."
...,...,...,...,...,...
54874,54874,mr smith odin whispered hardly dared hope woul...,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[8553, 30156, 26493, 8775, 26273, 25899, 21749..."
54875,54875,told plan captain us settled details accomplis...,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[5805, 44163, 5242, 39192, 37689, 41435, 5569,..."
54876,54876,sincere wellwisher friend sister lucy odin,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[5671, 46504, 28041, 19728, 9504, 26493, 0, 0,..."
54877,54877,wanted lend money,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[11299, 28244, 19100, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [26]:
X_train = train.iloc[:45000, 4]
X_test = train.iloc[45000:, 4].reset_index(drop=True)

y_train = train.iloc[:45000, 2]
y_test = train.iloc[45000:, 2].reset_index(drop=True)

In [27]:
#학습이 오래걸려서 하이퍼 파라미터를 변경

batch_size = 64
lr = 0.001
epochs = 30

In [28]:
class CustomDataset(Dataset):
  def __init__(self):
    
    self.x_data = X_train
    self.y_data = [[y] for y in y_train]

  def __len__(self):

    return len(self.x_data)

  def __getitem__(self, idx):

    x = torch.LongTensor(self.x_data[idx]).to(device)
    y = torch.LongTensor(self.y_data[idx]).to(device)

    return x,y

In [29]:
class CustomDataset_test(Dataset):
  def __init__(self):
    
    self.x_data = X_test
    self.y_data = [[y] for y in y_test]

  def __len__(self):

    return len(self.x_data)

  def __getitem__(self, idx):

    x = torch.LongTensor(self.x_data[idx])
    y = torch.LongTensor(self.y_data[idx])

    return x,y

In [30]:
dataset = CustomDataset()
dataloader = DataLoader(dataset, batch_size=batch_size) #sampler를 만들었다면, sampler를 파라미터로 넣어줄 수 있음 : https://hyelimkungkung.tistory.com/29?category=935193 참고 ㅎㅎ

In [31]:
class RNN(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
      super(RNN, self).__init__()
      self.n_layers = n_layers
      self.hidden_dim = hidden_dim

      self.embed = nn.Embedding(n_vocab, embed_dim)
      self.dropout = nn.Dropout(dropout_p)
      # self.gru = nn.GRU(embed_dim, self.hidden_dim,
      #                  num_layers=self.n_layers,
      #                  batch_first=True)
      self.rnn = nn.RNN(embed_dim, self.hidden_dim,batch_first = True)
      self.out = nn.Sequential(
          nn.Linear(self.hidden_dim, n_classes),
          nn.Softmax()
      )
    def forward(self, x):
      x = self.embed(x)
      h_0 = self._init_state(batch_size=x.size(0)) # 첫번째 히든 스테이트를 0벡터로 초기화
      #x, _ = self.gru(x, h_0)  # GRU의 리턴값은 (배치 크기, 시퀀스 길이, 은닉 상태의 크기)
      x, _ = self.rnn(x,h_0)
      h_t = x[:,-1,:] # (배치 크기, 은닉 상태의 크기)의 텐서로 크기가 변경됨. 즉, 마지막 time-step의 은닉 상태만 가져온다.
      self.dropout(h_t)
      logit = self.out(h_t)  # (배치 크기, 은닉 상태의 크기) -> (배치 크기, 출력층의 크기)
      return logit

    def _init_state(self, batch_size=1):
      weight = next(self.parameters()).data
      return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [32]:
n_vocab = 47120+1
embedd_size = 5
hidden_size = 100
output_size = 5

In [33]:
net = RNN(1, 256, n_vocab, embedd_size, output_size, 0.5).to(device)   

In [34]:
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(net.parameters(), lr)

In [36]:
losses = []
for epoch in range(epochs):
  
  for x, y in dataloader:
    optimizer.zero_grad()
    y = y.to(device)

    # forward 연산
    hypothesis = net(x)

    # 비용 함수
    y = y.squeeze()
    cost = criterion(hypothesis, y)
    cost.backward()
    optimizer.step()
    losses.append(cost.item()) # 값만 가져오기 위해서 .item()

  # 10의 배수에 해당되는 에포크마다 비용을 출력
  if epoch % 10 == 0:
      print(epoch, cost.item())

RuntimeError: ignored

In [None]:
dataset = CustomDataset_test()
test_loader = DataLoader(dataset, batch_size=batch_size)

In [None]:
correct = 0

with torch.no_grad():
  net = net.to('cpu')
  net.eval()
  for data, target in test_loader:
    data, target = data, target
    output = net(data)
    
    pred = output.max(1, keepdim=True)[1]
    # eq() 함수는 값이 일치하면 1을, 아니면 0을 출력.
    correct += pred.eq(target.view_as(pred)).sum().item()

test_accuracy = correct / len(test_loader.dataset)
print('Accuracy:', test_accuracy)

colab으로 올려주신 파일 먼저 돌려보고 과제하려고 했는데 구글에서 gpu사용량을 초과했다고 해서 다른계정으로 작업을 시도했습니다.. 그런데 계속 gpu 이슈때문에 생기는 error를 해결 못했고 cpu를 사용하자니 학습시간이 너무 오래걸려서 과제를 제대로 하지 못했습니다 죄송합니다