In [10]:
import os
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import random

import pandas as pd
import numpy as np
import re

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

torch.manual_seed(777)
if device == 'cuda':
  torch.cuda.manual_seed_all(777)

In [3]:
batch_size = 64
lr = 0.001
epochs = 50

In [4]:
train = pd.read_csv('/content/drive/MyDrive/235670_소설 작가 분류 AI 경진대회_data/train.csv', encoding = 'utf-8')
test = pd.read_csv('/content/drive/MyDrive/235670_소설 작가 분류 AI 경진대회_data/test_x.csv', encoding = 'utf-8')
sample_submission = pd.read_csv('/content/drive/MyDrive/235670_소설 작가 분류 AI 경진대회_data/sample_submission.csv', encoding = 'utf-8')

In [5]:
train

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
...,...,...,...
54874,54874,"“Is that you, Mr. Smith?” odin whispered. “I h...",2
54875,54875,"I told my plan to the captain, and between us ...",4
54876,54876,"""Your sincere well-wisher, friend, and sister...",1
54877,54877,“Then you wanted me to lend you money?”,3


In [6]:
#부호를 제거해주는 함수
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

train['text']=train['text'].apply(alpha_num)

In [7]:
import nltk
from nltk.corpus import stopwords 

nltk.download('stopwords')
stopwords = stopwords.words('english')

print(stopwords[:10])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [8]:
!pip install konlpy
!sudo apt-get install curl git
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.3 MB/s 
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 42.8 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.3.0 konlpy-0.6.0
Reading package lists... Done
Building dependency tree       
Reading state information... Done
curl is already the newest version (7.58.0-2ubuntu3.16).
git is already the newest version (1:2.17.1-1ubuntu0.9).
The following package was automatically installed and is no longer required:
  libnvidia-common-470
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.
Installing automake (A dependency for mecab-ko)
Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidi

In [9]:
from konlpy.tag import Kkma, Komoran, Okt, Mecab

mec = Mecab()
okt = Okt()
kkm = Kkma()
kom = Komoran()

text = "쿠빅 콘테스트 이거 큰일이네요."
print(mec.morphs(text),'MECAB')
print(okt.morphs(text),'OKT')
print(kkm.morphs(text),'KKMA')
print(kom.morphs(text),'KOMORAN')

['쿠', '빅', '콘테스트', '이거', '큰일', '이', '네요', '.'] MECAB
['쿠빅', '콘테스트', '이', '거', '큰일', '이네', '요', '.'] OKT
['쿠빅', '콘테스트', '이거', '큰일', '이', '네요', '.'] KKMA
['쿠', '빅', '콘테스트', '이것', '큰일', '이', '네요', '.'] KOMORAN


MECAB보단 OKT 형태소 분석기가 우리가 받아들이기에 더 정확함을 볼 수 있었다. 

In [11]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

text = "쿠빅 콘테스트 이거 정말 큰일이네요."
tokenizer.tokenize(text)

['쿠빅', '콘테스트', '이거', '정말', '큰일이네요', '.']

konlpy 이외의 카카오톡에서 개발한 khaii api를 사용하여 형태소 분석을 해보자하였다.
https://github.com/kakao/khaiii - khaiii api 깃허브

In [12]:
!git clone https://github.com/kakao/khaiii.git
!pip install cmake
!mkdir build
!cd build && cmake /content/khaiii
!cd /content/build/ && make all
!cd /content/build/ && make resource
!cd /content/build && make install
!cd /content/build && make package_python
!pip install /content/build/package_python

Cloning into 'khaiii'...
remote: Enumerating objects: 1024, done.[K
remote: Counting objects:  12% (1/8)[Kremote: Counting objects:  25% (2/8)[Kremote: Counting objects:  37% (3/8)[Kremote: Counting objects:  50% (4/8)[Kremote: Counting objects:  62% (5/8)[Kremote: Counting objects:  75% (6/8)[Kremote: Counting objects:  87% (7/8)[Kremote: Counting objects: 100% (8/8)[Kremote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 1024 (delta 3), reused 0 (delta 0), pack-reused 1016[K
Receiving objects: 100% (1024/1024), 33.03 MiB | 25.70 MiB/s, done.
Resolving deltas: 100% (436/436), done.
-- [hunter] Initializing Hunter workspace (70287b1ffa810ee4e952052a9adff9b4856d0d54)
-- [hunter]   https://github.com/ruslo/hunter/archive/v0.23.34.tar.gz
-- [hunter]   -> /root/.hunter/_Base/Download/Hunter/0.23.34/70287b1
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for worki

In [14]:
from typing import *
from khaiii import KhaiiiApi
def get_token(text: str, tokenizer)-> List[Tuple]:
    
    if len(text)== 0 or text== ' ':  # 제목이 공백인 경우 tokenizer에러 발생
        return []
    
    result = tokenizer.analyze(text)
    result = [(morph.lex, morph.tag) for split in result for morph in split.morphs]  # (형태소, 품사) 튜플의 리스트
    return result

In [15]:
get_token(text,KhaiiiApi())

[('쿠빅', 'NNG'),
 ('콘테스트', 'NNG'),
 ('이거', 'NP'),
 ('정말', 'MAG'),
 ('크', 'VA'),
 ('ㄴ', 'ETM'),
 ('일', 'NNG'),
 ('이', 'VCP'),
 ('네요', 'EF'),
 ('.', 'SF')]

한글에 특화되어서 각각의 단어별 요소와 받침에 담긴 의미까지 분석하는 모습을 확인할 수 있다.

In [16]:
train['text'][0]

'He was almost choking There was so much so much he wanted to say but strange exclamations were all that came from his lips The Pole gazed fixedly at him at the bundle of notes in his hand looked at odin and was in evident perplexity'

In [17]:
get_token(train['text'][0], KhaiiiApi())

[('He', 'SL'),
 ('was', 'SL'),
 ('almost', 'SL'),
 ('choking', 'SL'),
 ('There', 'SL'),
 ('was', 'SL'),
 ('so', 'SL'),
 ('much', 'SL'),
 ('so', 'SL'),
 ('much', 'SL'),
 ('he', 'SL'),
 ('wanted', 'SL'),
 ('to', 'SL'),
 ('say', 'SL'),
 ('but', 'SL'),
 ('strange', 'SL'),
 ('exclamations', 'SL'),
 ('were', 'SL'),
 ('all', 'SL'),
 ('that', 'SL'),
 ('came', 'SL'),
 ('from', 'SL'),
 ('his', 'SL'),
 ('lips', 'SL'),
 ('The', 'SL'),
 ('Pole', 'SL'),
 ('gazed', 'SL'),
 ('fixedly', 'SL'),
 ('at', 'SL'),
 ('him', 'SL'),
 ('at', 'SL'),
 ('the', 'SL'),
 ('bundle', 'SL'),
 ('of', 'SL'),
 ('notes', 'SL'),
 ('in', 'SL'),
 ('his', 'SL'),
 ('hand', 'SL'),
 ('looked', 'SL'),
 ('at', 'SL'),
 ('odin', 'SL'),
 ('and', 'SL'),
 ('was', 'SL'),
 ('in', 'SL'),
 ('evident', 'SL'),
 ('perplexity', 'SL')]

하지만 영어는 토큰화가 뛰어쓰기를 통해 진행하여도 상관없기에 KhaiiiApi에 의한 성능의 향상은 없다

In [18]:
def remove_stopwords(text):
    final_text = []
    words = tokenizer.tokenize(text)
    for word in words:
        if word.strip().lower() not in stopwords:
          final_text.append(word.strip())
    return  " ".join(final_text)


train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()
train['text'] = train['text'].apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].apply(alpha_num).apply(remove_stopwords)

In [19]:
train

Unnamed: 0,index,text,author
0,0,almost choking much much wanted say strange ex...,3
1,1,sister asked suppose,2
2,2,engaged one day walked perusing janes last let...,1
3,3,captain porch keeping carefully way treacherou...,4
4,4,mercy gentlemen odin flung hands dont write an...,3
...,...,...,...
54874,54874,mr smith odin whispered hardly dared hope woul...,2
54875,54875,told plan captain us settled details accomplis...,4
54876,54876,sincere wellwisher friend sister lucy odin,1
54877,54877,wanted lend money,3


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
#v = TfidfVectorizer(max_features = 100) 
hv = HashingVectorizer(n_features = 100) #hashingvectorizer는 크기가 큰 데이터의 경우 해시 데이터 구조를 통해 더 빠르게 연산을 수행할 수 있도록 도와준다
hv.fit(train['text'])

HashingVectorizer(n_features=100)

In [21]:
x = hv.transform(train['text']).toarray()
print(x)

[[ 0.          0.          0.2236068  ...  0.          0.2236068
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.49319696 ...  0.         -0.16439899
   0.        ]
 ...
 [ 0.          0.          0.40824829 ...  0.          0.
   0.        ]
 [ 0.          0.         -0.57735027 ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [22]:
print(x.shape) 
print(train.shape)

(54879, 100)
(54879, 3)


In [23]:
x = pd.DataFrame(x)
temp = []
for i in range(len(x)):
  temp.append(list(x.iloc[i,:]))
train['preprocessed_text'] = temp

In [24]:
train

Unnamed: 0,index,text,author,preprocessed_text
0,0,almost choking much much wanted say strange ex...,3,"[0.0, 0.0, 0.22360679774997896, -0.22360679774..."
1,1,sister asked suppose,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,engaged one day walked perusing janes last let...,1,"[0.0, 0.0, 0.4931969619160719, 0.0, 0.0, 0.0, ..."
3,3,captain porch keeping carefully way treacherou...,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1924500..."
4,4,mercy gentlemen odin flung hands dont write an...,3,"[0.0, 0.23570226039551587, 0.23570226039551587..."
...,...,...,...,...
54874,54874,mr smith odin whispered hardly dared hope woul...,2,"[0.0, 0.0, 0.30151134457776363, 0.0, 0.0, 0.0,..."
54875,54875,told plan captain us settled details accomplis...,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3779644730092..."
54876,54876,sincere wellwisher friend sister lucy odin,1,"[0.0, 0.0, 0.4082482904638631, 0.0, 0.0, 0.0, ..."
54877,54877,wanted lend money,3,"[0.0, 0.0, -0.5773502691896258, 0.0, 0.0, 0.0,..."


In [27]:

word_set = []
max_len = 0

for d in train['text']:
  word_set = word_set + d.split(' ') # 여기에 토큰화한 데이터가 들어가면 됩니다
  if len(d.split()) > max_len:
    max_len = len(d.split())
  
word_set = set(word_set)

In [28]:
word_to_idx = {word: i+1 for i, word in enumerate(word_set)}
print(len(word_set))
print(max_len)

47120
212


In [29]:
def word_to_key(text):
  final_text = []
  for word in text.split():
      final_text.append(word_to_idx[word])
  if len(final_text) < max_len:
    final_text = final_text + [0] * (max_len - len(final_text))
  return final_text


train['word_to_key'] = train['text'].apply(word_to_key)

In [30]:
X_train = train.iloc[:45000, 4]
X_test = train.iloc[45000:, 4].reset_index(drop=True)

y_train = train.iloc[:45000, 2]
y_test = train.iloc[45000:, 2].reset_index(drop=True)


In [31]:
class CustomDataset(Dataset):
  def __init__(self):
    
    self.x_data = X_train
    self.y_data = [[y] for y in y_train]

  def __len__(self):

    return len(self.x_data)

  def __getitem__(self, idx):

    x = torch.LongTensor(self.x_data[idx]).to(device)
    y = torch.LongTensor(self.y_data[idx]).to(device)

    return x,y

In [32]:
dataset = CustomDataset()
dataloader = DataLoader(dataset, batch_size=batch_size)

내 발표 범위가 gru 이기 떄문에 gru로 함 진행해보았다~

In [33]:
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
      super(GRU, self).__init__()
      self.n_layers = n_layers
      self.hidden_dim = hidden_dim

      self.embed = nn.Embedding(n_vocab, embed_dim)
      self.dropout = nn.Dropout(dropout_p)
      self.gru = nn.GRU(embed_dim, self.hidden_dim,
                        num_layers=self.n_layers,
                        batch_first=True)
      #self.rnn = nn.RNN(embed_dim, self.hidden_dim,batch_first = True)
      self.out = nn.Sequential(
          nn.Linear(self.hidden_dim, n_classes),
          nn.Softmax()
      )
    def forward(self, x):
      x = self.embed(x)
      h_0 = self._init_state(batch_size=x.size(0)) # 첫번째 히든 스테이트를 0벡터로 초기화
      x, _ = self.gru(x, h_0)  # GRU의 리턴값은 (배치 크기, 시퀀스 길이, 은닉 상태의 크기)
      #x, _ = self.rnn(x,h_0)
      h_t = x[:,-1,:] # (배치 크기, 은닉 상태의 크기)의 텐서로 크기가 변경됨. 즉, 마지막 time-step의 은닉 상태만 가져온다.
      self.dropout(h_t)
      logit = self.out(h_t)  # (배치 크기, 은닉 상태의 크기) -> (배치 크기, 출력층의 크기)
      return logit

    def _init_state(self, batch_size=1):
      weight = next(self.parameters()).data
      return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [34]:
n_vocab = 47120+1
embedd_size = 5
hidden_size = 100
output_size = 5

In [35]:
net =GRU(1, 256, n_vocab, embedd_size, output_size, 0.5).to(device)

In [36]:
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(net.parameters(), lr)

In [37]:
losses = []
for epoch in range(50):
  
  for x, y in dataloader:
    optimizer.zero_grad()
    y = y.to(device)

    # forward 연산
    hypothesis = net(x)

    # 비용 함수
    y = y.squeeze()
    cost = criterion(hypothesis, y)
    cost.backward()
    optimizer.step()
    losses.append(cost.item()) # 값만 가져오기 위해서 .item()

  # 10의 배수에 해당되는 에포크마다 비용을 출력
  if epoch % 10 == 0:
      print(epoch, cost.item())

  input = module(input)


0 1.5179630517959595
10 1.5096793174743652
20 1.5076931715011597
30 1.3523322343826294
40 1.1983028650283813


In [38]:
class CustomDataset_test(Dataset):
  def __init__(self):
    
    self.x_data = X_test
    self.y_data = [[y] for y in y_test]

  def __len__(self):

    return len(self.x_data)

  def __getitem__(self, idx):

    x = torch.LongTensor(self.x_data[idx])
    y = torch.LongTensor(self.y_data[idx])

    return x,y

In [39]:
dataset = CustomDataset_test()
test_loader = DataLoader(dataset, batch_size=batch_size)

In [40]:
correct = 0

with torch.no_grad():
  net = net.to('cpu')
  net.eval()
  for data, target in test_loader:
    data, target = data, target
    output = net(data)
    
    pred = output.max(1, keepdim=True)[1]
    # eq() 함수는 값이 일치하면 1을, 아니면 0을 출력.
    correct += pred.eq(target.view_as(pred)).sum().item()

test_accuracy = correct / len(test_loader.dataset)
print('Accuracy:', test_accuracy)

  input = module(input)


Accuracy: 0.3531733981172183
