In [2]:
!pip install konlpy
!pip install gensim==3.6



In [3]:
import csv
import json
import os

from konlpy.tag import Okt

import numpy as np
from numpy import array
from numpy import zeros

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from gensim.models import FastText
from gensim.test.utils import datapath

import chakin

Using TensorFlow backend.


In [4]:
# ---- Run mode ---------------------------------------------------------------
# True  -> score the Kaggle test file and write a submission CSV.
# False -> evaluate on the labelled ratings_test.txt split.
KAGGLE_TEST = False

# ---- Training data location -------------------------------------------------
BASE_DIR = '/content/drive/Shared drives/NaturalLanguageProcessing2/korean/'
DATA_DIR = 'data/'

# ---- Kaggle test data location ----------------------------------------------
KAGGLE_TEST_BASE_DIR = BASE_DIR + 'test/'
KAGGLE_TEST_DATA_DIR = 'data/'
CSV_TEST_FILENAME = 'ko_data.csv'      # raw Kaggle file, read as cp949
TXT_TEST_FILENAME = 'kaggle_test.txt'  # UTF-8 copy of the file above

# ---- Model / embedding settings ---------------------------------------------
# Every document is padded or truncated to this many tokens.
INPUT_LENGTH = 32

# Dimensionality of the FastText word vectors.
EMBEDDING_SIZE = 300

# True -> load a pre-trained FastText binary; False -> train FastText on the
# training sentences.
USE_PRETRAINED_FASTTEXT = False

# MODEL_SELECTION : 0 = LSTM, 1 = neural network, 2 = CNN
MODEL_SELECTION = 1

In [5]:
# Re-encode the Kaggle CSV (cp949) into a UTF-8 text file named TXT_TEST_FILENAME.
if KAGGLE_TEST:
    source_path = KAGGLE_TEST_BASE_DIR + KAGGLE_TEST_DATA_DIR + CSV_TEST_FILENAME
    target_path = KAGGLE_TEST_BASE_DIR + TXT_TEST_FILENAME
    with open(source_path, 'r', encoding='cp949') as csv_file, \
            open(target_path, 'w', encoding='utf-8') as txt_file:
        # Copy line by line; only the encoding changes.
        for line in csv_file:
            txt_file.write(line)

In [6]:
#reading Kaggle test text file
if KAGGLE_TEST:
    def readKaggleFile(file_name):
        """Parse the UTF-8 Kaggle test file into a list of rows, without the header.

        The file is parsed with ``csv.reader`` instead of ``line.split(',')`` so
        that a quoted review containing commas stays in a single column rather
        than being cut at its first comma. Completely empty lines are skipped,
        so a trailing blank line no longer produces a row without a text column.

        Args:
            file_name: path of the comma-separated text file to read.

        Returns:
            A list of rows (each a list of column strings); the first
            non-empty row (the header) is dropped.
        """
        # newline='' lets the csv module handle line endings itself, including
        # newlines embedded in quoted fields.
        with open(file_name, 'r', encoding='utf-8', newline='') as file:
            data = [row for row in csv.reader(file) if row]
        return data[1:]

    test_text = readKaggleFile(KAGGLE_TEST_BASE_DIR + TXT_TEST_FILENAME)

    #tokenize test text file
    okt = Okt()

    def tokenize(doc, okt):
        """Return the normalised, stemmed tokens of ``doc`` (POS tags dropped)."""
        return [token for token, _ in okt.pos(doc, norm=True, stem=True)]

    # Column 1 of each row is used as the review text; the Kaggle file has no labels here.
    test_docs = [(tokenize(row[1], okt)) for row in test_text]

In [7]:
def readFile(file_name, encoding='utf-8'):
    """Read a tab-separated text file and return its rows without the header.

    The encoding is passed explicitly so that decoding the (Korean) text does
    not depend on the platform's default locale.

    Args:
        file_name: path of the tab-separated file.
        encoding: text encoding used to decode the file (default ``'utf-8'``).

    Returns:
        A list of rows, each a list of column strings; the first line of the
        file (the header) is dropped.
    """
    with open(file_name, 'r', encoding=encoding) as file:
        data = [line.split('\t') for line in file.read().splitlines(keepends=False)]
    # Drop the header row.
    return data[1:]

In [8]:
# Load the labelled training split; the labelled test split is only read when
# the Kaggle test file is not being used.
ratings_dir = BASE_DIR + DATA_DIR
train_text = readFile(ratings_dir + 'ratings_train.txt')
if not KAGGLE_TEST:
    test_text = readFile(ratings_dir + 'ratings_test.txt')

In [9]:
# Preview the text column (index 1) of up to the first five training rows.
# Slicing instead of indexing range(5) avoids an IndexError when fewer than
# five rows were loaded.
for sample_row in train_text[:5]:
    print(sample_row[1])

아 더빙.. 진짜 짜증나네요 목소리
흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
너무재밓었다그래서보는것을추천한다
교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정
사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다


In [10]:
def tokenize_tag(doc, okt):
    """Tokenise ``doc`` and return each token joined to its tag as 'token/tag'.

    Args:
        doc: the text to analyse.
        okt: an object exposing ``pos(doc, norm=..., stem=...)`` that returns
            an iterable of string sequences (here a konlpy ``Okt`` instance).

    Returns:
        A list with one '/'-joined string per analysed item.
    """
    tagged_items = okt.pos(doc, norm=True, stem=True)
    joined = []
    for item in tagged_items:
        joined.append('/'.join(item))
    return joined

def tokenize(doc, okt):
    """Tokenise ``doc`` and return only the tokens, discarding their tags.

    Args:
        doc: the text to analyse.
        okt: an object exposing ``pos(doc, norm=..., stem=...)`` that returns
            an iterable of (token, tag) pairs (here a konlpy ``Okt`` instance).

    Returns:
        The list of tokens in the order the analyser produced them.
    """
    tokens = []
    for morpheme, _tag in okt.pos(doc, norm=True, stem=True):
        tokens.append(morpheme)
    return tokens

In [11]:
#tokenize train/test text file, if tokenized train/text doc file already exist, load them
okt = Okt()

def _load_or_tokenize_docs(cache_path, rows, okt):
    """Return ``(tokens, label)`` pairs for ``rows``, using a JSON cache file.

    If ``cache_path`` exists it is loaded and returned as-is (JSON turns the
    pairs into two-element lists). Otherwise every row is tokenised
    (text in column 1, label in column 2), the result is written to
    ``cache_path`` and returned.

    Args:
        cache_path: path of the JSON cache file for this split.
        rows: rows as returned by ``readFile``.
        okt: tokenizer instance passed through to ``tokenize``.
    """
    if os.path.isfile(cache_path):
        # Read with the same encoding the cache is written with below.
        with open(cache_path, 'r', encoding="utf-8") as f:
            return json.load(f)
    docs = [(tokenize(row[1], okt), row[2]) for row in rows]
    with open(cache_path, 'w', encoding="utf-8") as make_file:
        json.dump(docs, make_file, ensure_ascii=False, indent="\t")
    return docs

# Each split has its own cache check, so a missing test_docs.json (always the
# case in Kaggle mode until a non-Kaggle run has happened) no longer forces
# the training set to be re-tokenised.
train_docs = _load_or_tokenize_docs(BASE_DIR+DATA_DIR+'train_docs.json', train_text, okt)
if not KAGGLE_TEST:
    test_docs = _load_or_tokenize_docs(BASE_DIR+DATA_DIR+'test_docs.json', test_text, okt)

In [12]:
# Split each (tokens, label) training pair into two parallel lists.
train_sentences = [doc[0] for doc in train_docs]
train_target = [doc[1] for doc in train_docs]

In [13]:
# Filter the stopwords out of every training sentence (in place), then show
# the first ten filtered sentences.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
train_sentences[:] = [
    [token for token in sentence if token not in stopwords]
    for sentence in train_sentences
]
print(train_sentences[:10])

[['아', '더빙', '..', '진짜', '짜증나다', '목소리'], ['흠', '...', '포스터', '보고', '초딩', '영화', '줄', '....', '오버', '연기', '조차', '가볍다', '않다'], ['너', '무재', '밓었', '다그', '래서', '보다', '추천', '다'], ['교도소', '이야기', '구먼', '..', '솔직하다', '재미', '없다', '..', '평점', '조정'], ['사이', '몬페', '그', '익살스럽다', '연기', '돋보이다', '영화', '!', '스파이더맨', '에서', '늙다', '보이다', '커스틴', '던스트', '너무나도', '이쁘다', '보이다'], ['막', '걸음', '마', '떼다', '3', '세', '부터', '초등학교', '1', '학년', '생인', '8', '살다', '영화', '.', 'ㅋㅋㅋ', '...', '별', '반개', '아깝다', '움', '.'], ['원작', '긴장감', '을', '제대로', '살리다', '.'], ['별', '반개', '아깝다', '욕', '나오다', '이응경', '길용우', '연', '기', '생활', '몇', '년', '인지', '..', '정말', '발', '로', '해도', '그것', '보단', '낫다', '납치', '.', '감금', '만', '반복', '반복', '..', '드라마', '가족', '없다', '연기', '못', '사람', '만', '모', '엿', '네'], ['액션', '없다', '재미', '있다', '몇', '안되다', '영화'], ['왜', '이렇게', '평점', '낮다', '?', '꽤', '볼', '만', '데', '..', '헐리우드', '식', '화려하다', '너무', '길들이다', '있다', '?']]


In [14]:
# define documents
docs = train_sentences
# define class labels
# The labels come from the text file as the strings '0'/'1'; convert them to
# numbers explicitly instead of relying on the backend to cast a string array.
labels = array(train_target, dtype='float32')
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
# +1 because Tokenizer word indices start at 1; index 0 is left for padding.
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
# pad documents to a max length of INPUT_LENGTH words
max_length = INPUT_LENGTH
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [15]:
#FastText model
if USE_PRETRAINED_FASTTEXT: #load pre-trained fasttext
    # NOTE(review): datapath() is gensim's test-data helper; it presumably
    # passes these absolute paths through unchanged -- confirm, or drop it.
    try:
        ko_model = FastText.load_fasttext_format(datapath(BASE_DIR + DATA_DIR + 'wiki.ko.bin'))
    except Exception as load_error:
        # Catch Exception rather than using a bare except, so Ctrl-C and
        # SystemExit still interrupt the cell, and report why we fall back.
        print('Could not load wiki.ko.bin (%r); falling back to cc.ko.300.bin' % (load_error,))
        ko_model = FastText.load_fasttext_format(datapath(BASE_DIR + DATA_DIR + 'cc.ko.300.bin'))
    # The pre-trained path fixes the embedding size to 300, overriding the config value.
    EMBEDDING_SIZE = 300
else: #make fasttext model and train with the training data
    ko_model = FastText(size=EMBEDDING_SIZE, window=3, min_count=1)
    ko_model.build_vocab(sentences=train_sentences)
    ko_model.train(sentences=train_sentences, total_examples=len(train_sentences), epochs=10)  # train

In [16]:
# create a weight matrix for words in training docs
embeddings_index = ko_model.wv

# Row i holds the vector of the word whose Tokenizer index is i; rows with no
# vector (including the unused row 0) stay all-zero.
embedding_matrix = zeros((vocab_size, EMBEDDING_SIZE))
for word, i in t.word_index.items():
    try:
        embedding_vector = embeddings_index[word]
    except KeyError:
        # Only a failed lookup is skipped; any other error is a real problem
        # and is no longer silently swallowed by a bare except.
        continue
    embedding_matrix[i] = embedding_vector

In [17]:
# Build test_sentences (and test_target when labels are available).
if KAGGLE_TEST:
    # Kaggle documents are bare token lists with no label.
    test_sentences = list(test_docs)
else:
    # Labelled documents are (tokens, label) pairs.
    test_sentences = [doc[0] for doc in test_docs]
    test_target = [doc[1] for doc in test_docs]

In [18]:
# Filter the same stopword list out of every test sentence (in place).
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
test_sentences[:] = [
    [token for token in sentence if token not in stopwords]
    for sentence in test_sentences
]

In [19]:
# define documents
# NOTE: this rebinds test_docs from the tokenised documents to the filtered
# sentences, so the "get test_sentences" cell must not be re-run after this one.
test_docs = test_sentences

# define class labels
if not KAGGLE_TEST:
    # Labels were read as the strings '0'/'1'; convert them to numbers
    # explicitly instead of relying on the backend to cast a string array.
    test_labels = array(test_target, dtype='float32')

# integer encode the documents (with the tokenizer fitted on the training data)
test_encoded_docs = t.texts_to_sequences(test_docs)

# pad documents to a max length of INPUT_LENGTH words
test_max_length = INPUT_LENGTH
test_padded_docs = pad_sequences(test_encoded_docs, maxlen=test_max_length, padding='post')

In [20]:
# CNN model (built only when MODEL_SELECTION == 2)
if MODEL_SELECTION == 2:
    from keras import optimizers
    from keras import backend as K
    from keras import regularizers
    from keras.models import Sequential
    from keras.layers import Dense, Activation, Dropout, Flatten
    from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
    from keras.utils import plot_model
    from keras.preprocessing import sequence
    from keras.preprocessing.text import Tokenizer
    from keras.callbacks import EarlyStopping

    # Training parameters, read later by the fit/evaluate cells.
    BATCH_SIZE = 256
    EPOCHS = 20

    # Model parameters (embed_dim is not used in this cell).
    num_filters = 64
    embed_dim = 300
    weight_decay = 1e-4

    # Frozen FastText embedding -> two width-7 convolutions with pooling ->
    # dropout -> small L2-regularised dense layer -> sigmoid output.
    model = Sequential([
        Embedding(vocab_size, EMBEDDING_SIZE, weights=[embedding_matrix],
                  input_length=INPUT_LENGTH, trainable=False),
        Conv1D(num_filters, 7, activation='relu', padding='same'),
        MaxPooling1D(2),
        Conv1D(num_filters, 7, activation='relu', padding='same'),
        GlobalMaxPooling1D(),
        Dropout(0.5),
        Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)),
        Dense(1, activation='sigmoid'),  # binary
    ])

    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()

In [21]:
# LSTM model (built only when MODEL_SELECTION == 0)
if MODEL_SELECTION == 0:
    from keras import optimizers
    from keras import backend as K
    from keras import regularizers
    from keras.models import Sequential
    from keras.layers import Dense, Activation, Dropout, Flatten
    from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
    from keras.layers import LSTM
    from keras.utils import plot_model
    from keras.preprocessing import sequence
    from keras.preprocessing.text import Tokenizer
    from keras.callbacks import EarlyStopping

    # Training parameters, read later by the fit/evaluate cells.
    BATCH_SIZE = 256
    EPOCHS = 20

    # Model parameters: num_filters is the number of LSTM units here;
    # embed_dim and weight_decay are not used by the active layers.
    num_filters = 128
    embed_dim = 300
    weight_decay = 1e-4

    # Frozen FastText embedding -> single LSTM -> dropout -> sigmoid output.
    # Alternatives the author left disabled: a trainable embedding, an extra
    # L2-regularised Dense(32) layer before the output, and lr=0.001.
    model = Sequential([
        Embedding(vocab_size, EMBEDDING_SIZE, weights=[embedding_matrix],
                  input_length=INPUT_LENGTH, trainable=False),
        LSTM(num_filters, dropout=0.2, recurrent_dropout=0.2),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ])

    adam = optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()

In [22]:
# neural network model (built only when MODEL_SELECTION == 1)
if MODEL_SELECTION == 1:
    from keras import optimizers
    from keras import backend as K
    from keras import regularizers
    from keras.models import Sequential
    from keras.layers import Dense, Activation, Dropout, Flatten
    from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
    from keras.layers import LSTM
    from keras.utils import plot_model
    from keras.preprocessing import sequence
    from keras.preprocessing.text import Tokenizer
    from keras.callbacks import EarlyStopping

    # Training parameters, read later by the fit/evaluate cells.
    BATCH_SIZE = 256
    EPOCHS = 20

    # Model parameters (none of these three is used by the layers below).
    num_filters = 128
    embed_dim = 300
    weight_decay = 1e-4

    # Frozen FastText embedding -> two Dense(64) layers applied at every token
    # position -> flatten -> sigmoid output.
    # Alternative the author left disabled: lr=0.01.
    model = Sequential([
        Embedding(vocab_size, EMBEDDING_SIZE, weights=[embedding_matrix],
                  input_length=INPUT_LENGTH, trainable=False),
        Dense(64, activation='relu'),
        Dense(64, activation='relu'),
        Flatten(),
        Dense(1, activation='sigmoid'),
    ])

    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 32, 300)           14632200  
_________________________________________________________________
dense_1 (Dense)              (None, 32, 64)            19264     
_________________________________________________________________
dense_2 (Dense)              (None, 32, 64)            4160      
_________________________________________________________________
flatten_1 (Flatten)          (None, 2048)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 2049      
Total params: 14,657,673
Trainable params: 25,473
Non-trainable params: 14,632,200
_________________________________________________________________


In [23]:
# Callbacks: early stopping monitored on the validation loss.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

# Train on the padded training documents, holding out 10% for validation.
fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=callbacks_list,
    validation_split=0.1,
    shuffle=True,
    verbose=1,
)
hist = model.fit(padded_docs, labels, **fit_options)
print(hist)

Train on 135000 samples, validate on 15000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 00006: early stopping
<keras.callbacks.callbacks.History object at 0x7f339c426e48>


In [24]:
# Evaluate on the labelled test split and report accuracy as a percentage.
if not KAGGLE_TEST:
    eval_result = model.evaluate(test_padded_docs, test_labels, batch_size=BATCH_SIZE, verbose=1)
    loss, accuracy = eval_result
    accuracy_percent = accuracy * 100
    print('Accuracy: %f' % accuracy_percent)

Accuracy: 83.260000


In [25]:
# Kaggle mode: predict labels for the test file and write the submission CSV.
if KAGGLE_TEST:
    # Sigmoid outputs flattened to 1-D, then thresholded: > 0.5 -> 1, otherwise 0.
    pred = model.predict(test_padded_docs).flatten()
    pred = (pred > 0.5).astype(int)

    OUTPUT_FILENAME = 'output.csv'

    def make_test_output(prediction_list):
        """Write predictions to KAGGLE_TEST_BASE_DIR + OUTPUT_FILENAME as CSV.

        The file gets a header row ``Id,Predicted`` followed by one row per
        prediction, where Id is the zero-based position in ``prediction_list``.

        Args:
            prediction_list: iterable of predicted labels, in test-file order.
        """
        # The with-block closes (and flushes) the file, which the original
        # open() without close() did not guarantee. `csv` comes from the
        # notebook's import cell.
        with open(KAGGLE_TEST_BASE_DIR + OUTPUT_FILENAME, 'w', encoding='utf-8', newline='') as f:
            wr = csv.writer(f)
            wr.writerow(['Id', 'Predicted'])
            for row_id, label in enumerate(prediction_list):
                wr.writerow([row_id, label])

    make_test_output(pred)