In [None]:
# konlpy Mecab 사용하기

!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x

In [None]:
!pip install glove_python

In [2]:
# 내 드라이브에 대한 주소
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
path = "/gdrive/My Drive/dacon_news"

train = pd.read_csv(path + "/data/news_train.csv")
test = pd.read_csv(path + "/data/news_test.csv")

# 학습시킬 말뭉치 준비

In [62]:
from konlpy.tag import Mecab
import re
from konlpy.tag import Okt

def text_preprocessing(text_list):
    
    stopwords = ['이']
    tokenizer = Mecab() #형태소 분석기

    token_list = [] 
    
    for text in text_list:
        txt = re.sub("[a-zA-Z0-9]", ' ', text) #영문 제거
        txt = re.sub('[가-힣]+\s기자','기자', txt) #기자 이름 제거
        token = tokenizer.morphs(txt) #형태소 분석

        token = [t for t in token if t not in stopwords] 
        token_list.append(token)
        
    return token_list, tokenizer

#형태소 분석기를 따로 저장한 이유는 후에 test 데이터 전처리를 진행할 때 이용해야 되기 때문입니다.
embedding_train = pd.concat([train["content"],test["content"]],axis=0)
embedding_train, okt = text_preprocessing(embedding_train)

# Glove


In [38]:
from glove import Corpus, Glove

corpus = Corpus() 
corpus.fit(embedding_train, window=10)
# 훈련 데이터로부터 GloVe에서 사용할 동시 등장 행렬 생성

glove = Glove(no_components=300, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=5, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

Performing 5 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4


# Word2Vec

In [6]:
from gensim.models.word2vec import Word2Vec

In [66]:
embedding_model = Word2Vec(embedding_train,         # 리스트 형태의 데이터
                 sg=1,          # 0: CBOW, 1: Skip-gram
                 size = 300,      # 벡터 크기(300)
                 window = 10,     # 고려할 앞뒤 폭(앞뒤 10단어)
                 min_count = 30,  # 사용할 단어의 최소 빈도(30회 이하 단어 무시)
                 workers = -1,    # 동시에 처리할 작업 수(코어 수와 비슷하게 설정)
                 iter = 30)    

In [67]:
embedding_model.save("embedding_300_10_30_iter_30.model")

In [49]:
embedding_model.most_similar("]")

  """Entry point for launching an IPython kernel.


[('하원', 0.19800397753715515),
 ('경매', 0.1971077024936676),
 ('현충원', 0.19422751665115356),
 ('연구자', 0.18848800659179688),
 ('벌어졌', 0.18307097256183624),
 ('그대로', 0.18146532773971558),
 ('구역', 0.17465075850486755),
 ('팬', 0.1736888587474823),
 ('사위', 0.17347624897956848),
 ('이끌', 0.17254045605659485)]

In [None]:
embedding_model.most_similar('카톡')

  """Entry point for launching an IPython kernel.


[('방사광', 0.2288285493850708),
 ('폰', 0.21296778321266174),
 ('대원', 0.210852712392807),
 ('노리', 0.20738786458969116),
 ('분양', 0.20020851492881775),
 ('봉', 0.19835519790649414),
 ('형사', 0.19768759608268738),
 ('마켓', 0.19744479656219482),
 ('기원', 0.19487667083740234),
 ('푸르', 0.19428278505802155)]

embedding_300_10_30_all : 영문, 숫자, '이', 기자 이름 제거  
embedding_300_10_30_final : 영문, 숫자만 제거  
embedding_300_10_30_all_new : 영문, 숫자만 제거, 기자 이름 제거 X

In [None]:
class Full_Lag_RNN(keras.Model):
  def __init__(self,units = 30, max_len = 50, activation = "relu", **kwargs):
    super().__init__(**kwargs)
    self.max_len = max_len
    self.hidden1 = keras.layers.SimpleRNN(units, activation = "selu")
    self.hidden2 = keras.layers.SimpleRNN(units, activation = "selu")

    self.output = keras.layers.Dense(1, activatino = "signoid")

  def call(self, inputs):
    input_ = inputs
    hidden1 = self.hidden1(input_)
    hidden2 = self.hidden2(hidden1)
    output = self.output(hidden2)
    return output