# title을 예측하는 model 생성

In [None]:
# Mount Google Drive at /gdrive so files stored there can be read by the notebook.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [None]:
# Install konlpy and run konlpy's Mecab installation script so that
# konlpy.tag.Mecab can be used in this runtime.
# NOTE(review): the script is fetched from the master branch and piped straight
# into bash, so its contents are not pinned to a reviewed revision.

!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Location of the data files inside the mounted Drive.
path = "/gdrive/My Drive/"

# Resolve the file locations first, then read them.
# The test split (news_test.csv) is not needed in this notebook and is not loaded.
train_csv = path + "news_train.csv"
submission_csv = path + "sample_submission.csv"

train = pd.read_csv(train_csv)
submission = pd.read_csv(submission_csv)

In [None]:
# Show (rows, columns) of the training frame as a quick sanity check.
print(train.shape)
#print(test.shape)

(118745, 6)


# 텍스트 전처리

In [None]:
from konlpy.tag import Mecab
import re
from konlpy.tag import Okt

"""'로','으로', '다', '했', '에', '의', '에서', '부터', '아', '하','고','도','것','그','으로','해진
['을', '를', '이', '가', '은', '는', 'null','부터','에','에서','하','고','으로','로','의','만','하','고','도','았','었','다'"""

def text_preprocessing(text_list):
    """Clean Korean texts and split them into morphemes with Mecab.

    For every text, a reporter byline ending in ``기자]`` is collapsed to the
    single word ``기자``, every remaining non-Hangul character is replaced by a
    space, and the result is tokenized into morphemes.

    Entries that are not strings (for example NaN coming from an empty CSV
    cell) produce an empty token list instead of raising ``TypeError``.

    Args:
        text_list: Iterable of raw texts.

    Returns:
        A ``(token_list, tokenizer)`` tuple: ``token_list`` holds one list of
        morpheme strings per input text, and ``tokenizer`` is the Mecab
        instance that produced them, so the same analyzer can be reused.
    """
    # No stopwords are configured; add morphemes here to have them dropped.
    stopwords = set()

    tokenizer = Mecab()  # morphological analyzer

    # The byline pattern depends on the closing ']' and therefore has to be
    # applied BEFORE non-Hangul characters are stripped; in the reverse order
    # the bracket is already gone and the pattern can never match.
    byline_pattern = re.compile(r'[가-힣\s]+기자\]')
    non_hangul_pattern = re.compile(r'[^가-힣]')

    token_list = []
    for text in text_list:
        if not isinstance(text, str):
            # Missing value: keep the row aligned with an empty token list.
            token_list.append([])
            continue

        txt = byline_pattern.sub('기자', text)    # drop the reporter's name
        txt = non_hangul_pattern.sub(' ', txt)   # keep Hangul syllables only
        morphemes = tokenizer.morphs(txt)
        token_list.append([t for t in morphemes if t not in stopwords])

    return token_list, tokenizer

# Tokenize the article sentences. The analyzer instance is kept because the
# same one is needed later when the test data is preprocessed.
article_tokens, mecab = text_preprocessing(train['content'])
train['new_article'] = article_tokens

# Titles go through the same preprocessing.
title_tokens, title_mecab = text_preprocessing(train['title'])
train['new_title'] = title_tokens

In [None]:
# Drop rows whose article yielded no tokens at all after preprocessing.
has_tokens = train["new_article"].map(len) > 0
train = train[has_tokens]

# Vectorization

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Sequence lengths (in tokens) used when padding/truncating articles and titles.
max_len = 40
title_max_len = 15

def text2sequence(train_text, max_len=100):
    """Fit a Keras ``Tokenizer`` on ``train_text`` and encode it as padded id sequences.

    Args:
        train_text: Texts (or token lists) used both to fit the tokenizer and
            to be encoded.
        max_len: Length every encoded sequence is padded or truncated to.

    Returns:
        ``(X_train, vocab_size, tokenizer)``: the padded id array, the number
        of entries in the tokenizer's word index plus one, and the fitted
        tokenizer. The vocabulary size is also printed.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(train_text)

    # One extra slot on top of the word index entries.
    vocab_size = len(fitted.word_index) + 1
    print('vocab_size : ', vocab_size)

    encoded = fitted.texts_to_sequences(train_text)
    # Sequences longer than max_len lose their trailing tokens.
    X_train = pad_sequences(encoded, maxlen=max_len, truncating="post")
    return X_train, vocab_size, fitted

# Labels, plus padded id sequences for the article sentences and for the titles.
train_y = train['info']

article_encoding = text2sequence(train['new_article'], max_len=max_len)
train_X, vocab_size, vectorizer = article_encoding

title_encoding = text2sequence(train['new_title'], max_len=title_max_len)
title_X, title_vocab_size, title_vectorizer = title_encoding

print(train_X.shape, train_y.shape)

vocab_size :  33461
vocab_size :  7926
(118414, 40) (118414,)


# word2vec

In [None]:
import gensim
from gensim.models.keyedvectors import KeyedVectors
# Drive folder that holds the pretrained word2vec file loaded below.
path = "/gdrive/My Drive/"

In [None]:
# Pretrained Korean word2vec model; embedding_size is the vector width used
# for the embedding matrices built from it.
embedding_size = 200
word2vec_path = path + 'ko.bin'
word2vec = gensim.models.Word2Vec.load(word2vec_path)

In [None]:
# Build the initial embedding weights for the article and title vocabularies
# from the pretrained word2vec vectors. Words without a pretrained vector keep
# an all-zero row.
vocab = vectorizer.word_index
title_vocab = title_vectorizer.word_index

# Newer gensim releases expose lookups on `.wv`; fall back to the model object
# itself for old releases that are indexable directly.
word_vectors = getattr(word2vec, "wv", word2vec)


def _fill_embedding_matrix(word_index, vectors, matrix):
    """Copy pretrained vectors into ``matrix`` and return the number of misses.

    ``word_index`` maps each word to the integer id the tokenizer emits for it,
    so the row that is written is that id. Using the enumeration position
    instead would shift every vector by one row, because the ids start at 1.
    """
    missing = 0
    for word, idx in word_index.items():
        if word in vectors:
            matrix[idx] = vectors[word]
        else:
            # Not covered by the pretrained model; the row stays zero.
            missing += 1
    return missing


embedding_matrix = np.zeros((vocab_size, embedding_size))
title_embedding_matrix = np.zeros((title_vocab_size, embedding_size))

# `count` is the total number of out-of-vocabulary words over both vocabularies.
count = _fill_embedding_matrix(vocab, word_vectors, embedding_matrix)
count += _fill_embedding_matrix(title_vocab, word_vectors, title_embedding_matrix)


In [None]:
# Relative position of a sentence inside its article: `ord` divided by the
# number of rows (with a non-null n_id) that share the same title.
title_group = train.groupby("title")["n_id"].count()

# Vectorized lookup of each row's group size; replaces a Python-level
# row-by-row apply over the whole frame.
train["new_ord"] = train["ord"] / train["title"].map(title_group)

In [None]:
# Append the relative sentence position as one extra trailing column of title_X.
new_ord_column = train["new_ord"].to_numpy().reshape(-1, 1)
title_X = np.hstack((title_X, new_ord_column))

In [None]:
# Sentence-level split into training and validation sets (70 % / 30 %).
# (An article-level split based on n_id was sketched here but never implemented.)
from sklearn.model_selection import train_test_split

# Both calls share the same seed and test fraction.
split_options = dict(test_size=0.3, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(
    train_X, train_y, **split_options
)
X_title_train, X_title_valid, y_title_train, y_title_valid = train_test_split(
    title_X, train_y, **split_options
)

In [None]:
# EDA기반으로 만든 feature 예측변수로 추가
from keras import regularizers

def title_LSTM2(title_vocab_size, embedding_size = 200, title_max_len=100):
  """Build and compile the two-input title classifier.

  The first input is the padded word-id sequence of a title, which goes
  through an embedding layer and two stacked LSTMs. The second input is a
  single extra feature that is concatenated with the LSTM output before the
  dense head. The model ends in one sigmoid unit and is compiled with binary
  cross-entropy and accuracy. A summary of the model is printed.

  Two module-level names are read when the function is called:
  ``title_embedding_matrix`` (initial embedding weights) and
  ``learning_rate`` (Adam step size).

  Args:
    title_vocab_size: Number of rows of the embedding table.
    embedding_size: Width of each embedding vector.
    title_max_len: Length of the word-id sequence input.

  Returns:
    The compiled ``keras.Model``.
  """
  input1 = keras.layers.Input(shape = [title_max_len,]) # word-id sequence
  input2 = keras.layers.Input(shape = [1,]) # extra EDA-derived feature

  # LSTM branch
  embedding = keras.layers.Embedding(title_vocab_size, embedding_size, weights = [title_embedding_matrix], input_length = title_max_len)(input1) # start from the pretrained weights
  dropout1 = keras.layers.SpatialDropout1D(0.1)(embedding)
  lstm1 = keras.layers.LSTM(32, return_sequences = True)(dropout1)
  lstm2 = keras.layers.LSTM(32)(lstm1)
  dropout2 = keras.layers.Dropout(0.3)(lstm2)

  # MLP head over the LSTM output joined with the extra feature
  concat = keras.layers.concatenate([dropout2,input2])
  hidden = keras.layers.Dense(16, activation = "selu")(concat)
  output = keras.layers.Dense(1, activation = "sigmoid")(hidden)

  model = keras.Model(inputs = [input1, input2], outputs = [output])

  # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
  # newer Keras releases warn about or reject. Metrics are passed as a list.
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss="binary_crossentropy", metrics = ["accuracy"])
  model.summary()
  return model


In [None]:
# Training run with a held-out validation split.
tf.random.set_seed(42)

# Only the best model seen so far is written to the checkpoint file.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    "hyerim_add_feature_best_model2.h5",
    save_best_only=True,
)

# Hyperparameters
max_epoch = 50
batch_size = 100
learning_rate = 0.001

model = title_LSTM2(title_vocab_size, title_max_len=title_max_len)

# The first title_max_len columns are the word ids; the last column is the
# extra feature.
train_inputs = [X_title_train[:, :title_max_len], X_title_train[:, -1:]]
valid_inputs = (X_title_valid[:, :title_max_len], X_title_valid[:, -1:])

history = model.fit(
    x=train_inputs,
    y=y_train,
    epochs=max_epoch,
    batch_size=batch_size,
    validation_data=(valid_inputs, y_valid),
    validation_batch_size=batch_size,
    callbacks=[checkpoint_cb],
)

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           [(None, 15)]         0                                            
__________________________________________________________________________________________________
embedding_22 (Embedding)        (None, 15, 200)      1585200     input_21[0][0]                   
__________________________________________________________________________________________________
spatial_dropout1d_20 (SpatialDr (None, 15, 200)      0           embedding_22[0][0]               
__________________________________________________________________________________________________
lstm_40 (LSTM)                  (None, 15, 32)       29824       spatial_dropout1d_20[0][0]       
____________________________________________________________________________________________

In [None]:
# Continue training the existing model for up to 30 more epochs with the same
# inputs, validation data and checkpoint callback.
# Original note: "epoch : 52" (presumably the total epoch count kept for the
# final run; confirm).
resume_train_inputs = [X_title_train[:, :title_max_len], X_title_train[:, -1:]]
resume_valid_inputs = (X_title_valid[:, :title_max_len], X_title_valid[:, -1:])

history = model.fit(
    x=resume_train_inputs,
    y=y_train,
    epochs=30,
    batch_size=batch_size,
    validation_data=(resume_valid_inputs, y_valid),
    validation_batch_size=batch_size,
    callbacks=[checkpoint_cb],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
155/829 [====>.........................] - ETA: 28s - loss: 0.1822 - accuracy: 0.9461

KeyboardInterrupt: ignored

In [None]:
# Reload the model from the file the training checkpoint callback writes to.
best_model = keras.models.load_model("hyerim_add_feature_best_model2.h5")

In [None]:
# Score every row with the reloaded model, store the score next to the data
# and export the frame as CSV.
all_title_inputs = (title_X[:, :title_max_len], title_X[:, -1:])
predicted = best_model.predict(all_title_inputs)

train["title_predicted"] = predicted
train.to_csv("try_title_data.csv", index=False, encoding="utf-8-sig")

# 최종 훈련

In [None]:
# Final training on the full data set (no validation split).
tf.random.set_seed(42)


# fit() below receives no validation data, so the callback's default monitor
# ("val_loss") would never be available and nothing would be checkpointed.
# Track the training loss instead.
checkpoint_cb = keras.callbacks.ModelCheckpoint("title_model.h5",
                                               monitor = "loss",
                                               save_best_only = True)

# Hyperparameters
max_epoch = 52
batch_size = 100
learning_rate = 0.001

model = title_LSTM2(title_vocab_size, title_max_len = title_max_len)
# The first title_max_len columns are the word ids; the last column is the
# extra feature.
history = model.fit(x=[title_X[:,:title_max_len],title_X[:,-1:]], y=train_y, epochs=max_epoch,
                batch_size = batch_size, callbacks = [checkpoint_cb])

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_27 (InputLayer)           [(None, 15)]         0                                            
__________________________________________________________________________________________________
embedding_25 (Embedding)        (None, 15, 200)      1585200     input_27[0][0]                   
__________________________________________________________________________________________________
spatial_dropout1d_23 (SpatialDr (None, 15, 200)      0           embedding_25[0][0]               
__________________________________________________________________________________________________
lstm_46 (LSTM)                  (None, 15, 32)       29824       spatial_dropout1d_23[0][0]       
___________________________________________________________________________________________

In [None]:
# Save the model in its current (last-epoch) state.
# NOTE(review): this is the same path the final-training ModelCheckpoint
# callback targets, so any checkpoint written there is overwritten.
model.save("title_model.h5")