#**EDA를 통한 변수 추가 모델**

In [None]:
# konlpy Mecab 사용하기

!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x

In [2]:
# 내 드라이브에 대한 주소
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import seaborn as sns
import matplotlib.pyplot as plt

In [49]:
# 내 드라이브에 대한 주소
path = "/gdrive//My Drive/dacon_news/"

train = pd.read_csv(path + "data/news_train.csv")
#try_train = pd.read_csv("try_title_data.csv")
test = pd.read_csv(path + "data/news_test.csv")

# content 맨앞이 [이거나 (이면 0

In [50]:
train["content_startswith_["]=train.content.apply(lambda x : str(x).startswith("[" ) or str(x).startswith("(") or str(x).startswith("제목"))+0

# 타이틀을 이용한 feature
- 해당 title에 몇가지 단어가 들어갈 경우 약 90% 이상이 info가 1

In [305]:
"""
title_noise = ['적중 100%', '글로벌 주요 뉴스', '[전문가 의견]', 
               '[포커스]', '※','■', '▶' ,'◆','★',' TOP', 'BEST',
'전문가의 눈', '전문가선정', '전문가의견','】','후속주도 감사합니다',
               '전문가추천', '주요이슈']

def title_choose(x):
  if ("종목" in x["title"][-6:]) or ("관련주" in x["title"][-5:]):
    return 1
  elif ('00%' in x["content"]) or (x["content"]=="코스피") or (x["content"]=="코스닥"):
    return 1
  elif ('TOP' in x["content"].upper()) or ('BEST' in x["content"].upper()):
    return 1
  for noise in title_noise:
    if noise in x["title"].upper():
      if (x["content"][:2]=="제목") or (x["content"][:1]=="["):
        return 0
      else:
        return 1
  return 0
  """

In [51]:
title_noise = ['적중 100%', '글로벌 주요 뉴스', '[전문가 의견]', 
               '[포커스]', '※','■', '▶' ,'◆','★',' TOP', 'BEST',
'전문가의 눈', '전문가선정', '전문가의견','】','후속주도 감사합니다',
               '전문가추천', '주요이슈']

def title_choose(x):
  if ("종목" in x[-6:]) or ("관련주" in x[-5:]):
    return 1
  for noise in title_noise:
    if noise in x.upper():
        return 1
  return 0

In [52]:
train["info1_title"]=train['title'].apply(title_choose)

# content를 이용한 feature

In [53]:
content_noise = ['00%', '긴급공개', '긴급 공개','임상3상', 'TOP','BEST','# ','대장株','대장주','카톡','원"만']

def content_choose(x):
  if (x=='관련기사') or (x=="관련 테마분석") or (x=="코스피") or (x=="코스닥"):
    return 1
  for noise in content_noise:
    if noise in x.upper():
        return 1
  return 0

In [54]:
train["info1_content"]=train["content"].apply(content_choose)

# Order을 이용한 feature

In [55]:
title_group = (train.groupby(["title"]).count())["n_id"]
train["new_ord"]=train.apply(lambda x: x["ord"]/title_group[x["title"]], axis=1)

# tokenization

In [56]:
from konlpy.tag import Mecab
import re

def text_preprocessing(text_list):
    
    stopwords = [] #불용어 설정
    
    tokenizer = Mecab() #형태소 분석기 
    token_list = [] 
    
    for text in text_list:
        txt = re.sub('[^가-힣]', ' ', text) #한글만 남기고 다른 글자 모두 제거
        txt = re.sub('[가-힣\s]+기자]','기자', txt) #기자 이름 제거
        token = tokenizer.morphs(txt) #형태소 분석

        #형태소 분석 결과 중 stopwords에 해당하지 않고, float type이 아닌 것만 수집
        token = [t for t in token] 
        token_list.append(token)
        
    return token_list, tokenizer

#형태소 분석기를 따로 저장한 이유는 후에 test 데이터 전처리를 진행할 때 이용해야 되기 때문입니다. 
train['new_article'], okt = text_preprocessing(train['content'])

In [57]:
# 결측치 제거
train = train[train["new_article"].apply(lambda x: False if len(x)==0 else True)]

# Vectorization

In [58]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len = 60

def text2sequence(train_text, max_len=100):
    
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_text)
    train_X_seq = tokenizer.texts_to_sequences(train_text)
    vocab_size = len(tokenizer.word_index) + 1
    print('vocab_size : ', vocab_size)
    X_train = pad_sequences(train_X_seq, maxlen = max_len, truncating="post") # 길이를 맞춰줌
    return X_train, vocab_size, tokenizer

train_y = train['info']
train_X, vocab_size, vectorizer = text2sequence(train['new_article'], max_len = max_len)

print(train_X.shape, train_y.shape)

vocab_size :  33461
(118414, 60) (118414,)


# Word2Vec

In [14]:
import gensim
from gensim.models.keyedvectors import KeyedVectors

In [15]:
# 구글 word2vec 모델

word2vec_g = gensim.models.KeyedVectors.load_word2vec_format(path+'embedding/GoogleNews-vectors-negative300.bin.gz', binary = True)
embedding_size = 300

In [348]:
# 한국어 word2vec model
#word2vec2 = gensim.models.Word2Vec.load(path+'embedding/ko.bin')
#embedding_size = 200

In [59]:
embedding_matrix = np.zeros((vocab_size, embedding_size))
vocab = vectorizer.word_index
count = 0

for idx, word in enumerate(vocab):
    if word in word2vec_g:
      embedding_vector = word2vec_g[word]
      embedding_matrix[idx] = embedding_vector
    else: # 임베딩 모델에 없는 것
      #print(word, "word2vec에 없는 단어입니다.")
      count += 1
      pass

print('count: ', count)  

count:  33427


#train test split

In [60]:
#concat
feature_num = 4
train_X = np.concatenate([train_X,train[["info1_title","info1_content","new_ord","content_startswith_["]].values.reshape(-1,feature_num)], axis=1)

# 문장별로 train_test set 분리
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, random_state = 42, test_size = 0.3)

In [None]:
index_seq = (X_train.argmax(axis=1)).argsort(axis=0)
X_train = X_train[index_seq]
y_train = y_train.iloc[index_seq]

#모델링

In [61]:
# EDA기반으로 만든 feature 예측변수로 추가
from keras import regularizers

def LSTM_add_feature(vocab_size, embedding_size = 200, max_len=100):
  input1 = keras.layers.Input(shape = [max_len,]) #문장 단어 input
  input2 = keras.layers.Input(shape = [feature_num,]) # EDA기반 feature input

  # LSTM
  embedding = keras.layers.Embedding(vocab_size, embedding_size, weights = [embedding_matrix], input_length = max_len)(input1) # 임베딩 가중치 적용
  dropout1 = keras.layers.SpatialDropout1D(0.5)(embedding)
  lstm1 = keras.layers.LSTM(32, return_sequences = True)(dropout1)
  lstm2 = keras.layers.LSTM(32)(lstm1)
  dropout2 = keras.layers.Dropout(0.5)(lstm2)
  #lstm_output = keras.layers.Dense(16, activation = "selu")(dropout2)

  # MLP
  concat = keras.layers.concatenate([dropout2,input2])
  hidden = keras.layers.Dense(32, activation = "selu")(concat)
  output = keras.layers.Dense(1, activation = "sigmoid")(hidden)

  model = keras.Model(inputs = [input1, input2], outputs = [output])

  model.compile(optimizer=keras.optimizers.Adam(lr=learning_rate), loss="binary_crossentropy", metrics = "accuracy")
  model.summary()
  return model


In [62]:
# 훈련 시
tf.random.set_seed(42)


checkpoint_cb = keras.callbacks.ModelCheckpoint("hyerim_add_feature_best_model2.h5",
                                               save_best_only = True)

# 하이퍼파라미터
max_epoch = 3
batch_size = 32
learning_rate = 0.001

model = LSTM_add_feature(vocab_size, max_len = max_len, embedding_size = embedding_size)
history = model.fit(x=[X_train[:,:max_len],X_train[:,-feature_num:]], y=y_train, epochs=max_epoch,
                batch_size = batch_size,  validation_data = ((X_valid[:,:max_len],X_valid[:,-feature_num:]),y_valid), validation_batch_size = batch_size,
                 callbacks = [checkpoint_cb])

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 60)]         0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 60, 300)      10038300    input_11[0][0]                   
__________________________________________________________________________________________________
spatial_dropout1d_5 (SpatialDro (None, 60, 300)      0           embedding_5[0][0]                
__________________________________________________________________________________________________
lstm_10 (LSTM)                  (None, 60, 32)       42624       spatial_dropout1d_5[0][0]        
____________________________________________________________________________________________

In [None]:
best_model = keras.models.load_model("hyerim_add_feature_best_model2.h5") # 저장된 모델 불러오기
plt.plot(pd.DataFrame(history.history))

In [None]:
# 성능
best_model.evaluate((X_valid[:,:-feature_num],X_valid[:,-feature_num:]),y_valid)



# 최종 훈련

In [40]:
# 실제 예측시
index_seq = (train_X.argmax(axis=1)).argsort(axis=0)
train_X = train_X[index_seq]
train_y = train_y.iloc[index_seq]


tf.random.set_seed(2020)

# 하이퍼파라미터
max_epoch = 2
batch_size = 32
learning_rate = 0.001

model = LSTM_add_feature(vocab_size, max_len = max_len, embedding_size = embedding_size)
history = model.fit(x=[train_X[:,:-feature_num],train_X[:,-feature_num:]], y=train_y,epochs=max_epoch,batch_size = batch_size)

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 60)]         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 60, 300)      10038300    input_7[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_3 (SpatialDro (None, 60, 300)      0           embedding_3[0][0]                
__________________________________________________________________________________________________
lstm_6 (LSTM)                   (None, 60, 32)       42624       spatial_dropout1d_3[0][0]        
____________________________________________________________________________________________

In [41]:
model.save("final_model_99.04.h5")

# 전문장, 전전문장의 예측결과값을 이용하여 예측값 보정

In [63]:
final_model = keras.models.load_model("final_model_99.04.h5")

In [43]:
train_predicted = final_model.predict((train_X[:,:max_len],train_X[:,-feature_num:]))

# 각 문장의 바로 앞문장과, 그 앞 문장의 예측값 생성
train_pre_predicted1 = np.array([train_predicted[idx-1][0] for idx in range(len(train_predicted))]).reshape(-1,1) # 앞문장
train_pre_predicted2 = np.array([train_predicted[idx-2][0] for idx in range(len(train_predicted))]).reshape(-1,1) # 앞앞문장
train_pre_predicted3 = np.array([train_predicted[idx-3][0] for idx in range(len(train_predicted))]).reshape(-1,1) # 앞앞앞문장

In [386]:
# ord가 1이면 과거 1년도, 2년도 자기값으로 대체
# ord가 2이면 과거 2년도 자기값으로 대체

train["predicted"]=train_predicted
train["pre_predicted1"]=train_pre_predicted1
train["pre_predicted2"]=train_pre_predicted2

train.loc[train["ord"]==1,"pre_predicted1"]=train_predicted[train["ord"]==1]
train.loc[train["ord"]==1,"pre_predicted2"]=train_predicted[train["ord"]==1]
train.loc[train["ord"]==2,"pre_predicted2"]=train_predicted[train["ord"]==2]

In [269]:
train[["predicted","pre_predicted1","pre_predicted2","info"]].to_csv("new_train2.csv",index=False)

In [387]:
# tree 모델 불러오기
import pickle
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# xgb model 예측
xgb_model=XGBClassifier(random_state=42)
lgbm_model=LGBMClassifier(random_state=42)

with open(path+'model/xgb_model.pkl', 'wb') as f:
  pickle.dump(xgb_model, f)
with open(path+'model/lgbm_model.pkl','wb') as f:
  pickle.dump(lgbm_model,f)

xgb_model.fit(train[["predicted","pre_predicted1","pre_predicted2"]], y=train["info"])
lgbm_model.fit(train[["predicted","pre_predicted1","pre_predicted2"]], y=train["info"])

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
def Lag_MLP():
  model = keras.models.Sequential([
    keras.layers.Dense(16, input_shape = (3,)),
    keras.layers.Dense(1)                
  ])
  model.compile(optimizer=keras.optimizers.Adam(lr=learning_rate), loss="binary_crossentropy", metrics = "accuracy")
  model.summary()
  return model

In [None]:
# 시퀀셜 LSTM 모델

def Lag_RNN():
    model = keras.models.Sequential()
    model.add(keras.layers.Input(shape=(3,1)))
    model.add(keras.layers.SimpleRNN(32))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics='accuracy')
    model.summary()
    return model

In [None]:
# 하이퍼파라미터
tf.random.set_seed(42)

max_epoch = 5
batch_size = 32
learning_rate = 0.001

lag_model = Lag_RNN()
lag_model.fit(x=train[:82858][["predicted","pre_predicted1","pre_predicted2"]].values.reshape(-1,3,1), y=train[:82858]["info"],
              validation_data = (train[82858:][["predicted","pre_predicted1","pre_predicted2"]].values.reshape(-1,3,1),train[82858:]["info"]), batch_size = 16,
              epochs = max_epoch)

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_7 (SimpleRNN)     (None, 32)                1088      
_________________________________________________________________
dense_31 (Dense)             (None, 1)                 33        
Total params: 1,121
Trainable params: 1,121
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f39e0bf3198>

In [None]:
# 하이퍼파라미터
tf.random.set_seed(42)

max_epoch = 5
batch_size = 32
learning_rate = 0.0001

lag_model = Lag_MLP()
lag_model.fit(x=train[:82858][["predicted","pre_predicted1","pre_predicted2"]], y=train[:82858]["info"],
              validation_data = (train[82858:][["predicted","pre_predicted1","pre_predicted2"]],train[82858:]["info"]), batch_size = batch_size,
              epochs = max_epoch, validation_batch_size = batch_size)

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_28 (Dense)             (None, 16)                64        
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 17        
Total params: 81
Trainable params: 81
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f39f2c0b0b8>

# test 예측

In [388]:
path = "/gdrive/My Drive/dacon_news/"
feature_num = 4
test = pd.read_csv(path+"data/news_test.csv")

# EDA 기반 feature 추가
test["info1_title"]=test["title"].apply(title_choose)
test["info1_content"]=test["content"].apply(content_choose)
test["content_startswith_["]=test.content.apply(lambda x : str(x).startswith("[" ) or str(x).startswith("(") or str(x).startswith("제목"))+0
title_group = (test.groupby(["title"]).count())["n_id"]
test["new_ord"]=test.apply(lambda x: x["ord"]/title_group[x["title"]], axis=1)
#test["info1_words"]=test["content"].apply(in_bull)

# 전처리
test['new_article'], okt = text_preprocessing(test['content'])
test_X_seq = vectorizer.texts_to_sequences(test["new_article"])
test_X = pad_sequences(test_X_seq, maxlen = max_len) # 길이를 맞춰줌

# concat
test_X = np.concatenate([test_X,test[["info1_title","info1_content","new_ord","content_startswith_["]].values.reshape(-1,feature_num)], axis=1)

In [389]:
#final_model = keras.models.load_model(path+"model/final_model_99_epoch2.h5") # 저장된 모델 불러오기
final_model = keras.models.load_model("final_model_99.04.h5")
predicted = final_model.predict([test_X[:,:max_len],test_X[:,-feature_num:]])

In [390]:
# idx-1의 예측값
pre_predicted1 = np.array([predicted[idx-1][0] for idx in range(len(predicted))]).reshape(-1,1)
pre_predicted2 = np.array([predicted[idx-2][0] for idx in range(len(predicted))]).reshape(-1,1)

test["predicted"]=predicted
test["pre_predicted1"]=pre_predicted1
test["pre_predicted2"]=pre_predicted2

# ord가 1이면 과거 1년도, 2년도 자기값으로 대체
# ord가 2이면 과거 2년도 자기값으로 대체
test.loc[test["ord"]==1,"pre_predicted1"]=predicted[test["ord"]==1]
test.loc[test["ord"]==1,"pre_predicted2"]=predicted[test["ord"]==1]
test.loc[test["ord"]==2,"pre_predicted2"]=predicted[test["ord"]==2]

In [391]:
xgb_final_predicted = xgb_model.predict(test[["predicted","pre_predicted1","pre_predicted2"]])
lgbm_final_predicted = lgbm_model.predict(test[["predicted","pre_predicted1","pre_predicted2"]])

In [396]:
test["info"]=lgbm_final_predicted
#test["info"][test["content"].apply(lambda x: True if "무단전재 & 재배포 금지" in x else False)]=0
test["info"][test["content"].apply(lambda x: True if ('http://etoday.bujane.co.kr/' in x) or ('http://bit.ly/2XrAuGJ_itoozanews' in x) or ('http://www.hisl.co.kr/0306/' in x) or ('https://www.hankyung.com/election2020/' in x) or (x==']]') else False)]=1
#test["info"][test["predicted"]>=0.6]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# Submission

In [405]:
submission = pd.read_csv(path+"/data/sample_submission.csv")
submission["info"]=predicted
submission["info"]=(predicted>=0.6)+0
submission.to_csv("submission.csv", encoding="utf-8-sig",index=False)

In [286]:
submission["info"]=(predicted>=0.6)+0

In [288]:
submission.to_csv("xgb_drop0.5_submission.csv", encoding="utf-8-sig",index=False)

In [394]:
submission

Unnamed: 0,id,info
0,NEWS00237_1,0
1,NEWS00237_2,0
2,NEWS00237_3,0
3,NEWS00237_4,0
4,NEWS00237_5,0
...,...,...
142560,NEWS09482_72,1
142561,NEWS09482_73,1
142562,NEWS09482_74,1
142563,NEWS09482_75,1


# XGBoost와 LGBM의 예측결과가 다른 것 추출 후 살펴보기

In [None]:
wrong = test[(xgb_final_predicted)!=(lgbm_final_predicted)]

In [None]:
wrong["xgb_predicted"]=xgb_final_predicted[xgb_final_predicted!=lgbm_final_predicted]
wrong["lgbm_predicte"]=lgbm_final_predicted[xgb_final_predicted!=lgbm_final_predicted]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
wrong_train = train[(xgb_train_p)!=(lgbm_train_p)]

wrong_train["xgb_predicted"]=xgb_train_p[xgb_train_p!=lgbm_train_p]
wrong_train["lgbm_predicted"]=lgbm_train_p[xgb_train_p!=lgbm_train_p]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
