#**모델 앙상블**

In [None]:
# konlpy Mecab 사용하기

!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x

In [2]:
# 내 드라이브에 대한 주소
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
# 내 드라이브에 대한 주소
path = "/gdrive//My Drive/dacon_news/"

train = pd.read_csv(path + "data/news_train.csv")
#try_train = pd.read_csv("try_title_data.csv")
test = pd.read_csv(path + "data/news_test.csv")

# content 맨앞이 [이거나 (이면 0

In [7]:
train["content_startswith_["]=train.content.apply(lambda x : str(x).startswith("[" ) or str(x).startswith("(") or str(x).startswith("제목"))+0

# 타이틀을 이용한 feature
- 해당 title에 몇가지 단어가 들어갈 경우 약 90% 이상이 info가 1

In [13]:
title_noise = ['적중 100%', '글로벌 주요 뉴스', '[전문가 의견]', 
               '[포커스]', '※','■', '▶' ,'◆','★',' TOP', 'BEST',
'전문가의 눈', '전문가선정', '전문가의견','】','후속주도 감사합니다',
               '전문가추천', '주요이슈']

def title_choose(x):
  if ("종목" in x[-6:]) or ("관련주" in x[-5:]):
    return 1
  for noise in title_noise:
    if noise in x.upper():
        return 1
  return 0

In [9]:
train["info1_title"]=train['title'].apply(title_choose)

# content를 이용한 feature

In [10]:
content_noise = ['00%', '긴급공개', '긴급 공개','임상3상', 'TOP','BEST','# ','대장株','대장주','카톡','원"만']

def content_choose(x):
  if (x.count("..")>=2) or (x.count("+")>=2) or (x.count("~")>=2) :
    return 1
  elif (x=='관련기사') or (x=="관련 테마분석") or (x=="코스피") or (x=="코스닥"):
    return 1
  for noise in content_noise:
    if noise in x.upper():
        return 1
  return 0

In [11]:
train["info1_content"]=train["content"].apply(content_choose)

# Order을 이용한 feature

In [12]:
title_group = (train.groupby(["title"]).count())["n_id"]
train["new_ord"]=train.apply(lambda x: x["ord"]/title_group[x["title"]], axis=1)

# tokenization

In [14]:
from konlpy.tag import Mecab
import re
from konlpy.tag import Okt

def text_preprocessing(text_list):
    
    stopwords = []
    tokenizer = Mecab() #형태소 분석기

    token_list = [] 
    
    for text in text_list:
        txt = re.sub("[a-zA-Z0-9]", ' ', text) #영문, 숫자 제거
        txt = re.sub('[가-힣\s]+기자]','기자', txt) #기자 이름 제거
        token = tokenizer.morphs(txt) #형태소 분석

        token = [t for t in token] 
        token_list.append(token)
        
    return token_list, tokenizer

#형태소 분석기를 따로 저장한 이유는 후에 test 데이터 전처리를 진행할 때 이용해야 되기 때문입니다.
train['new_article'], okt = text_preprocessing(train['content'])

In [15]:
# 결측치 제거
train = train[train["new_article"].apply(lambda x: False if len(x)==0 else True)]

# Vectorization

In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len = 60

def text2sequence(train_text, max_len=100):
    
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_text)
    train_X_seq = tokenizer.texts_to_sequences(train_text)
    vocab_size = len(tokenizer.word_index) + 1
    print('vocab_size : ', vocab_size)
    post_X_train = pad_sequences(train_X_seq, maxlen = max_len, truncating="post") # 길이를 맞춰줌
    pre_X_train = pad_sequences(train_X_seq, maxlen = max_len, truncating="pre")
    return pre_X_train, post_X_train, vocab_size, tokenizer

train_y = train['info']
pre_train_X, post_train_X, vocab_size, vectorizer = text2sequence(train['new_article'], max_len = max_len)

print(pre_train_X.shape, post_train_X.shape, vocab_size, train_y.shape)

vocab_size :  33798
(118676, 60) (118676, 60) 33798 (118676,)


# Word2Vec

In [20]:
import gensim
from gensim.models.keyedvectors import KeyedVectors

In [22]:
# 직접 만든 word2vec model
#word2vec_customize = gensim.models.Word2Vec.load("embedding_300_10_30_final.model")
word2vec_customize = gensim.models.Word2Vec.load(path+"embedding/embedding_300_10_30_final.model")
embedding_size = 300

In [None]:
embedding_matrix = np.zeros((vocab_size, embedding_size))
vocab = vectorizer.word_index
count = 0

for idx, word in enumerate(vocab):
    if word in word2vec_customize:
      embedding_vector = word2vec_customize[word]
      embedding_matrix[idx] = embedding_vector
    else: # 임베딩 모델에 없는 것
      #print(word, "word2vec에 없는 단어입니다.")
      count += 1
      pass

print('count: ', count)  

count:  25471


  
  import sys


In [27]:
#concat
feature_num = 4
pre_train_X = np.concatenate([pre_train_X,train[["info1_title","info1_content","new_ord","content_startswith_["]].values.reshape(-1,feature_num)], axis=1)
post_train_X = np.concatenate([post_train_X,train[["info1_title","info1_content","new_ord","content_startswith_["]].values.reshape(-1,feature_num)], axis=1)

In [None]:
index_seq = (X_train.argmax(axis=1)).argsort(axis=0)
X_train = X_train[index_seq]
y_train = y_train.iloc[index_seq]

# 각 모델 예측

In [24]:
# 각 모델 불러오기
pre_model = keras.models.load_model(path+"model/final_model_99.3_front.h5")
post_model = keras.models.load_model(path+"model/final_model_99.3.h5")

In [33]:
# 예측
pre_predicted = pre_model.predict([pre_train_X[:,:max_len], pre_train_X[:,-feature_num:]])
post_predicted = post_model.predict([post_train_X[:,:max_len], post_train_X[:,-feature_num:]])

In [40]:
train["pre_predicted"]=pre_predicted
train["post_predicted"]=post_predicted

# 예측결과 비교

In [46]:
train_XX = train[["pre_predicted","post_predicted"]]
train_y = train["info"]

#train test split

In [48]:
# 문장별로 train_test set 분리
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(train_XX, train_y, test_size = 0.3, random_state = 42)

In [54]:
import pickle
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Ensemble

In [107]:
lgbm_model=LGBMClassifier(random_state=42, max_depth = -1)
lgbm_model.fit(train_XX, train_y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [106]:
accuracy_score(y_valid,lgbm_model.predict(X_valid))

0.9993539870235655

# test 예측

In [110]:
path = "/gdrive/My Drive/dacon_news/"
feature_num = 4
test = pd.read_csv(path+"data/news_test.csv")

# EDA 기반 feature 추가
test["info1_title"]=test["title"].apply(title_choose)
test["info1_content"]=test["content"].apply(content_choose)
test["content_startswith_["]=test.content.apply(lambda x : str(x).startswith("[" ) or str(x).startswith("(") or str(x).startswith("제목"))+0
title_group = (test.groupby(["title"]).count())["n_id"]
test["new_ord"]=test.apply(lambda x: x["ord"]/title_group[x["title"]], axis=1)
#test["info1_words"]=test["content"].apply(in_bull)

# 전처리
test['new_article'], okt = text_preprocessing(test['content'])
test_X_seq = vectorizer.texts_to_sequences(test["new_article"])
pre_test_X = pad_sequences(test_X_seq, maxlen = max_len, truncating="pre") # 앞부분
post_test_X = pad_sequences(test_X_seq, maxlen = max_len, truncating="post") # 뒷부분

# concat
pre_test_X = np.concatenate([pre_test_X,test[["info1_title","info1_content","new_ord","content_startswith_["]].values.reshape(-1,feature_num)], axis=1)
post_test_X = np.concatenate([post_test_X,test[["info1_title","info1_content","new_ord","content_startswith_["]].values.reshape(-1,feature_num)], axis=1)

In [111]:
# 각 최종 모델 불러오기
pre_final_model = keras.models.load_model(path+"model/final_model_99.3_front.h5")
post_final_model = keras.models.load_model(path+"model/final_model_99.3.h5")

pre_predicted = pre_final_model.predict([pre_test_X[:,:max_len],pre_test_X[:,-feature_num:]])
post_predicted = post_final_model.predict([post_test_X[:,:max_len],post_test_X[:,-feature_num:]])

In [113]:
final_predicted = lgbm_model.predict(np.concatenate([pre_predicted,post_predicted], axis=1))
test["info"]=final_predicted

In [120]:
test["info"][test["content"].apply(lambda x: True if ('http://etoday.bujane.co.kr/' in x) or ('http://bit.ly/2XrAuGJ_itoozanews' in x) or ('http://www.hisl.co.kr/0306/' in x) or ('https://www.hankyung.com/election2020/' in x) or (x==']]') else False)]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


# Submission

In [121]:
submission = pd.read_csv(path+"/data/sample_submission.csv")
submission["info"]=test['info']
submission.to_csv("submission.csv", encoding="utf-8-sig",index=False)

In [122]:
submission

Unnamed: 0,id,info
0,NEWS00237_1,0
1,NEWS00237_2,0
2,NEWS00237_3,0
3,NEWS00237_4,0
4,NEWS00237_5,0
...,...,...
142560,NEWS09482_72,1
142561,NEWS09482_73,1
142562,NEWS09482_74,1
142563,NEWS09482_75,1
