In [76]:
import pandas as pd
from konlpy.tag import Okt
from tqdm import tqdm
from collections import Counter
import numpy as np

In [4]:
train      = pd.read_csv("dataset/dacon_news/train_data.csv")
test       = pd.read_csv("dataset/dacon_news/test_data.csv")
submission = pd.read_csv("dataset/dacon_news/sample_submission.csv")
topic_dict = pd.read_csv("dataset/dacon_news/topic_dict.csv")

In [51]:
train

Unnamed: 0,index,title,topic_idx
0,0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4
1,1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4
2,2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4
3,3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4
4,4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4
...,...,...,...
45649,45649,KB금융 미국 IB 스티펠과 제휴…선진국 시장 공략,1
45650,45650,1보 서울시교육청 신종코로나 확산에 개학 연기·휴업 검토,2
45651,45651,게시판 키움증권 2020 키움 영웅전 실전투자대회,1
45652,45652,답변하는 배기동 국립중앙박물관장,2


In [5]:
# 형태소 분석기(Okt) 불러오기 
okt=Okt() 

# 형태소 분석: 명사만 추출하기

In [8]:
nouns_list = []
for i in tqdm(range(len(train))):
    nouns_list.append(okt.nouns(train['title'].loc[i]))

In [10]:
total_word = []
for i in range(len(nouns_list)):
    for word in nouns_list[i]:
        total_word.append(word)

In [13]:
Counter(total_word).most_common(30)

[('종합', 4033),
 ('대통령', 1723),
 ('한국', 1456),
 ('명', 1416),
 ('위', 1204),
 ('첫', 999),
 ('삼성', 904),
 ('전', 892),
 ('등', 892),
 ('보', 849),
 ('이란', 803),
 ('감독', 778),
 ('출시', 778),
 ('경기', 758),
 ('게시판', 735),
 ('트럼프', 702),
 ('신간', 686),
 ('것', 685),
 ('중', 618),
 ('정부', 603),
 ('투자', 599),
 ('개발', 594),
 ('개', 586),
 ('서울', 573),
 ('지원', 557),
 ('제', 540),
 ('더', 521),
 ('최고', 517),
 ('주', 512),
 ('중국', 511)]

# 0번 토픽 문서에서 가장 많이 나온 명사 추출하기

In [24]:
topic0 = train[train['topic_idx'] == 0].index
topipc0_total_word = []
for i in topic0:
    for word in nouns_list[i]:
        topipc0_total_word.append(word)

In [25]:
Counter(topipc0_total_word).most_common(10)

[('출시', 515),
 ('개발', 421),
 ('삼성', 347),
 ('기술', 330),
 ('종합', 324),
 ('갤럭시', 279),
 ('네이버', 223),
 ('서비스', 216),
 ('전자', 192),
 ('스마트폰', 188)]

# 토큰나이져

In [37]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(nouns_list)

In [1]:
# tokenizer.word_index

In [2]:
# print(tokenizer.word_counts)

In [40]:
# 전체 단어 개수
print(len(tokenizer.word_counts))

24342


In [41]:
threshold = 3
rare_cnt = 0
for i,j in tokenizer.word_counts.items():
    if j < threshold:
        rare_cnt +=1 

print(rare_cnt)

12218


In [42]:
print(len(tokenizer.word_counts)-rare_cnt)

12124


In [43]:
vocab_size = len(tokenizer.word_counts)-rare_cnt
tokenizer = Tokenizer(vocab_size)
tokenizer.fit_on_texts(nouns_list)
X_train = tokenizer.texts_to_sequences(nouns_list)

In [3]:
# X_train

In [45]:
len_list = []
for i in range(len(X_train)):
    len_list.append(len(X_train[i]))

In [47]:
np.mean(len_list)

6.952293336837955

In [48]:
np.std(len_list)

1.9761805778895212

In [49]:
np.max(len_list)

15

In [50]:
# 패딩 -> 데이터 개수 맞추기
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train = pad_sequences(X_train, maxlen=15)

In [62]:
# 분류할 클래스가 3개 이상이고, 순서로 이루어진 데이터가 아닐때
from keras.utils import np_utils
y_train = np_utils.to_categorical(train['topic_idx'])

# 모델링

In [63]:
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

embedding_dim = 10
hidden_units = 64

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(LSTM(hidden_units))
model.add(Dense(7, activation='softmax'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
mc = ModelCheckpoint('dacon_best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

history = model.fit(X_train, y_train, epochs=2000, callbacks=[es, mc], batch_size=64, validation_split=0.2)

Epoch 1/2000

Epoch 00001: val_acc improved from -inf to 0.63553, saving model to dacon_best_model.h5
Epoch 2/2000

Epoch 00002: val_acc improved from 0.63553 to 0.64889, saving model to dacon_best_model.h5
Epoch 3/2000

Epoch 00003: val_acc improved from 0.64889 to 0.72270, saving model to dacon_best_model.h5
Epoch 4/2000

Epoch 00004: val_acc did not improve from 0.72270
Epoch 5/2000

Epoch 00005: val_acc did not improve from 0.72270
Epoch 6/2000

Epoch 00006: val_acc did not improve from 0.72270
Epoch 7/2000

Epoch 00007: val_acc did not improve from 0.72270
Epoch 8/2000

Epoch 00008: val_acc did not improve from 0.72270
Epoch 9/2000

Epoch 00009: val_acc did not improve from 0.72270
Epoch 10/2000

Epoch 00010: val_acc did not improve from 0.72270
Epoch 11/2000

Epoch 00011: val_acc did not improve from 0.72270
Epoch 12/2000

Epoch 00012: val_acc did not improve from 0.72270
Epoch 13/2000

Epoch 00013: val_acc did not improve from 0.72270
Epoch 00013: early stopping


In [64]:
# 모델 불러오기
loaded_model = load_model('dacon_best_model.h5')

# 테스트 데이터셋 전처리 훈련과 똑같이 적용

In [65]:
nouns_list = []
for i in tqdm(range(len(test))):
    nouns_list.append(okt.nouns(test['title'].loc[i]))
X_test = tokenizer.texts_to_sequences(nouns_list)
X_test = pad_sequences(X_test, maxlen=15)

100%|█████████████████████████████████████████████████████████████████████████████| 9131/9131 [00:19<00:00, 475.47it/s]


In [68]:
tmp_pred = loaded_model.predict(X_test)
pred = np.argmax(tmp_pred, axis = 1)

In [71]:
test['Answer'] = pred

In [72]:
test

Unnamed: 0,index,title,Answer
0,45654,유튜브 내달 2일까지 크리에이터 지원 공간 운영,2
1,45655,어버이날 맑다가 흐려져…남부지방 옅은 황사,3
2,45656,내년부터 국가RD 평가 때 논문건수는 반영 않는다,2
3,45657,김명자 신임 과총 회장 원로와 젊은 과학자 지혜 모을 것,2
4,45658,회색인간 작가 김동식 양심고백 등 새 소설집 2권 출간,3
...,...,...,...
9126,54780,인천 오후 3시35분 대설주의보…눈 3.1cm 쌓여,3
9127,54781,노래방에서 지인 성추행 외교부 사무관 불구속 입건종합,2
9128,54782,40년 전 부마항쟁 부산 시위 사진 2점 최초 공개,5
9129,54783,게시판 아리랑TV 아프리카개발은행 총회 개회식 생중계,0


In [73]:
submission

Unnamed: 0,index,topic_idx
0,45654,0
1,45655,0
2,45656,0
3,45657,0
4,45658,0
...,...,...
9126,54780,0
9127,54781,0
9128,54782,0
9129,54783,0


In [74]:
submission.topic_idx = pred
submission

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,2
4,45658,3
...,...,...
9126,54780,3
9127,54781,2
9128,54782,5
9129,54783,0


In [75]:
# 저장하고 제출
submission.to_csv("dataset/dacon_news/submission01.csv", index = False)

# 2. 모델 발전시키기.

# 전체 문서에서 가장 많이 나온 조사/구두점/숫자를 제외한 단어 추출하기

In [77]:
okt.pos(train['title'].loc[0])

[('인천', 'Noun'),
 ('→', 'Foreign'),
 ('핀란드', 'Noun'),
 ('항공기', 'Noun'),
 ('결항', 'Noun'),
 ('…', 'Punctuation'),
 ('휴가', 'Noun'),
 ('철', 'Noun'),
 ('여행객', 'Noun'),
 ('분통', 'Noun')]

In [83]:
word_list = []
for i in tqdm(range(len(train))):
    temp_list = []
    for word in okt.pos(train['title'].loc[i]):
        if word[1] not in ['Punctuation','Number','Josa']:
            temp_list.append(word[0])
    word_list.append(temp_list) 

100%|███████████████████████████████████████████████████████████████████████████| 45654/45654 [01:42<00:00, 445.11it/s]


In [85]:
total_word = []
for i in range(len(word_list)):
    for word in word_list[i]:
        total_word.append(word)

In [86]:
Counter(total_word).most_common(10)

[('종합', 4050),
 ('명', 1792),
 ('대통령', 1723),
 ('美', 1478),
 ('한국', 1456),
 ('대', 1391),
 ('들', 1382),
 ('北', 1310),
 ('전', 1245),
 ('위', 1204)]

In [95]:
df_topic_word = pd.DataFrame()
for i in range(7):
    topic0 = train[train['topic_idx'] == i].index
    topipc0_total_word = []
    for i in topic0:
        for word in word_list[i]:
            topipc0_total_word.append(word)
    df_topic_word[str(i)] = Counter(topipc0_total_word).most_common(20)
df_topic_word.columns = topic_dict['topic']
df_topic_word

topic,IT과학,경제,사회,생활문화,세계,스포츠,정치
0,"(G, 553)","(종합, 654)","(종합, 548)","(신간, 684)","(美, 1273)","(감독, 718)","(대통령, 1408)"
1,"(출시, 515)","(기, 598)","(명, 500)","(축제, 346)","(종합, 994)","(경기, 527)","(北, 1116)"
2,"(개발, 421)","(투자, 486)","(코로나, 372)","(여행, 314)","(명, 796)","(전, 446)","(종합, 1065)"
3,"(LG, 386)","(영업, 444)","(대, 315)","(도, 286)","(이란, 744)","(류현진, 381)","(朴, 644)"
4,"(KT, 358)","(증권, 416)","(들, 300)","(들, 241)","(트럼프, 647)","(연승, 348)","(당, 597)"
5,"(삼성, 347)","(주, 397)","(게시판, 291)","(날씨, 235)","(中, 619)","(위, 341)","(문, 530)"
6,"(AI, 345)","(금융, 343)","(하는, 274)","(한국, 234)","(터키, 352)","(월드컵, 340)","(하는, 400)"
7,"(기술, 330)","(삼성, 325)","(한국, 256)","(주말, 234)","(日, 328)","(첫, 331)","(김정은, 364)"
8,"(종합, 325)","(코스피, 317)","(지원, 232)","(서울, 217)","(사망, 322)","(시즌, 321)","(靑, 330)"
9,"(S, 301)","(대, 314)","(장, 221)","(비, 200)","(중국, 296)","(축구, 307)","(민주, 322)"


In [106]:
stopwords = ['→','~','↑','∼','중','대','명','기','장','등','들']

In [107]:
for i in range(len(word_list)):
    for word in word_list[i]:
        if word in stopwords:
            word_list[i].remove(word)

In [108]:
df_topic_word = pd.DataFrame()
for i in range(7):
    topic0 = train[train['topic_idx'] == i].index
    topipc0_total_word = []
    for i in topic0:
        for word in word_list[i]:
            topipc0_total_word.append(word)
    df_topic_word[str(i)] = Counter(topipc0_total_word).most_common(20)
df_topic_word.columns = topic_dict['topic']
df_topic_word

topic,IT과학,경제,사회,생활문화,세계,스포츠,정치
0,"(G, 553)","(종합, 654)","(종합, 548)","(신간, 684)","(美, 1273)","(감독, 718)","(대통령, 1408)"
1,"(출시, 515)","(투자, 486)","(코로나, 372)","(축제, 346)","(종합, 994)","(경기, 527)","(北, 1116)"
2,"(개발, 421)","(영업, 444)","(게시판, 291)","(여행, 314)","(이란, 744)","(전, 446)","(종합, 1065)"
3,"(LG, 386)","(증권, 416)","(하는, 274)","(도, 286)","(트럼프, 647)","(류현진, 381)","(朴, 644)"
4,"(KT, 358)","(주, 397)","(한국, 256)","(날씨, 235)","(中, 619)","(연승, 348)","(당, 597)"
5,"(삼성, 347)","(금융, 343)","(지원, 232)","(한국, 234)","(터키, 352)","(위, 341)","(문, 530)"
6,"(AI, 345)","(삼성, 325)","(제, 188)","(주말, 234)","(日, 328)","(월드컵, 340)","(하는, 400)"
7,"(기술, 330)","(코스피, 317)","(노조, 168)","(서울, 217)","(사망, 322)","(첫, 331)","(김정은, 364)"
8,"(종합, 325)","(작년, 305)","(전, 166)","(비, 200)","(중국, 296)","(시즌, 321)","(靑, 330)"
9,"(S, 301)","(익, 290)","(기자, 164)","(전국, 194)","(시위, 295)","(축구, 307)","(민주, 322)"


In [109]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(word_list)

In [110]:
threshold = 3
rare_cnt = 0
for i,j in tokenizer.word_counts.items():
    if j < threshold:
        rare_cnt +=1 

print(rare_cnt)

18220


In [111]:
vocab_size = len(tokenizer.word_counts)-rare_cnt
tokenizer = Tokenizer(vocab_size)
tokenizer.fit_on_texts(word_list)
X_train = tokenizer.texts_to_sequences(word_list)

In [112]:
len_list = []
for i in range(len(X_train)):
    len_list.append(len(X_train[i]))

In [114]:
np.mean(len_list)

8.303346913742498

In [115]:
np.std(len_list)

2.1711394498112213

In [116]:
np.max(len_list)

17

In [117]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train = pad_sequences(X_train, maxlen=17)

In [118]:
from keras.utils import np_utils
y_train = np_utils.to_categorical(train['topic_idx'])

In [119]:
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

embedding_dim = 100

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Bidirectional(LSTM(units = 64, return_sequences = True))),
model.add(Bidirectional(LSTM(units = 128, return_sequences = True))),
model.add(Bidirectional(LSTM(units = 64))),
model.add(Dense(7, activation='softmax'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
mc = ModelCheckpoint('dacon_best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

history = model.fit(X_train, y_train, epochs=2000, callbacks=[es, mc], batch_size=64, validation_split=0.2)

Epoch 1/2000

Epoch 00001: val_acc improved from -inf to 0.73475, saving model to dacon_best_model.h5
Epoch 2/2000

Epoch 00002: val_acc improved from 0.73475 to 0.77188, saving model to dacon_best_model.h5
Epoch 3/2000

Epoch 00003: val_acc did not improve from 0.77188
Epoch 4/2000

Epoch 00004: val_acc did not improve from 0.77188
Epoch 5/2000

Epoch 00005: val_acc did not improve from 0.77188
Epoch 6/2000

Epoch 00006: val_acc did not improve from 0.77188
Epoch 7/2000

Epoch 00007: val_acc did not improve from 0.77188
Epoch 8/2000

Epoch 00008: val_acc did not improve from 0.77188
Epoch 9/2000

Epoch 00009: val_acc did not improve from 0.77188
Epoch 10/2000

Epoch 00010: val_acc did not improve from 0.77188
Epoch 11/2000

Epoch 00011: val_acc did not improve from 0.77188
Epoch 12/2000

Epoch 00012: val_acc did not improve from 0.77188
Epoch 00012: early stopping


In [120]:
loaded_model = load_model('dacon_best_model.h5')

In [122]:
word_list = []
for i in tqdm(range(len(test))):
    temp_list = []
    for word in okt.pos(test['title'].loc[i]):
        if word[1] not in ['Punctuation','Number','Josa']:
            temp_list.append(word[0])
    word_list.append(temp_list) 
    
for i in range(len(word_list)):
    for word in word_list[i]:
        if word in stopwords:
            word_list[i].remove(word)
            
X_test = tokenizer.texts_to_sequences(word_list)
X_test = pad_sequences(X_test, maxlen=17)

100%|█████████████████████████████████████████████████████████████████████████████| 9131/9131 [00:18<00:00, 501.21it/s]


In [123]:
tmp_pred = loaded_model.predict(X_test)
pred = np.argmax(tmp_pred, axis = 1)

In [124]:
submission.topic_idx = pred
submission

Unnamed: 0,index,topic_idx
0,45654,1
1,45655,3
2,45656,2
3,45657,0
4,45658,3
...,...,...
9126,54780,3
9127,54781,2
9128,54782,0
9129,54783,2


In [125]:
submission.to_csv("dataset/dacon_news/submission02.csv", index = False)