In [22]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim import corpora

In [2]:
# Read the four competition CSV files from dataset/dacon_news, in the same
# order as the names on the left-hand side of the assignment.
csv_paths = (
    "dataset/dacon_news/train_data.csv",
    "dataset/dacon_news/test_data.csv",
    "dataset/dacon_news/sample_submission.csv",
    "dataset/dacon_news/topic_dict.csv",
)
train, test, submission, topic_dict = [pd.read_csv(path) for path in csv_paths]

In [3]:
# Display the training frame (columns: index, title, topic_idx).
train

Unnamed: 0,index,title,topic_idx
0,0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4
1,1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4
2,2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4
3,3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4
4,4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4
...,...,...,...
45649,45649,KB금융 미국 IB 스티펠과 제휴…선진국 시장 공략,1
45650,45650,1보 서울시교육청 신종코로나 확산에 개학 연기·휴업 검토,2
45651,45651,게시판 키움증권 2020 키움 영웅전 실전투자대회,1
45652,45652,답변하는 배기동 국립중앙박물관장,2


In [4]:
# Display the lookup table that maps each topic_idx to its topic name.
topic_dict

Unnamed: 0,topic,topic_idx
0,IT과학,0
1,경제,1
2,사회,2
3,생활문화,3
4,세계,4
5,스포츠,5
6,정치,6


## Pandas - Merge

<img src="https://devskrol.com/wp-content/uploads/2020/10/mERGE.png"><br/>

In [5]:
# Two small example frames for the merge demonstrations.
# They share the IDs SP01-SP03; SP04 exists only in df1 and SP05 only in df2.
df1 = pd.DataFrame(
    dict(
        ID=['SP01', 'SP02', 'SP03', 'SP04'],
        Name=['Bob', 'Bob', 'Lisa', 'Harry'],
        Department=['Accounting', 'Engineering', 'Engineering', "HR"],
    )
)
df2 = pd.DataFrame(
    dict(
        ID=['SP01', 'SP02', 'SP03', 'SP05'],
        Name=['Bob', 'Bob', 'Lisa', "Clair"],
        hire_date=[2004, 2008, 2012, 2009],
    )
)
# Render both example frames one after the other.
display(df1, df2)

Unnamed: 0,ID,Name,Department
0,SP01,Bob,Accounting
1,SP02,Bob,Engineering
2,SP03,Lisa,Engineering
3,SP04,Harry,HR


Unnamed: 0,ID,Name,hire_date
0,SP01,Bob,2004
1,SP02,Bob,2008
2,SP03,Lisa,2012
3,SP05,Clair,2009


In [6]:
# Left join: keep every row of df1; df2 columns are NaN where the ID has no match.
# The join keys are passed as Series, so the result carries an extra key_0 column.
df1.merge(
    df2,
    how='left',
    left_on=df1['ID'],
    right_on=df2['ID'],
)

Unnamed: 0,key_0,ID_x,Name_x,Department,ID_y,Name_y,hire_date
0,SP01,SP01,Bob,Accounting,SP01,Bob,2004.0
1,SP02,SP02,Bob,Engineering,SP02,Bob,2008.0
2,SP03,SP03,Lisa,Engineering,SP03,Lisa,2012.0
3,SP04,SP04,Harry,HR,,,


In [7]:
# Right join: keep every row of df2; df1 columns are NaN where the ID has no match.
# The join keys are passed as Series, so the result carries an extra key_0 column.
df1.merge(
    df2,
    how='right',
    left_on=df1['ID'],
    right_on=df2['ID'],
)

Unnamed: 0,key_0,ID_x,Name_x,Department,ID_y,Name_y,hire_date
0,SP01,SP01,Bob,Accounting,SP01,Bob,2004
1,SP02,SP02,Bob,Engineering,SP02,Bob,2008
2,SP03,SP03,Lisa,Engineering,SP03,Lisa,2012
3,SP05,,,,SP05,Clair,2009


In [8]:
# Inner join: keep only the IDs that appear in both frames.
# The join keys are passed as Series, so the result carries an extra key_0 column.
df1.merge(
    df2,
    how='inner',
    left_on=df1['ID'],
    right_on=df2['ID'],
)

Unnamed: 0,key_0,ID_x,Name_x,Department,ID_y,Name_y,hire_date
0,SP01,SP01,Bob,Accounting,SP01,Bob,2004
1,SP02,SP02,Bob,Engineering,SP02,Bob,2008
2,SP03,SP03,Lisa,Engineering,SP03,Lisa,2012


In [9]:
# Outer join: keep the IDs of both frames, filling the missing side with NaN.
# The join keys are passed as Series, so the result carries an extra key_0 column.
df1.merge(
    df2,
    how='outer',
    left_on=df1['ID'],
    right_on=df2['ID'],
)

Unnamed: 0,key_0,ID_x,Name_x,Department,ID_y,Name_y,hire_date
0,SP01,SP01,Bob,Accounting,SP01,Bob,2004.0
1,SP02,SP02,Bob,Engineering,SP02,Bob,2008.0
2,SP03,SP03,Lisa,Engineering,SP03,Lisa,2012.0
3,SP04,SP04,Harry,HR,,,
4,SP05,,,,SP05,Clair,2009.0


In [11]:
# Attach the readable topic name to every training title.
# Joining on the shared column name (instead of passing the two Series as keys)
# avoids the throwaway key_0 / topic_idx_x / topic_idx_y columns; the three
# columns selected below are the same either way. how='left' keeps every
# training row in its original order.
pd.merge(train, topic_dict, how='left', on='topic_idx')[['index', 'title', 'topic']]

Unnamed: 0,index,title,topic
0,0,인천→핀란드 항공기 결항…휴가철 여행객 분통,세계
1,1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,세계
2,2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,세계
3,3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,세계
4,4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,세계
...,...,...,...
45649,45649,KB금융 미국 IB 스티펠과 제휴…선진국 시장 공략,경제
45650,45650,1보 서울시교육청 신종코로나 확산에 개학 연기·휴업 검토,사회
45651,45651,게시판 키움증권 2020 키움 영웅전 실전투자대회,경제
45652,45652,답변하는 배기동 국립중앙박물관장,사회


# Topic Modeling


In [12]:
from konlpy.tag import Okt
# Tagger instance; its nouns() method is used below to pull the nouns out of each title.
okt = Okt()

In [13]:
# Show the title stored under index label 0.
print(train.loc[0, 'title'])

인천→핀란드 항공기 결항…휴가철 여행객 분통


In [14]:
# Nouns extracted from the title stored under index label 0.
okt.nouns(train.loc[0, 'title'])

['인천', '핀란드', '항공기', '결항', '휴가', '철', '여행객', '분통']

In [15]:
# Extract the nouns of every title, one list per title, in row order.
# Iterating over the column values (rather than looking rows up with .loc[i]
# for i in range(len(train))) does not depend on the index being exactly
# 0..n-1, so it keeps working after filtering, sampling or re-indexing.
# tqdm reads the length from the Series, so the progress bar total is unchanged.
nouns_list = [okt.nouns(title) for title in tqdm(train['title'])]

100%|███████████████████████████████████████████████████████████████████████████| 45654/45654 [00:51<00:00, 891.80it/s]


In [18]:
# Flatten the per-title noun lists into one long list of words.
total_word = []
for title_nouns in nouns_list:
    total_word.extend(title_nouns)

In [20]:
import collections

# Count every noun across all titles and show the 20 most frequent ones.
word_counts = collections.Counter(total_word)
word_counts.most_common(20)

[('종합', 4033),
 ('대통령', 1723),
 ('한국', 1456),
 ('명', 1416),
 ('위', 1204),
 ('첫', 999),
 ('삼성', 904),
 ('전', 892),
 ('등', 892),
 ('보', 849),
 ('이란', 803),
 ('감독', 778),
 ('출시', 778),
 ('경기', 758),
 ('게시판', 735),
 ('트럼프', 702),
 ('신간', 686),
 ('것', 685),
 ('중', 618),
 ('정부', 603)]

In [23]:
# Build the gensim dictionary from the per-title noun lists; it provides the
# integer-id <-> token mapping used by doc2bow and by the LDA model below.
dictionary = corpora.Dictionary(nouns_list)
dictionary

<gensim.corpora.dictionary.Dictionary at 0x1b8816377c0>

In [72]:
# Peek at a single (id, token) entry of the dictionary; print nothing if it is empty.
first_entry = next(iter(dictionary.items()), None)
if first_entry is not None:
    print(first_entry)

(0, '결항')


In [27]:
# Convert every title's noun list to its bag-of-words form via the dictionary
# (a list of (token id, count) pairs per title).
corpus = [dictionary.doc2bow(text) for text in nouns_list]

In [40]:
# Nouns of the title at position 11, for comparison with its bag-of-words form.
nouns_list[11]

['이란', '최고', '지도자', '모욕', '혐의', '미국인', '징역', '선고']

In [41]:
# Bag-of-words (token id, count) pairs for the same title at position 11.
corpus[11]

[(18, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1)]

In [51]:
import gensim
NUM_TOPICS = 7  # 7 topics (k=7), the same count as the categories listed in topic_dict
# Train LDA on the bag-of-words corpus; random_state is fixed so runs are repeatable.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, random_state=100)

In [45]:
# Print each topic as (topic id, weighted string of its 4 top words).
topics = ldamodel.print_topics(num_words=4)
for topic_id, top_words in topics:
    print((topic_id, top_words))

(0, '0.011*"영업" + 0.010*"종합" + 0.009*"대통령" + 0.008*"작년"')
(1, '0.024*"명" + 0.015*"종합" + 0.007*"도" + 0.006*"사망"')
(2, '0.012*"교육청" + 0.010*"코로나" + 0.009*"서울" + 0.009*"명"')
(3, '0.010*"종합" + 0.009*"부산" + 0.009*"신간" + 0.009*"학생"')
(4, '0.014*"종합" + 0.008*"노동자" + 0.008*"대통령" + 0.008*"경기"')
(5, '0.010*"증권" + 0.010*"사업" + 0.009*"투자" + 0.009*"주"')
(6, '0.009*"종합" + 0.008*"교수" + 0.008*"대전" + 0.008*"대구"')


In [52]:
# Bag-of-words (token id, count) pairs of the first title.
corpus[0]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]

In [53]:
# Nouns of the second title.
nouns_list[1]

['실리콘밸리', '구글', '전역', '거점']

In [54]:
# Topic distribution the model assigns to the first title: (topic id, probability) pairs.
ldamodel[corpus[0]]

[(0, 0.14461014),
 (1, 0.47380725),
 (2, 0.31798923),
 (3, 0.015898317),
 (4, 0.015896983),
 (5, 0.015901517),
 (6, 0.015896618)]

In [55]:
# Topic distribution the model assigns to the second title: (topic id, probability) pairs.
ldamodel[corpus[1]]

[(0, 0.028582988),
 (1, 0.62722576),
 (2, 0.028583093),
 (3, 0.028582985),
 (4, 0.22975972),
 (5, 0.028582985),
 (6, 0.028682495)]

In [56]:
# Bag-of-words (token id, count) pairs of the second title.
corpus[1]

[(8, 1), (9, 1), (10, 1), (11, 1)]

In [61]:
# Infer the topic distribution of a hand-written sentence: extract its nouns,
# map them to bag-of-words form with the trained dictionary, then query the model.
sentence = '서울 노조 최 수능'
sentence_nouns = okt.nouns(sentence)
my_corpus = dictionary.doc2bow(sentence_nouns)
ldamodel[my_corpus]

[(0, 0.028584873),
 (1, 0.0285884),
 (2, 0.028572422),
 (3, 0.028571866),
 (4, 0.028593555),
 (5, 0.02857193),
 (6, 0.828517)]

In [62]:
import pyLDAvis.gensim_models

In [63]:
# Turn on inline notebook rendering, prepare the visualisation data from the
# trained model, corpus and dictionary, then display it in the cell output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

  default_term_info = default_term_info.sort_values(


In [64]:
# Write the prepared visualisation to an HTML file under dataset/.
pyLDAvis.save_html(vis,'dataset/lda.html')

In [68]:
from gensim.models import CoherenceModel

# Score the trained model with the 'c_v' coherence measure, computed from the
# tokenised titles and the dictionary, then report the value.
coherence_lda = CoherenceModel(
    model=ldamodel,
    texts=nouns_list,
    dictionary=dictionary,
    coherence='c_v',
).get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.39529890371647447
