In [99]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
from sklearn.preprocessing import LabelEncoder
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

# CSV 파일 불러오기
data = pd.read_csv("abcnews-date-text.csv")
data

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
...,...,...
1244179,20211231,two aged care residents die as state records 2...
1244180,20211231,victoria records 5;919 new cases and seven deaths
1244181,20211231,wa delays adopting new close contact definition
1244182,20211231,western ringtail possums found badly dehydrate...


In [100]:
# 필요한 열 선택 (headline_text: 기사 텍스트, category: 카테고리)
data = data[['headline_text']]

In [101]:
# 텍스트 정제 함수 (불용어 제거와 1글자짜리 단어 제거)
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # 영문자와 공백 이외의 문자 제거
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # 소문자 변환
    text = text.lower()
    # 불용어 제거 및 2글자 이하 단어 제거
    text = ' '.join(word for word in text.split() if word not in stop_words and len(word) > 2)
    return text

In [102]:
# 텍스트 정제 적용
data['headline_text'] = data['headline_text'].apply(clean_text)
data.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['headline_text'] = data['headline_text'].apply(clean_text)


Unnamed: 0,headline_text
0,aba decides community broadcasting licence
1,act fire witnesses must aware defamation
2,calls infrastructure protection summit
3,air staff aust strike pay rise
4,air strike affect australian travellers


In [103]:
# 동사 시제에 따라 변한 것을 동사 원형으로 만드는 장겁
data['headline_text'] = data.apply(lambda x: nltk.word_tokenize(x['headline_text']), axis=1)
data['headline_text'] = data['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
data.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['headline_text'] = data.apply(lambda x: nltk.word_tokenize(x['headline_text']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['headline_text'] = data['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])


Unnamed: 0,headline_text
0,"[aba, decide, community, broadcast, licence]"
1,"[act, fire, witness, must, aware, defamation]"
2,"[call, infrastructure, protection, summit]"
3,"[air, staff, aust, strike, pay, rise]"
4,"[air, strike, affect, australian, travellers]"


In [104]:
# 역토큰화 (토큰화 작업을 되돌림)
detokenized_doc = []
for i in range(len(data)):
    t = ' '.join(data['headline_text'][i])
    detokenized_doc.append(t)

In [105]:
# 다시 text['headline_text']에 재저장
data['headline_text'] = detokenized_doc
data.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['headline_text'] = detokenized_doc


Unnamed: 0,headline_text
0,aba decide community broadcast licence
1,act fire witness must aware defamation
2,call infrastructure protection summit
3,air staff aust strike pay rise
4,air strike affect australian travellers


In [113]:
# TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer(max_features=10000, max_df=0.85)
X = tfidf_vectorizer.fit_transform(data['headline_text'])

In [114]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [115]:
tfidf_tokens = []
for i in range(len(data)):
    doc = X[i, :].toarray()
    tokens = [tfidf_feature_names[j] for j in np.where(doc > 0)[1]]
    tfidf_tokens.append(tokens)

In [117]:
# 토큰화된 결과를 데이터프레임에 추가
data['tfidf_tokens'] = tfidf_tokens

In [118]:
data.head()

Unnamed: 0,headline_text,tfidf_tokens
0,aba decide community broadcast licence,"[aba, broadcast, community, decide, licence]"
1,act fire witness must aware defamation,"[act, aware, defamation, fire, must, witness]"
2,call infrastructure protection summit,"[call, infrastructure, protection, summit]"
3,air staff aust strike pay rise,"[air, aust, pay, rise, staff, strike]"
4,air strike affect australian travellers,"[affect, air, australian, strike, travellers]"


In [119]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary

In [120]:
# Dictionary 및 corpus 생성
dictionary = Dictionary(tfidf_tokens)
corpus = [dictionary.doc2bow(text) for text in tfidf_tokens]
print(corpus[1]) # 수행된 결과에서 두번째 뉴스 출력. 첫번째 문서의 인덱스는 0

[(5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]


In [122]:
print(dictionary[66], len(dictionary))

water 10000


In [123]:
import gensim
NUM_TOPICS = 6 # 6개의 토픽
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.023*"australian" + 0.023*"australia" + 0.022*"trump" + 0.016*"coronavirus" + 0.016*"test" + 0.015*"vaccine" + 0.014*"melbourne" + 0.013*"live" + 0.013*"border" + 0.011*"day"')
(1, '0.032*"police" + 0.028*"queensland" + 0.013*"nsw" + 0.012*"two" + 0.012*"coast" + 0.011*"kill" + 0.011*"sydney" + 0.011*"find" + 0.010*"crash" + 0.010*"morrison"')
(2, '0.073*"covid" + 0.024*"coronavirus" + 0.017*"australia" + 0.014*"china" + 0.013*"say" + 0.012*"first" + 0.012*"state" + 0.011*"new" + 0.011*"warn" + 0.008*"speak"')
(3, '0.015*"change" + 0.015*"donald" + 0.013*"lockdown" + 0.013*"say" + 0.012*"people" + 0.010*"quarantine" + 0.010*"health" + 0.010*"national" + 0.009*"concern" + 0.009*"minister"')
(4, '0.029*"case" + 0.017*"government" + 0.013*"home" + 0.013*"court" + 0.013*"news" + 0.013*"charge" + 0.011*"man" + 0.011*"die" + 0.010*"face" + 0.010*"murder"')
(5, '0.030*"new" + 0.028*"coronavirus" + 0.023*"victoria" + 0.019*"nsw" + 0.016*"record" + 0.014*"election" + 0.013*"restrictions" 

In [None]:
pip install pyLDAvis

In [124]:
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

문서 별 토픽 분포 보기

In [125]:
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)
    # doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
    doc = sorted(topic_list, key=lambda x: (x[1]), reverse=True)
    print(doc)
    

0 번째 문서의 topic 비율은 [(0, 0.20452273), (1, 0.029229345), (2, 0.029229375), (3, 0.55547667), (4, 0.029289646), (5, 0.15225221)]
[(3, 0.55547667), (0, 0.20452273), (5, 0.15225221), (4, 0.029289646), (2, 0.029229375), (1, 0.029229345)]
1 번째 문서의 topic 비율은 [(0, 0.3090777), (1, 0.024005834), (2, 0.023827529), (3, 0.023827529), (4, 0.59531456), (5, 0.02394689)]
[(4, 0.59531456), (0, 0.3090777), (1, 0.024005834), (5, 0.02394689), (2, 0.023827529), (3, 0.023827529)]
2 번째 문서의 topic 비율은 [(0, 0.033341743), (1, 0.033341743), (2, 0.26875195), (3, 0.5977276), (4, 0.0334126), (5, 0.03342436)]
[(3, 0.5977276), (2, 0.26875195), (5, 0.03342436), (4, 0.0334126), (0, 0.033341743), (1, 0.033341743)]
3 번째 문서의 topic 비율은 [(0, 0.023835283), (1, 0.02391323), (2, 0.3869669), (3, 0.19881777), (4, 0.023917923), (5, 0.34254885)]
[(2, 0.3869669), (5, 0.34254885), (3, 0.19881777), (4, 0.023917923), (1, 0.02391323), (0, 0.023835283)]
4 번째 문서의 topic 비율은 [(0, 0.19448482), (1, 0.02801508), (2, 0.028502064), (3, 0.36052954),

In [126]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        if i == 1000:
            break
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = pd.concat([topic_table, pd.DataFrame([int(topic_num), round(prop_topic,4), doc]).T], ignore_index= True)
                # pd.DataFrame({'가장 비중이 높은 토픽' : int(topic_num), '가장 높은 토픽의 비중' : round(prop_topic,4), '각 토픽의 비중':topic_list})
                # print(topic_table)
                # topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return topic_table

In [127]:
topictable = make_topictable_per_doc(ldamodel, corpus)

In [128]:
topictable = topictable.reset_index() # 문서 번호을 의미하는 열(column)로 사용하기 위해서 인덱스 열을 하나 더 만든다.
topictable.columns = ['num', 'best_topic', 'best_topic_rate', 'topic_rate']

In [129]:
topictable

Unnamed: 0,num,best_topic,best_topic_rate,topic_rate
0,0,3,0.5553,"[(3, 0.555338), (0, 0.20447157), (5, 0.1524643..."
1,1,4,0.5953,"[(4, 0.5953144), (0, 0.30907768), (1, 0.024005..."
2,2,3,0.5977,"[(3, 0.5977433), (2, 0.26873627), (5, 0.033424..."
3,3,2,0.387,"[(2, 0.38697717), (5, 0.34254503), (3, 0.19881..."
4,4,4,0.3607,"[(4, 0.3606937), (3, 0.36053774), (0, 0.194484..."
...,...,...,...,...
995,995,0,0.4328,"[(0, 0.43279567), (3, 0.23354031), (4, 0.23351..."
996,996,3,0.7913,"[(3, 0.79130876), (2, 0.04188438), (4, 0.04179..."
997,997,0,0.3623,"[(0, 0.3622772), (3, 0.3099698), (4, 0.2429436..."
998,998,1,0.633,"[(1, 0.6330109), (0, 0.23337057), (5, 0.033460..."
