In [4]:
import pandas as pd
import urllib.request
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/19.%20Topic%20Modeling%20(LDA%2C%20BERT-Based)/dataset/abcnews-date-text.csv")

data = pd.read_csv('abcnews-date-text.csv')
print('뉴스 제목 개수 :',len(data))

뉴스 제목 개수 : 1244184


In [5]:
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [6]:
text = data[['headline_text']]
text.head(5)

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [7]:
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)
text.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)


Unnamed: 0,headline_text
0,"[aba, decides, against, community, broadcastin..."
1,"[act, fire, witnesses, must, be, aware, of, de..."
2,"[a, g, calls, for, infrastructure, protection,..."
3,"[air, nz, staff, in, aust, strike, for, pay, r..."
4,"[air, nz, strike, to, affect, australian, trav..."


In [9]:
stop_words = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop_words)])
text.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop_words)])


Unnamed: 0,headline_text
0,"[aba, decides, community, broadcasting, licence]"
1,"[act, fire, witnesses, must, aware, defamation]"
2,"[g, calls, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [10]:
text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
text.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])


Unnamed: 0,headline_text
0,"[aba, decide, community, broadcast, licence]"
1,"[act, fire, witness, must, aware, defamation]"
2,"[g, call, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [11]:
tokenized_doc = text['headline_text'].apply(lambda x: [word for word in x if len(word) > 2])
tokenized_doc[:5]

0     [aba, decide, community, broadcast, licence]
1    [act, fire, witness, must, aware, defamation]
2       [call, infrastructure, protection, summit]
3            [air, staff, aust, strike, pay, rise]
4    [air, strike, affect, australian, travellers]
Name: headline_text, dtype: object

In [14]:
# 역토큰화 (토큰화 작업을 되돌림)
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

# 다시 text['headline_text']에 재저장
text['headline_text'] = detokenized_doc
text.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = detokenized_doc


Unnamed: 0,headline_text
0,aba decide community broadcast licence
1,act fire witness must aware defamation
2,call infrastructure protection summit
3,air staff aust strike pay rise
4,air strike affect australian travellers


In [16]:
# 상위 1,000개의 단어를 보존 // 모든 단어 보존
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(text['headline_text'])

# TF-IDF 행렬의 크기 확인
print('TF-IDF 행렬의 크기 :',X.shape)

TF-IDF 행렬의 크기 : (1244184, 94955)


토픽 모델링

In [20]:
lda_model = LatentDirichletAllocation(n_components=6,learning_method='online',random_state=777,max_iter=1)

In [21]:
lda_top = lda_model.fit_transform(X)

In [22]:
print(lda_model.components_)
print(lda_model.components_.shape) 

[[1.66666720e-01 1.68781184e-01 1.75770653e-01 ... 1.66722329e-01
  1.66711925e-01 1.66666668e-01]
 [1.66672730e-01 1.68498191e-01 1.77092165e-01 ... 1.90003822e-01
  1.66711686e-01 1.66666668e-01]
 [1.66666722e-01 1.69348102e-01 4.81009586e+00 ... 1.66726940e-01
  1.66714590e-01 1.66666667e-01]
 [1.66666735e-01 1.69410768e-01 2.96187285e+00 ... 1.76808532e-01
  1.73867305e-01 1.66666667e-01]
 [1.66693316e-01 1.69051469e-01 1.76663435e-01 ... 1.66727677e-01
  1.88237689e-01 1.66666667e-01]
 [1.66673534e-01 2.56367533e+02 1.70791276e+00 ... 1.74533971e-01
  1.66706993e-01 1.66666667e-01]]
(6, 94955)


In [24]:
# 단어 집합. 1,000개의 단어가 저장됨.
terms = vectorizer.get_feature_names_out()

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])

get_topics(lda_model.components_,terms)

Topic 1: [('coronavirus', 13055.12), ('covid', 7249.17), ('government', 6497.6), ('donald', 5654.46), ('election', 5645.96)]
Topic 2: [('case', 9071.28), ('new', 6145.35), ('vaccine', 5492.95), ('change', 4745.06), ('record', 4742.01)]
Topic 3: [('trump', 9321.61), ('restrictions', 4979.47), ('south', 4821.97), ('house', 4191.23), ('brisbane', 3640.0)]
Topic 4: [('australia', 12836.22), ('covid', 12513.91), ('coronavirus', 10708.27), ('victoria', 8377.68), ('people', 4215.43)]
Topic 5: [('australian', 8000.12), ('news', 5894.53), ('lockdown', 4891.27), ('coronavirus', 4825.71), ('state', 4125.78)]
Topic 6: [('queensland', 9068.35), ('police', 8490.0), ('man', 5431.42), ('live', 5189.55), ('charge', 4577.36)]


In [25]:
tokenized_doc[:5]

0     [aba, decide, community, broadcast, licence]
1    [act, fire, witness, must, aware, defamation]
2       [call, infrastructure, protection, summit]
3            [air, staff, aust, strike, pay, rise]
4    [air, strike, affect, australian, travellers]
Name: headline_text, dtype: object

In [27]:
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[0]) # 수행된 결과에서 두번째 뉴스 출력. 첫번째 문서의 인덱스는 0

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


In [112]:
dictionary[5], len(dictionary)

('act', 96686)

In [37]:
import gensim
NUM_TOPICS = 6 # 20개의 토픽, k=20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.015*"government" + 0.012*"health" + 0.011*"change" + 0.011*"court" + 0.011*"restrictions" + 0.011*"charge" + 0.010*"nsw" + 0.008*"help" + 0.007*"age" + 0.007*"call"')
(1, '0.044*"australia" + 0.017*"new" + 0.015*"news" + 0.014*"record" + 0.013*"day" + 0.012*"one" + 0.009*"world" + 0.008*"win" + 0.008*"announce" + 0.008*"protest"')
(2, '0.027*"coronavirus" + 0.020*"victoria" + 0.019*"covid" + 0.013*"vaccine" + 0.011*"live" + 0.011*"say" + 0.011*"nsw" + 0.010*"lockdown" + 0.010*"new" + 0.009*"minister"')
(3, '0.028*"queensland" + 0.026*"australian" + 0.017*"covid" + 0.016*"election" + 0.015*"border" + 0.014*"coronavirus" + 0.013*"open" + 0.012*"test" + 0.010*"morrison" + 0.009*"return"')
(4, '0.026*"case" + 0.025*"police" + 0.020*"trump" + 0.017*"man" + 0.015*"coronavirus" + 0.015*"covid" + 0.014*"sydney" + 0.014*"find" + 0.012*"china" + 0.011*"donald"')
(5, '0.017*"melbourne" + 0.016*"coronavirus" + 0.014*"house" + 0.013*"two" + 0.012*"people" + 0.010*"quarantine" + 0.010*"nation

In [39]:
ldamodel.print_topics()

[(0,
  '0.015*"government" + 0.012*"health" + 0.011*"change" + 0.011*"court" + 0.011*"restrictions" + 0.011*"charge" + 0.010*"nsw" + 0.008*"help" + 0.007*"age" + 0.007*"call"'),
 (1,
  '0.044*"australia" + 0.017*"new" + 0.015*"news" + 0.014*"record" + 0.013*"day" + 0.012*"one" + 0.009*"world" + 0.008*"win" + 0.008*"announce" + 0.008*"protest"'),
 (2,
  '0.027*"coronavirus" + 0.020*"victoria" + 0.019*"covid" + 0.013*"vaccine" + 0.011*"live" + 0.011*"say" + 0.011*"nsw" + 0.010*"lockdown" + 0.010*"new" + 0.009*"minister"'),
 (3,
  '0.028*"queensland" + 0.026*"australian" + 0.017*"covid" + 0.016*"election" + 0.015*"border" + 0.014*"coronavirus" + 0.013*"open" + 0.012*"test" + 0.010*"morrison" + 0.009*"return"'),
 (4,
  '0.026*"case" + 0.025*"police" + 0.020*"trump" + 0.017*"man" + 0.015*"coronavirus" + 0.015*"covid" + 0.014*"sydney" + 0.014*"find" + 0.012*"china" + 0.011*"donald"'),
 (5,
  '0.017*"melbourne" + 0.016*"coronavirus" + 0.014*"house" + 0.013*"two" + 0.012*"people" + 0.010*"quar

In [None]:
# pip install pyLDAvis

In [43]:
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

In [126]:
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)
    # doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
    doc = sorted(topic_list, key=lambda x: (x[1]), reverse=True)
    print(doc)
    

0 번째 문서의 topic 비율은 [(0, 0.20515978), (1, 0.029301945), (2, 0.1504378), (3, 0.2050645), (4, 0.38073403), (5, 0.029301958)]
1 번째 문서의 topic 비율은 [(0, 0.024015067), (1, 0.023825083), (2, 0.5474377), (3, 0.023846032), (4, 0.19234931), (5, 0.18852675)]
2 번째 문서의 topic 비율은 [(0, 0.033618893), (1, 0.033342775), (2, 0.6329594), (3, 0.033342775), (4, 0.033466924), (5, 0.23326924)]
3 번째 문서의 topic 비율은 [(0, 0.5958986), (1, 0.023837658), (2, 0.023837658), (3, 0.023837658), (4, 0.023837658), (5, 0.30875078)]
4 번째 문서의 topic 비율은 [(0, 0.36111572), (1, 0.027782893), (2, 0.19439459), (3, 0.36114103), (4, 0.027782893), (5, 0.027782893)]


In [182]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        if i == 1000:
            break
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = pd.concat([topic_table, pd.DataFrame([int(topic_num), round(prop_topic,4), doc]).T], ignore_index= True)
                # pd.DataFrame({'가장 비중이 높은 토픽' : int(topic_num), '가장 높은 토픽의 비중' : round(prop_topic,4), '각 토픽의 비중':topic_list})
                # print(topic_table)
                # topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return topic_table

In [163]:
df = pd.DataFrame()
sr1 = pd.DataFrame(['e0','e1','e2'])
sr2 = pd.DataFrame(['f0','f1','f2'])
sr3 = pd.DataFrame(['g0','g1','g2'])
A = pd.concat([df, sr1.T])
A = pd.concat([A, sr2.T])
pd.concat([A, sr3.T])

Unnamed: 0,0,1,2
0,e0,e1,e2
0,f0,f1,f2
0,g0,g1,g2


In [194]:
topictable = make_topictable_per_doc(ldamodel, corpus)

In [195]:
topictable = topictable.reset_index() # 문서 번호을 의미하는 열(column)로 사용하기 위해서 인덱스 열을 하나 더 만든다.
topictable.columns = ['num', 'best_topic', 'best_topic_rate', 'topic_rate']

In [196]:
topictable

Unnamed: 0,num,best_topic,best_topic_rate,topic_rate
0,0,4,0.3807,"[(4, 0.38066486), (0, 0.20512268), (3, 0.20502..."
1,1,2,0.5474,"[(2, 0.54737), (4, 0.19238877), (5, 0.18855181..."
2,2,2,0.633,"[(2, 0.6329586), (5, 0.23326924), (0, 0.033619..."
3,3,0,0.5959,"[(0, 0.5958986), (5, 0.30875078), (1, 0.023837..."
4,4,3,0.3611,"[(3, 0.361141), (0, 0.36111572), (2, 0.1943945..."
...,...,...,...,...
995,995,0,0.8332,"[(0, 0.8331586), (2, 0.03341528), (1, 0.033356..."
996,996,5,0.5416,"[(5, 0.54160905), (0, 0.29168916), (1, 0.04167..."
997,997,1,0.5308,"[(1, 0.5307774), (0, 0.35695925), (3, 0.028065..."
998,998,5,0.6331,"[(5, 0.6331105), (3, 0.23347138), (0, 0.033354..."
