In [20]:
import pandas as pd
import urllib.request
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package punkt to /home/hyejin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hyejin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/hyejin/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


In [None]:
# kaggle에서 약 15년간 발행되었던 뉴스 기사 제목을 모아놓은 영어 데이터 다운받기

# urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/21.%20Topic%20Modeling/dataset/abcnews-date-text.csv", filename="abcnews-date-text.csv")
data = pd.read_csv('../../datasets/abcnews-date-text.csv', error_bad_lines=False)
print('뉴스 제목 개수 :', len(data))

In [6]:
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [12]:
text = data[['headline_text']]    # dataframe의 일부를 떼어낼 때는 [[]] 이렇게 써 줌
text.head()

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [15]:
# 전처리 (불용어 제거, 표제어 추출, 길이가 짧은 단어 제거)

# 단어 토큰화
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,headline_text
0,"[aba, decides, against, community, broadcastin..."
1,"[act, fire, witnesses, must, be, aware, of, de..."
2,"[a, g, calls, for, infrastructure, protection,..."
3,"[air, nz, staff, in, aust, strike, for, pay, r..."
4,"[air, nz, strike, to, affect, australian, trav..."


In [16]:
# 불용어 제거
stop_words = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop_words)])
text.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,headline_text
0,"[aba, decides, community, broadcasting, licence]"
1,"[act, fire, witnesses, must, aware, defamation]"
2,"[g, calls, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [21]:
# 표제어 추출
text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
# pos : 해당 단어가 갖는 품사
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,headline_text
0,"[aba, decide, community, broadcast, licence]"
1,"[act, fire, witness, must, aware, defamation]"
2,"[g, call, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [24]:
# 길이가 짧은 (3 이하) 단어 제거
tokenized_doc = text['headline_text'].apply(lambda x: [word for word in x if len(word) > 3])
print(tokenized_doc[:5])

0       [decide, community, broadcast, licence]
1      [fire, witness, must, aware, defamation]
2    [call, infrastructure, protection, summit]
3                   [staff, aust, strike, rise]
4      [strike, affect, australian, travellers]
Name: headline_text, dtype: object


In [26]:
# TF-IDF 행렬 만들기

# 역토큰화
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

text['headline_text'] = detokenized_doc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [27]:
text['headline_text'][:5]

0       decide community broadcast licence
1       fire witness must aware defamation
2    call infrastructure protection summit
3                   staff aust strike rise
4      strike affect australian travellers
Name: headline_text, dtype: object

In [28]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(text['headline_text'])

print('TF-IDF 행렬의 크기 :', X.shape)

TF-IDF 행렬의 크기 : (1226258, 1000)


In [29]:
# 주제는 10개라고 가정
lda_model = LatentDirichletAllocation(n_components=10, learning_method='online', random_state=42, max_iter=1)

In [30]:
lda_top = lda_model.fit_transform(X)

In [31]:
print(lda_model.components_)    # 행 : 각 topic, 열 : vectorizer에서 생성한 단어 집합 순서대로 단어가 차지하는 비중
print(lda_model.components_.shape)

[[1.00001333e-01 1.00000360e-01 1.00001033e-01 ... 1.00005409e-01
  1.00005312e-01 5.79474576e+02]
 [1.00002113e-01 1.00000151e-01 1.00000991e-01 ... 1.00005603e-01
  1.00003021e-01 1.00003672e-01]
 [1.07423397e+02 1.00001248e-01 1.00001029e-01 ... 1.00004519e-01
  1.00002302e-01 1.00004720e-01]
 ...
 [1.00010119e-01 1.00000715e-01 1.00001260e-01 ... 1.00005940e-01
  1.00003374e-01 1.00006855e-01]
 [1.00000459e-01 1.00000156e-01 1.00000800e-01 ... 2.66967658e+03
  2.25356733e+02 1.00003216e-01]
 [1.00001012e-01 1.00000749e-01 1.00000754e-01 ... 1.00006357e-01
  1.00003723e-01 1.00004739e-01]]
(10, 1000)


In [32]:
terms = vectorizer.get_feature_names()    # 단어 집합

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print(f"Topic {idx+1}", [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n-1:-1]])
        # feature_names[i] : i번째 단어, topic[i] : 각 topic에서 i번째 단어가 차지하는 분포 비율 (오름차순 출력)

In [33]:
get_topics(lda_model.components_, terms)

Topic 1 [('victoria', 10827.28), ('donald', 9114.15), ('canberra', 6155.99), ('attack', 5827.72), ('market', 5312.35)]
Topic 2 [('trump', 15903.94), ('queensland', 12908.75), ('change', 7262.83), ('crash', 6153.23), ('state', 6086.17)]
Topic 3 [('police', 13931.38), ('case', 10138.89), ('charge', 8386.79), ('test', 7241.0), ('murder', 7177.55)]
Topic 4 [('coronavirus', 38635.73), ('covid', 19570.11), ('melbourne', 8900.08), ('warn', 5345.1), ('speak', 4845.83)]
Topic 5 [('sydney', 10948.96), ('government', 9187.9), ('news', 8582.48), ('home', 7318.39), ('health', 6349.63)]
Topic 6 [('world', 6872.76), ('restrictions', 5961.35), ('face', 5695.77), ('bushfire', 5364.12), ('return', 4321.35)]
Topic 7 [('election', 9987.75), ('kill', 6939.16), ('make', 6108.33), ('adelaide', 6099.67), ('woman', 5921.79)]
Topic 8 [('australian', 13286.11), ('coronavirus', 9535.08), ('china', 8358.46), ('live', 7908.02), ('border', 6378.89)]
Topic 9 [('australia', 19355.9), ('court', 6940.48), ('record', 638