# 1. 필요한 라이브러리 임포트

In [3]:
import pandas as pd
import re
from kiwipiepy import Kiwi
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from tqdm import tqdm

# 2. 데이터셋 준비

In [None]:
# Speaker identifiers, one per input file. Each entry is interpolated into the
# CSV path (cleaned_<name>.csv) and the output HTML file name, so the spelling
# must match the files on disk exactly.
name_file = [
    'LeeSeungMan',
    'YoonBoSun',
    'ParkJeongHee',
    'ChoiGyuHa',
    'JeonDuHwuan',
    'RohTaeWoo',
    'KimYoungSam',
    'KimDaeJung',
    'RohMooHyun',
    'LeeMyungBak',
    'ParkGeunHye',
    'MoonJaeIn',
    'YoonSeokYeol',
]

# 3. LDA

In [None]:
# 텍스트 정제 함수 정의
def clean_text(text):
    """Normalize a raw speech string before tokenization.

    Periods and newlines are deleted, the middle dot is turned into a space,
    and every remaining run of characters outside the allowed set is removed.

    Args:
        text: The raw speech text.

    Returns:
        The cleaned string. Surrounding whitespace is stripped after each
        literal substitution, but not after the final regex pass, so removing
        a leading or trailing non-Korean word can leave a space at the edge.
    """
    # (old, new) pairs, applied in this order; whitespace is trimmed after each.
    substitutions = ((".", ""), ("·", " "), ("\n", ""))
    for old, new in substitutions:
        text = text.replace(old, new).strip()

    # Allowed characters: space, Hangul jamo, Hangul syllables, digits, and a
    # literal '|' (inside a character class the pipe is not alternation).
    pattern = '[^ ㄱ-ㅣ가-힣|0-9]+'
    return re.sub(pattern, '', text)

# 명사 추출 함수
def get_nouns(tokenizer, sentence, stopwords):
    """Return the noun forms found in ``sentence``, minus stopwords.

    Args:
        tokenizer: Object exposing ``analyze(sentence)``; the token sequence
            is read from the first element of the first returned result
            (a kiwipiepy ``Kiwi`` instance in this notebook).
        sentence: Text to analyze.
        stopwords: Container of forms to exclude.

    Returns:
        A list of token forms, in analysis order, whose tag is ``NNG`` or
        ``NNP``, whose length is greater than one character, and which are
        not in ``stopwords``.
    """
    noun_tags = ('NNG', 'NNP')
    best_tokens = tokenizer.analyze(sentence)[0][0]

    kept = []
    for tok in best_tokens:
        if tok.tag not in noun_tags:
            continue
        surface = tok.form
        # Single-character forms and stopwords are both dropped.
        if len(surface) <= 1 or surface in stopwords:
            continue
        kept.append(surface)
    return kept

In [None]:
# DataFrame 준비
# The Kiwi tokenizer and the stopword list do not depend on the speaker, so
# they are built once here instead of on every loop iteration.
tokenizer = Kiwi()

stopwords = ['국민', '정부', '나라', '대하다', '발전', '국가', '한국', '대통령', '노력', '오늘', '지금', '문제',
             '관계', '더욱', '감사', '시대', '통하다', '모두', '지역', '동시', '이때', '이날', '하오']

# For each speaker: load the cleaned CSV, tokenize the speeches into nouns,
# train a 5-topic LDA model, save a pyLDAvis HTML report, and print the topics.
for name in name_file:
    file_path = f'./dataset/cleaned_data/cleaned_{name}.csv'
    df = pd.read_csv(file_path)

    # The YoonSeokYeol file gets "subtitle" as its third column label; every
    # other file gets "type". The check must use the loop variable `name`
    # (comparing the whole `name_file` list to a string is always False) and
    # must assign to `df.columns` (a misspelled attribute renames nothing).
    if name == 'YoonSeokYeol':
        df.columns = ["index", "title", "subtitle", "date", "speaker", "speech"]
    else:
        df.columns = ["index", "title", "type", "date", "speaker", "speech"]

    # Drop every row that has a missing value in any column.
    df = df.dropna(how='any')

    # Clean each speech, then reduce it to its list of noun tokens.
    df['cleaned_speech'] = df['speech'].apply(clean_text)
    df['tokens'] = df['cleaned_speech'].apply(lambda x: get_nouns(tokenizer, x, stopwords))

    # Preview the tokenization result.
    print(df[['cleaned_speech', 'tokens']].head())

    # Build the gensim dictionary and the bag-of-words corpus.
    dictionary = corpora.Dictionary(df['tokens'])
    corpus = [dictionary.doc2bow(text) for text in df['tokens']]

    # Preview the dictionary and the first few corpus entries.
    print(dictionary)
    print(corpus[:5])

    # Train the LDA model.
    lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

    # Prepare the interactive topic visualization and save it as HTML.
    vis = gensimvis.prepare(lda_model, corpus, dictionary)
    pyLDAvis.save_html(vis, f'./lda/result/each_result/{name}_kiwi_lda.html')

    # Print every topic with its top words.
    for idx, topic in lda_model.print_topics(-1):
        print(f'Topic: {idx} \nWords: {topic}\n')