# English Data Topic Modeling Using `LDA`

## Module Import

In [None]:
# self defined Modules
from myModules.utils.merge.mergeOverPeriod import merge
from myModules.TopicModeling.LDA.ldaModeling import buildDTM, topicWords, visualizeLDA

# General Modules
import pandas as pd
import numpy as np
import pickle
from itertools import product

import warnings
from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

# NLP
from gensim import models
from gensim.models.coherencemodel import CoherenceModel

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline

## Data Load

In [None]:
# Input folders: one sub-folder of lemmatized pickles per period.
DATA_ROOT = './processed-data/'
PERIOD_1, PERIOD_2, PERIOD_3 = (
    f'{DATA_ROOT}period-{n}/' for n in (1, 2, 3)
)

# Output folders: one per period below the common result root.
# The sub-path starts with '/' although RESULT_ROOT already ends with one, so
# the resulting paths contain a doubled separator ('.../3구간//1시기/ST/').
RESULT_ROOT = './Result/3구간/'
RESULT_1, RESULT_2, RESULT_3 = (
    f'{RESULT_ROOT}/{n}시기/ST/' for n in (1, 2, 3)
)

In [None]:
def _load_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Period 1: full lemmatized data plus the per-part-of-speech subsets.
all_1 = _load_pickle(PERIOD_1 + "lemmatized-all.pkl")
noun_1 = _load_pickle(PERIOD_1 + "lemmatized-noun.pkl")
verb_1 = _load_pickle(PERIOD_1 + "lemmatized-verb.pkl")
adjective_1 = _load_pickle(PERIOD_1 + "lemmatized-adjective.pkl")
adverb_1 = _load_pickle(PERIOD_1 + "lemmatized-adverb.pkl")

# Period 2
all_2 = _load_pickle(PERIOD_2 + "lemmatized-all.pkl")
noun_2 = _load_pickle(PERIOD_2 + "lemmatized-noun.pkl")
verb_2 = _load_pickle(PERIOD_2 + "lemmatized-verb.pkl")
adjective_2 = _load_pickle(PERIOD_2 + "lemmatized-adjective.pkl")
adverb_2 = _load_pickle(PERIOD_2 + "lemmatized-adverb.pkl")

# Period 3
all_3 = _load_pickle(PERIOD_3 + "lemmatized-all.pkl")
noun_3 = _load_pickle(PERIOD_3 + "lemmatized-noun.pkl")
verb_3 = _load_pickle(PERIOD_3 + "lemmatized-verb.pkl")
adjective_3 = _load_pickle(PERIOD_3 + "lemmatized-adjective.pkl")
adverb_3 = _load_pickle(PERIOD_3 + "lemmatized-adverb.pkl")

## Topic Modeling

- topic num : 가설로 설정한 topic의 갯수
    1. Topic Coherence
        - 주제의 일관성 측정
        - 모델링이 잘 될수록 한 주제 안에는 의미론적으로 유사한 단어가 많이 모여있게 됨.
        - 높을수록 의미론적 일관성이 높다.
        - Coherence가 높아지면 Monotonic 해지는 문제점이 생긴다.
        - coherence가 너무 높아지면 정보의 양이 줄어들고, coherence가 너무 낮으면 정보들의 연관성이 없어져 분석의 의미가 없다.
    2. Perplexity   
        - Coherence가 이 data에서 topic number가 늘어날수록 거의 같이 늘어나는 경향을 보임
        - 따라서 다른 평가기준도 함께 고려해야겠다는 생각에 추가
        - 작아질수록 토픽모델이 문서를 잘 반영한다.

- lda modeling 결과를 시각화해 보았을 때, 10이상으로 넘어가면 할당되지 않는 빈 id들이 발견되었습니다.
    - 따라서 시험할 k값(topic 수)의 범위를 2~10의 자연수로 설정하였습니다 (아래 `param_grid`의 `range(2, 11)`).

In [None]:
# Number of top words exported per topic (second argument of topicWords).
NUM_TOPIC_WORDS = 30
# Seed shared by every LDA model so that runs are reproducible.
random_state = 42
# Number of training passes over the corpus for the LDA models below.
# The later cells reference PASSES, so it has to be defined here;
# 1 is gensim's default -- raise it for better-converged (but slower) models.
PASSES = 1

In [None]:
class BestLDAPram:
    """Grid-search helper that scores LDA hyper-parameters by c_v coherence.

    The constructor builds the corpus/dictionary pair once with ``buildDTM``;
    ``grid_search`` then trains one ``LdaMulticore`` model per parameter
    combination and records its coherence.  The resulting table has the
    columns ``Topics``, ``Alpha``, ``Eta`` and ``Coherence``.
    """

    # Column layout of the grid-search result table (also the CSV header).
    _COLUMNS = ['Topics', 'Alpha', 'Eta', 'Coherence']

    def __init__(self, data, random_state=42, passes=1):
        """Store the documents and build the corpus and dictionary.

        Args:
            data: Documents handed to ``buildDTM`` and used as ``texts`` for
                the coherence computation.
            random_state: Seed forwarded to every trained LDA model.
            passes: Training passes per candidate model (1 is gensim's
                default, so omitting it keeps the previous behavior).
        """
        self.data = data
        self.random_state = random_state
        self.passes = passes
        self.corpus, self.dictionary = buildDTM(self.data)

    def grid_search(self, param_grid, result_save_root='./'):
        """Score every combination in ``param_grid`` and save the table as CSV.

        Args:
            param_grid: Mapping with the iterables ``'num_topics'``,
                ``'alpha'`` and ``'eta'``.
            result_save_root: Folder prefix (must end with a path separator)
                for ``lda-param-grid-search.csv``.
        """
        # Materialized so that tqdm knows the total number of steps.
        combinations = list(product(param_grid['num_topics'], param_grid['alpha'], param_grid['eta']))

        rows = [
            [num_topic, alpha, eta, self.calc_coherence(num_topic=num_topic, alpha=alpha, eta=eta)]
            for num_topic, alpha, eta in tqdm(combinations, desc="LDA Parameter Grid Searching")
        ]

        # Passing the columns to the constructor also works for an empty grid.
        self.grid_search_table = pd.DataFrame(rows, columns=self._COLUMNS)
        self.grid_search_table.to_csv(result_save_root+'lda-param-grid-search.csv', index=False)

    def calc_coherence(self, num_topic, alpha, eta):
        """Train one LDA model and return its c_v coherence score."""
        model = models.LdaMulticore(
            corpus=self.corpus,
            id2word=self.dictionary,
            num_topics=num_topic,
            passes=self.passes,
            alpha=alpha,
            eta=eta,
            random_state=self.random_state,
        )

        coherence_model = CoherenceModel(model=model, texts=self.data, dictionary=self.dictionary, coherence='c_v')

        return coherence_model.get_coherence()

    def load_grid_search_result(self, grid_search_result):
        """Use an existing result table (e.g. read back from the saved CSV)."""
        self.grid_search_table = grid_search_result

    @staticmethod
    def _to_native(value):
        """Convert a table cell into a value gensim accepts as a prior.

        A table read back from CSV stores a mixed numeric/named column as
        text, so ``'0.01'`` must become ``0.01`` again while names such as
        ``'symmetric'`` stay strings.  NumPy scalars become Python scalars.
        """
        if isinstance(value, str):
            try:
                return float(value)
            except ValueError:
                return value
        return value.item() if hasattr(value, 'item') else value

    def _mean_coherence_by(self, column):
        """Return the mean coherence per distinct value of ``column``.

        ``sort=False`` keeps the order of first appearance and avoids
        comparing numbers with strings in mixed columns.
        """
        return self.grid_search_table.groupby(column, sort=False)['Coherence'].mean()

    def _best_value(self, column):
        """Return the value of ``column`` with the highest mean coherence."""
        return self._to_native(self._mean_coherence_by(column).idxmax())

    def get_best_params(self):
        """Pick, for each parameter independently, the best-scoring value.

        Each parameter is judged by its coherence averaged over all
        combinations of the other two parameters.

        Returns:
            Tuple ``(num_topics, alpha, eta)``: ``num_topics`` is an ``int``;
            ``alpha`` and ``eta`` are a ``float`` or a prior name (``str``).
        """
        best_num_topics = int(self._best_value('Topics'))
        best_alpha = self._best_value('Alpha')
        best_eta = self._best_value('Eta')

        return best_num_topics, best_alpha, best_eta

    def _plot_mean_coherence(self, column, xlabel, title, root):
        """Plot mean coherence per value of ``column``; save as ``root+title+'.png'``."""
        means = self._mean_coherence_by(column)

        x_values = list(means.index)
        # A mix of numbers and prior names cannot share one axis; plot all
        # values as category labels in that case.
        if any(isinstance(value, str) for value in x_values):
            x_values = [str(value) for value in x_values]

        plt.figure()
        plt.plot(x_values, means.to_numpy())

        plt.xlabel(xlabel)
        plt.ylabel('Coherence')

        plt.title(title)
        plt.savefig(root+title+'.png')
        plt.show()

    def plot_coherence_per_topics(self, title='Coherence per Topic Num', root='./'):
        """Plot and save the mean coherence for each number of topics."""
        self._plot_mean_coherence('Topics', 'Number of Topics', title, root)

    def plot_coherence_per_alpha(self, title='Coherence per alpha', root='./'):
        """Plot and save the mean coherence for each alpha value."""
        self._plot_mean_coherence('Alpha', 'Alpha', title, root)

    def plot_coherence_per_eta(self, title='Coherence per eta', root='./'):
        """Plot and save the mean coherence for each eta value."""
        self._plot_mean_coherence('Eta', 'Eta', title, root)

#### Period 1

In [None]:
# Grid-search helper for the Period 1 documents; the constructor runs
# buildDTM on the data to create the corpus and dictionary.
lda_param = BestLDAPram(data=all_1, random_state=random_state)

##### Parameter의 실험 범위를 설정하고 모든 조합에 대해 Coherence를 계산하여 parameter tuning

- 최초 1회만 실행 -> 약 3시간 정도 걸림.
- grid search 결과를 csv 파일로 저장 -> 불러와서 사용하기

In [None]:
# Candidate topic counts: 2 through 10.
num_topics = range(2, 11)

# Numeric priors 0.01, 0.31, 0.61, 0.91 followed by gensim's named priors.
alpha = [*np.arange(0.01, 1, 0.3), 'symmetric', 'asymmetric']
eta = [*np.arange(0.01, 1, 0.3), 'symmetric']

# Every combination of these three axes is evaluated by the grid search.
param_grid = dict(num_topics=num_topics, alpha=alpha, eta=eta)

In [None]:
# Long-running: trains one LDA model per parameter combination. With the
# default result_save_root the table is written to ./lda-param-grid-search.csv.
lda_param.grid_search(param_grid=param_grid)

In [None]:
# Reload the saved grid-search table so the expensive search does not have to
# be repeated in a fresh session.
grid_search_table = pd.read_csv('./lda-param-grid-search.csv')

# Hand the table to the helper so get_best_params / the plots can use it.
lda_param.load_grid_search_result(grid_search_result=grid_search_table)

In [None]:
# For each parameter independently, take the value whose coherence averaged
# over all other parameter settings is highest.
NUM_TOPICS, ALPHA, ETA = lda_param.get_best_params()

print(f"best Number of Topics : {NUM_TOPICS}\nbest Alpha : {ALPHA}\nbest Eta : {ETA}")

In [None]:
# Save into the Period 1 result folder: with the default root every period
# writes the same './Coherence per Topic Num.png' and overwrites the last one.
lda_param.plot_coherence_per_topics(root=RESULT_1)

In [None]:
# Save into the Period 1 result folder: with the default root every period
# writes the same './Coherence per alpha.png' and overwrites the last one.
lda_param.plot_coherence_per_alpha(root=RESULT_1)

In [None]:
# Save into the Period 1 result folder: with the default root every period
# writes the same './Coherence per eta.png' and overwrites the last one.
lda_param.plot_coherence_per_eta(root=RESULT_1)

In [None]:
# Corpus / dictionary pair for the Period 1 documents.
Corp, Dict = buildDTM(all_1)

# Final model trained with the parameters selected by the grid search.
model = models.ldamodel.LdaModel(
    corpus=Corp,
    id2word=Dict,
    num_topics=NUM_TOPICS,
    passes=PASSES,
    alpha=ALPHA,
    eta=ETA,
    random_state=random_state,
)

# Export the topic words (topicWords is a project helper; its result exposes
# to_csv, so presumably a DataFrame -- confirm in ldaModeling).
topicdf = topicWords(model, NUM_TOPIC_WORDS)
topicdf.to_csv(RESULT_1+'[Period 1] topic words.csv', index=False)

# Visualization object from the project helper; shown as the cell output.
data = visualizeLDA(model, Corp, Dict)

data

#### Period 2

In [None]:
# Grid-search helper for the Period 2 documents.
# NOTE(review): this call needs BestLDAPram.__init__ to accept `passes` and a
# module-level PASSES to be defined -- confirm both before running.
lda_param = BestLDAPram(data=all_2, passes=PASSES, random_state=random_state)

##### Parameter의 실험 범위를 설정하고 모든 조합에 대해 Coherence를 계산하여 parameter tuning

- 최초 1회만 실행 -> 약 3시간 정도 걸림.
- grid search 결과를 csv 파일로 저장 -> 불러와서 사용하기

In [None]:
import os

# Search grid for Period 2: 2-10 topics, alpha 0.001-0.010, eta 0.01-0.10.
# Dividing (instead of multiplying by 0.001 / 0.01) yields the correctly
# rounded decimals, so no values like 0.009000000000000001 end up in the
# saved table and on the plot axes.
param_grid = {
    'num_topics': range(2, 11),
    'alpha': [i / 1000 for i in range(1, 11)],
    'eta': [i / 100 for i in range(1, 11)],
}

# Write the table into the Period 2 result folder: the default './' target is
# the same file for every period, so this run would overwrite the Period 1
# table. Create the folder first so the long search cannot fail on saving.
os.makedirs(RESULT_2, exist_ok=True)
lda_param.grid_search(param_grid=param_grid, result_save_root=RESULT_2)

In [None]:
# For each parameter independently, take the value whose coherence averaged
# over all other parameter settings is highest.
NUM_TOPICS, ALPHA, ETA = lda_param.get_best_params()

print(f"best Number of Topics : {NUM_TOPICS}\nbest Alpha : {ALPHA}\nbest Eta : {ETA}")

In [None]:
# Save into the Period 2 result folder: with the default root every period
# writes the same './Coherence per Topic Num.png' and overwrites the last one.
lda_param.plot_coherence_per_topics(root=RESULT_2)

In [None]:
# Save into the Period 2 result folder: with the default root every period
# writes the same './Coherence per alpha.png' and overwrites the last one.
lda_param.plot_coherence_per_alpha(root=RESULT_2)

In [None]:
# Save into the Period 2 result folder: with the default root every period
# writes the same './Coherence per eta.png' and overwrites the last one.
lda_param.plot_coherence_per_eta(root=RESULT_2)

In [None]:
# Corpus / dictionary pair for the Period 2 documents.
Corp, Dict = buildDTM(all_2)

# Final model trained with the parameters selected by the grid search.
model = models.ldamodel.LdaModel(
    corpus=Corp,
    id2word=Dict,
    num_topics=NUM_TOPICS,
    passes=PASSES,
    alpha=ALPHA,
    eta=ETA,
    random_state=random_state,
)

# Export the topic words (topicWords is a project helper; its result exposes
# to_csv, so presumably a DataFrame -- confirm in ldaModeling).
topicdf = topicWords(model, NUM_TOPIC_WORDS)
topicdf.to_csv(RESULT_2+'[Period 2] topic words.csv', index=False)

# Visualization object from the project helper; shown as the cell output.
data = visualizeLDA(model, Corp, Dict)

data

#### Period 3

In [None]:
# Grid-search helper for the Period 3 documents.
# NOTE(review): this call needs BestLDAPram.__init__ to accept `passes` and a
# module-level PASSES to be defined -- confirm both before running.
lda_param = BestLDAPram(data=all_3, passes=PASSES, random_state=random_state)

##### Parameter의 실험 범위를 설정하고 모든 조합에 대해 Coherence를 계산하여 parameter tuning

- 최초 1회만 실행 -> 약 3시간 정도 걸림.
- grid search 결과를 csv 파일로 저장 -> 불러와서 사용하기

In [None]:
import os

# Search grid for Period 3: 2-10 topics, alpha 0.001-0.010, eta 0.01-0.10.
# Dividing (instead of multiplying by 0.001 / 0.01) yields the correctly
# rounded decimals, so no values like 0.009000000000000001 end up in the
# saved table and on the plot axes.
param_grid = {
    'num_topics': range(2, 11),
    'alpha': [i / 1000 for i in range(1, 11)],
    'eta': [i / 100 for i in range(1, 11)],
}

# Write the table into the Period 3 result folder: the default './' target is
# the same file for every period, so this run would overwrite the tables of
# the earlier periods. Create the folder first so the long search cannot fail
# on saving.
os.makedirs(RESULT_3, exist_ok=True)
lda_param.grid_search(param_grid=param_grid, result_save_root=RESULT_3)

In [None]:
# For each parameter independently, take the value whose coherence averaged
# over all other parameter settings is highest.
NUM_TOPICS, ALPHA, ETA = lda_param.get_best_params()

print(f"best Number of Topics : {NUM_TOPICS}\nbest Alpha : {ALPHA}\nbest Eta : {ETA}")

In [None]:
# Save into the Period 3 result folder: with the default root every period
# writes the same './Coherence per Topic Num.png' and overwrites the last one.
lda_param.plot_coherence_per_topics(root=RESULT_3)

In [None]:
# Save into the Period 3 result folder: with the default root every period
# writes the same './Coherence per alpha.png' and overwrites the last one.
lda_param.plot_coherence_per_alpha(root=RESULT_3)

In [None]:
# Save into the Period 3 result folder: with the default root every period
# writes the same './Coherence per eta.png' and overwrites the last one.
lda_param.plot_coherence_per_eta(root=RESULT_3)

In [None]:
# Corpus / dictionary pair for the Period 3 documents.
Corp, Dict = buildDTM(all_3)

# Final model trained with the parameters selected by the grid search.
model = models.ldamodel.LdaModel(
    corpus=Corp,
    id2word=Dict,
    num_topics=NUM_TOPICS,
    passes=PASSES,
    alpha=ALPHA,
    eta=ETA,
    random_state=random_state,
)

# Export the topic words (topicWords is a project helper; its result exposes
# to_csv, so presumably a DataFrame -- confirm in ldaModeling).
topicdf = topicWords(model, NUM_TOPIC_WORDS)
topicdf.to_csv(RESULT_3+'[Period 3] topic words.csv', index=False)

# Visualization object from the project helper; shown as the cell output.
data = visualizeLDA(model, Corp, Dict)

data