# English Data Topic Modeling Using `LDA`

## Modeul Import

In [23]:
# self defined Modules
from myModules.utils.data.DataLoader import DataLoader
from myModules.utils.merge.mergeOverPeriod import merge
from myModules.preprocess import cleaning, tagging, removeStopWords_ST, tokenizing_ST, extract_some_pos_ST, keras_tokenizer
from myModules.TopicModeling.LDA.ldaModeling import buildDTM, topicWords, visualizeLDA

# General Modules
import pandas as pd
import numpy as np
import re

import warnings
from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

# Read File
import glob

# NLP
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import models
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline

## Data Load

In [2]:
DATA_ROOT = './Data/3구간/'

PERIOD_1 = DATA_ROOT + '1시기/1시기_ST/'
PERIOD_2 = DATA_ROOT + '2시기/2시기_ST/'
PERIOD_3 = DATA_ROOT + '3시기/3시기_ST/'

RESULT_ROOT = './Result/3구간/'

RESULT_1 = RESULT_ROOT + '/1시기/ST/'
RESULT_2 = RESULT_ROOT + '/2시기/ST/'
RESULT_3 = RESULT_ROOT + '/3시기/ST/'

In [3]:
files_1 = glob.glob(PERIOD_1+'*.txt')
files_2 = glob.glob(PERIOD_2+'*.txt')
files_3 = glob.glob(PERIOD_3+'*.txt')

texts_1 = DataLoader(files_1, mode='ST')
texts_2 = DataLoader(files_2, mode='ST')
texts_3 = DataLoader(files_3, mode='ST')

## PreProcess

### 3-1. Data Cleaning

- nltk.tokenize의 word_tokenize 는 's를 자동으로 분류해줌
- 탐색 결과 어퍼스트로피로 끝나는 단어와 어퍼스트로피로 시작하는 단어도 존재
- 따라서 이렇게 어퍼스트로피로 시작하거나 끝나는 단어는 특수문자를 제거
- 's 는 삭제하지 않음
- "'"와 "."는 cleaning시 제거하지 않고, 나머지 특수문자는 공백으로 제거
    - "."는 u.s 나 u.s.s.r 과 같은 약자를 구분하기 위해 제거하지 않음
    - nltk.tokenize의 word_tokenize 가 u.s. 를 u.s와 .로 구분해주기에 us와 u.s.를 구분할 수 있을 것이라고 기대할 수 있음.

In [4]:
cleaned_1 = cleaning(texts_1, mode='ST')
cleaned_2 = cleaning(texts_2, mode='ST')
cleaned_3 = cleaning(texts_3, mode='ST')

### 3-1. tokenizing

- nltk.tokenize의 word_tokenized를 이용하여 tokenizing 수행
- 어퍼스트로피와 마침표를 제거하지 않고 넣어줬으므로, 소유격등 특수한 어휘의 특성을 유지하기위해 어퍼스트로피를 유지할 token을 설정
- 어퍼스트로피를 유지할 Token외에는 모든 특수문자를 제거
    - 이때 u.s를 위해 .는 유지

In [5]:
def test_apostrophe(data):
    startwith = []
    endwith = []

    for tokenized in data:
        for token in tokenized:
            if token.endswith("'"): endwith.append(token)
            if token.startswith("'"): startwith.append(token)

    set_start = set(startwith)
    set_end = set(endwith)

    print(f"Tokenize 이후 ' 로 시작하는 token : \n{set_start}")
    print(f"Tokneize 이후 ' 로 끝나는 token : \n{set_end}")

def remove_apostrophe(data, exception):
    result = []

    for tokenized in data:
        tokens = []
        for token in tokenized:
            if token not in exception:
                tokens.append(re.sub("[^a-z]", "", token))
            else : tokens.append(token)
        result.append(tokens)
    
    return result

def remove_special(data, exception):
    not_alnum = []
    result = []

    for tokens in data:
        for token in tokens:
            if not token.isalnum() : not_alnum.append(token)
    
    remove_list = set(not_alnum)
    exception = set(exception)
    remove_list = remove_list.difference(exception)

    print(f"Removed : {remove_list}")

    for article in data:
        tokens = []
        for token in article:
            if token not in remove_list:
                tokens.append(token)
        result.append(tokens)
    
    return result

In [6]:
tokenized_1 = [word_tokenize(text) for text in cleaned_1]
tokenized_2 = [word_tokenize(text) for text in cleaned_2]
tokenized_3 = [word_tokenize(text) for text in cleaned_3]

#### Period 1

In [7]:
test_apostrophe(tokenized_1)

Tokenize 이후 ' 로 시작하는 token : 
{"'s", "'d", "'german", "'m", "'blamed", "'ve", "'into", "'structure", "'madam", "'are", "'heat", "'democracy", "'system", "'mvd", "'ll", "'liberty", "'"}
Tokneize 이후 ' 로 끝나는 token : 
{"'"}


`'ll`, `'d`, `'s`, `'ve` 가 아니면 `'`를 모두 제거

In [8]:
exception = ["'ll", "'d", "'s", "'ve"]

tokenized_1_ = remove_apostrophe(tokenized_1, exception)

In [9]:
test_apostrophe(tokenized_1_)

Tokenize 이후 ' 로 시작하는 token : 
{"'s", "'ll", "'ve", "'d"}
Tokneize 이후 ' 로 끝나는 token : 
set()


In [10]:
tokenized_1_ = remove_special(tokenized_1_, exception)

Removed : {''}


#### Period 2

In [11]:
test_apostrophe(tokenized_2)

Tokenize 이후 ' 로 시작하는 token : 
{"'s", "'m", "'reprisals", "'for", "'"}
Tokneize 이후 ' 로 끝나는 token : 
{"'"}


`'s`가 아니면 `'`를 모두 제거

In [12]:
exception = ["'s"]

tokenized_2_ = remove_apostrophe(tokenized_2, exception)

In [13]:
test_apostrophe(tokenized_2_)

Tokenize 이후 ' 로 시작하는 token : 
{"'s"}
Tokneize 이후 ' 로 끝나는 token : 
set()


In [14]:
tokenized_2_ = remove_special(tokenized_2_, exception)

Removed : {''}


#### Peiod 3

In [15]:
test_apostrophe(tokenized_1)

Tokenize 이후 ' 로 시작하는 token : 
{"'s", "'d", "'german", "'m", "'blamed", "'ve", "'into", "'structure", "'madam", "'are", "'heat", "'democracy", "'system", "'mvd", "'ll", "'liberty", "'"}
Tokneize 이후 ' 로 끝나는 token : 
{"'"}


`'d`, `'s`, `'ve`, `'ll`를 제외하고는 모두 `'` 제거

In [16]:
exception = ["'d", "'s", "'ve", "'ll"]

tokenized_3_ = remove_apostrophe(tokenized_3, exception)

In [17]:
test_apostrophe(tokenized_3_)

Tokenize 이후 ' 로 시작하는 token : 
{"'s", "'ll", "'ve", "'d"}
Tokneize 이후 ' 로 끝나는 token : 
set()


In [18]:
tokenized_3_ = remove_special(tokenized_3_, exception)

Removed : {''}


### 3-3. Remove StopWords

In [19]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords = nltk.corpus.stopwords.words('english')
new_stopwords = ['would', 'could', 'might', 'need', 'can', 'must', \
    'e', 'one', 'two', 'upon', 'may', 'perhaps', 'living', 'seem', 'also', 'ii', 'ofthe',
    'also', 'much', 'therefore', 'u', 's']

wo_stopword_1 = removeStopWords_ST(tokenized_1_, stopwords, new_stopwords)
wo_stopword_2 = removeStopWords_ST(tokenized_2_, stopwords, new_stopwords)
wo_stopword_3 = removeStopWords_ST(tokenized_3_, stopwords, new_stopwords)

### 3-4. Lemmatization

In [27]:
def lemmatizing(data, lemmatizer):
    result = []

    for article in data:
        result.append([lemmatizer.lemmatize(token) for token in article])
    
    return result

In [28]:
lemmatizer = WordNetLemmatizer()

lemmatized_1 = lemmatizing(wo_stopword_1, lemmatizer)
lemmatized_2 = lemmatizing(wo_stopword_2, lemmatizer)
lemmatized_3 = lemmatizing(wo_stopword_3, lemmatizer)

### 3-5. Tagging

In [32]:
tagList = [['noun', ['NN','NNS','NNP','NNPS'], ['NNG','NNB','NNP','NNM']], \
    ['pronoun', ['PRP','WP','PRP'], ['NP']],
    ['verb', ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'], ['VV', 'VXV', 'VCP']],
    ['adjective', ['JJ', 'JJR', 'JJS'], ['VA', 'VXA', 'VCN']],
    ['adverb', ['RB', 'RBR', 'RBS', 'WRB', 'EX', 'RP'], ['MAG']],
    ['prep&conj', ['TO', 'IN', 'CC'], ['MAC']],
    ['determiner', ['DT', 'PDT', 'WDT'], ['MDT', 'MDN']],
    ['interjection',['UH'], ['IC']],
    ['number', ['CD'], ['NR', 'ON']],
    ['foreignW', ['FW'],['OL']],
    ['modal',['MD'],[]],
    ['josa', [], ['JC', 'JK', 'JKC', 'JKG', 'JKI', 'JKM', 'JKO', 'JKQ', 'JKS', 'JX']],
    ['possesiveS', ['POS'], []],
    ['others',['LS'], ['EPH', 'EPT', 'EPP', 'EFN', 'EFQ', 'EFO', 'EFA', 'EFI', 'EFR', 'ECE', 'ECD', 'ECS', 'ETN', 'ETD', 'XPN', 'XPV', 'XSN', 'XSV', 'XSA', 'XR', 'UN', 'OH']]]

tagList = pd.DataFrame(tagList)
tagList.columns = ['POS', 'Eng_tag', 'Kor_tag']

In [33]:
tagged_1 = tagging(lemmatized_1, mode='ST')
tagged_2 = tagging(lemmatized_2, mode='ST')
tagged_3 = tagging(lemmatized_3, mode='ST')

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [None]:
extracted_1 = extract_some_pos_ST(articles=tagged_1, tagList=tagList, pos_list=['noun', 'verb', 'adjective'])
extracted_2 = extract_some_pos_ST(articles=tagged_2, tagList=tagList, pos_list=['noun', 'verb', 'adjective'])
extracted_3 = extract_some_pos_ST(articles=tagged_3, tagList=tagList, pos_list=['noun', 'verb', 'adjective'])

## 4. Topic Modeling

### 4-1. parameter tuning & LDA modeling

- topic num : 가설로 설정한 topic의 갯수
    1. Topic Coherence
        - 주제의 일관성 측정
        - 모델링이 잘 될수록 한 주제 안에는 의미론적으로 유사한 단어가 많이 모여있게 됨.
        - 높을수록 의미론적 일관성이 높다.
        - Coherence가 높아지면 Monotonic 해지는 문제점이 생긴다.
        - coherence가 너무 높아지면 정보의 양이 줄어들고, coherence가 너무 낮으면 정보들의 연관성이 없어져 분석의 의미가 없다.
    2. Perplexity   
        - Coherence가 이 data에서 topic number가 늘어날수록 거의 같이 늘어나는 경향을 보임
        - 따라서 다른 평가기준도 함께 고려해야겠다는 생각에 추가
        - 작아질수록 토픽모델이 문서를 잘 반영한다.

- lda modeling 결과를 시각화해 보았을 때, 10이상으로 넘어가면 할당되지 않는 빈 id들이 발견되었습니다.
    - 따라서 시험할 k값의 범위를 1~10까지 자연수로 설정하였습니다.


In [None]:
NUM_TOPIC_WORDS = 30
PASSES = 30
ITERATIONS = 400
EVAL_EVERY = None
random_state = 42

In [None]:
class BestLDAPram:
    def __init__(self, passes=30, iterations=400, eval_every=None, random_state=42):
        self.passes = passes
        self.iterations = iterations
        self.eval_every = eval_every
        self.random_state = random_state
    
    def calc_coherence(self, corpus, dictionary, alpha='auto', eta='auto'):
        self.coherences = []
        self.num_topics_c = []

        for ntopics in tqdm(range(1, 11), desc='Topic Coherence'):
            model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=ntopics, \
                iterations=self.iterations, passes=self.passes, \
                alpha=alpha, eta=eta, eval_every=self.eval_every, random_state=self.random_state)
            
            cm = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
            coherence = cm.get_coherence()

            self.coherences.append(coherence)
            self.num_topics_c.append(ntopics)
    
    def calc_perplexity(self, corpus, dictionary, alpha='auto', eta='auto'):
        self.perplexities = []
        self.num_topics_p = []

        for ntopics in tqdm(range(1, 11), desc='Perpelxity'):
            model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=ntopics, \
                iterations=self.iterations, passes=self.passes, \
                alpha=alpha, eta=eta, eval_every=self.eval_every, random_state=self.random_state)
            
            perplexity = model.log_perplexity(corpus)
            self.perplexities.append(perplexity)
            self.num_topics_p.append(ntopics)
    
    def calc_alpha(self, corpus, dictionary, num_topics, eta='auto'):
        self.alphas = []
        self.coherences_a = []

        for i in tqdm(range(0, 11), desc='alpha'):
            alpha = i * 0.001
            model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, \
                iterations=self.iterations, passes=self.passes, \
                alpha=alpha, eta=eta, eval_every=self.eval_every, random_state=self.random_state)

            cm = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
            coherence = cm.get_coherence()

            self.coherences_a.append(coherence)
            self.alphas.append(alpha)
    
    def calc_eta(self, corpus, dictionary, num_topics, alpha='auto'):
        self.etas = []
        self.coherences_e = []

        for i in tqdm(range(0, 11), desc='eta'):
            eta = i * 0.01
            model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, \
                iterations=self.iterations, passes=self.passes, \
                alpha=alpha, eta=eta, eval_every=self.eval_every, random_state=self.random_state)
        
            cm = CoherenceModel(model=model, corpus=corpus, coherence='c_v')
            coherence = cm.get_coherence()

            self.coherences_e.append(coherence)
            self.etas.append(eta)
    
    def plot_coherence(self, title='Coherence per Topic Num', root='./'):
        plt.figure()

        plt.plot(self.num_topics_c, self.coherences)

        plt.xlabel('Topic Number')
        plt.ylabel('Coherence')

        plt.title(title)
        plt.savefig(root+title+'.png')
        plt.show()
    
    def plot_perplexity(self, title='Perplexity per Topic Num', root='./'):
        plt.figure()

        plt.plot(self.num_topics_p, self.perplexities)

        plt.xlabel('Topic Number')
        plt.ylabel('Perplexity')

        plt.title(title)
        plt.savefig(root+title+'.png')
        plt.show()

    def plot_alpha(self, title='Coherence per alpha', root='./'):
        plt.figure()

        plt.plot(self.alphas, self.coherences_a)

        plt.xlabel('alpha')
        plt.ylabel('coherence')

        plt.title(title)
        plt.savefig(root+title+'.png')
        plt.show()
    
    def plot_eta(self, title='Coherence per eta', root='./'):
        plt.figure()

        plt.plot(self.etas, self.coherences_e)

        plt.xlabel('eta')
        plt.ylabel('coherence')

        plt.title(title)
        plt.savefig(root+title+'.png')
        plt.show()

#### Period 1

In [None]:
lda_param = BestLDAPram(passes=PASSES, iterations=ITERATIONS, eval_every=EVAL_EVERY)

In [None]:
Corp, Dict = buildDTM(wo_stopword_1)

##### Coherence와 Perplexity로 최적의 topic number 구하기

In [None]:
lda_param.calc_coherence(corpus=Corp, dictionary=Dict)
lda_param.calc_perplexity(corpus=Corp, dictionary=Dict)

lda_param.plot_coherence('[Period 1] Coherence per Topic Number', root=RESULT_1)
lda_param.plot_perplexity('[Period 1] Perplexity per Topic Number', root=RESULT_1)

##### 최적의 Topic number를 이용하여 최적의 alpha와 eta값 추정하기

In [None]:
NUM_TOPICS = 9

lda_param.calc_alpha(corpus=Corp, dictionary=Dict, num_topics=NUM_TOPICS)
lda_param.calc_eta(corpus=Corp, dictionary=Dict, num_topics=NUM_TOPICS)

lda_param.plot_alpha(title='[Period 1] Coherence per alpha', root=RESULT_1)
lda_param.plot_eta(title='[Period 1] Coherence per eta', root=RESULT_1)

In [None]:
NUM_TOPICS = 9
ALPHA = 0.002
ETA = 0.07

Corp, Dict = buildDTM(wo_stopword_1)
model = models.ldamodel.LdaModel(Corp, id2word=Dict, num_topics=NUM_TOPICS, alpha=ALPHA, eta=ETA, random_state=random_state)
topicdf = topicWords(model, NUM_TOPIC_WORDS)
topicdf.to_csv(RESULT_1+'[Period 1] topic words.csv', index=False)
data = visualizeLDA(model, Corp, Dict)

data

#### Period 2

In [None]:
lda_param = BestLDAPram(passes=PASSES, iterations=ITERATIONS, eval_every=EVAL_EVERY)

In [None]:
Corp, Dict = buildDTM(wo_stopword_2)

##### Coherence와 Perplexity로 최적의 topic number 구하기

In [None]:
lda_param.calc_coherence(corpus=Corp, dictionary=Dict)
lda_param.calc_perplexity(corpus=Corp, dictionary=Dict)

lda_param.plot_coherence('[Period 2] Coherence per Topic Number', root=RESULT_2)
lda_param.plot_perplexity('[Period 2] perplexity per Topic Number', root=RESULT_2)


In [None]:
NUM_TOPICS = 8

lda_param.calc_alpha(corpus=Corp, dictionary=Dict, num_topics=NUM_TOPICS)
lda_param.calc_eta(corpus=Corp, dictionary=Dict, num_topics=NUM_TOPICS)

lda_param.plot_alpha('[Period 2] Coherence per alpha', root=RESULT_2)
lda_param.plot_eta('[Period 2] Coherence per eta', root=RESULT_2)

In [None]:
NUM_TOPICS = 8
ALPHA = 0.002
ETA = 0.05

Corp, Dict = buildDTM(wo_stopword_2)
model = models.ldamodel.LdaModel(Corp, id2word=Dict, num_topics=NUM_TOPICS, alpha=ALPHA, eta=ETA, random_state=random_state)
topicdf = topicWords(model, NUM_TOPIC_WORDS)
topicdf.to_csv(RESULT_2+'[Period 2] topic words.csv', index=False)
data = visualizeLDA(model, Corp, Dict)

data

#### Period 3

In [None]:
lda_param = BestLDAPram(passes=PASSES, iterations=ITERATIONS, eval_every=EVAL_EVERY)

In [None]:
Corp, Dict = buildDTM(wo_stopword_3)

In [None]:
lda_param.calc_coherence(corpus=Corp, dictionary=Dict)
lda_param.calc_perplexity(corpus=Corp, dictionary=Dict)

lda_param.plot_coherence('[Period 3] Coherence per Topic Number', root=RESULT_3)
lda_param.plot_perplexity('[Period 3] perplexity per Topic Number', root=RESULT_3)

In [None]:
NUM_TOPICS = 5

lda_param.calc_alpha(corpus=Corp, dictionary=Dict, num_topics=NUM_TOPICS)
lda_param.calc_eta(corpus=Corp, dictionary=Dict, num_topics=NUM_TOPICS)

lda_param.plot_alpha(title='[Period 3] Coherence per alpha', root=RESULT_3)
lda_param.plot_eta(title='[Period 3] Coherence per eta', root=RESULT_3)

In [None]:
NUM_TOPICS = 5
ALPHA = 0.005
ETA = 0.04

Corp, Dict = buildDTM(wo_stopword_3)
model = models.ldamodel.LdaModel(Corp, id2word=Dict, num_topics=NUM_TOPICS, alpha=ALPHA, eta=ETA, random_state=random_state)
topicdf = topicWords(model, NUM_TOPIC_WORDS)
topicdf.to_csv(RESULT_3+'[Period 3] topic words.csv', index=False)
data = visualizeLDA(model, Corp, Dict)

data