## POC
Proof of concept based on the tutorial *Topic Modelling in Python with NLTK and Gensim*: <https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21>

In [28]:
import pickle
from gensim import corpora
import gensim
import pandas as pd
from string import punctuation as en_punc
from time import time

In [8]:
# Toy corpus for the proof of concept: one inner list per document, each
# holding that document's word tokens.
text_data = [
    list(tokens)
    for tokens in (
        ('နေကောင်း', 'လား'),
        ('သွား', 'စား', 'မယ်'),
        ('ဖုန်း', 'ပြော', 'နေ', 'တယ်'),
    )
]

In [9]:
# Build the token <-> integer-id mapping from the tokenised documents, then
# encode every document in bag-of-words form with that mapping.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts. The context manager closes the pickle file as soon
# as the dump finishes; passing a bare open() to pickle.dump() leaves the
# handle open until the garbage collector happens to reclaim it.
with open('aux/corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('aux/dictionary.gensim')

In [13]:
NUM_TOPICS = 5
# LDA training is stochastic; pinning the seed makes the topics printed below
# repeatable from one run of this cell to the next.
RANDOM_STATE = 42

# Train an LDA model on the toy corpus and save it to disk.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_STATE,
)
ldamodel.save('aux/model5.gensim')

# Print the reported topics, four words per topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.176*"မယ်" + 0.176*"သွား" + 0.176*"စား" + 0.176*"နေကောင်း"')
(1, '0.111*"နေကောင်း" + 0.111*"လား" + 0.111*"စား" + 0.111*"သွား"')
(2, '0.207*"ပြော" + 0.207*"ဖုန်း" + 0.207*"နေ" + 0.207*"တယ်"')
(3, '0.111*"လား" + 0.111*"နေကောင်း" + 0.111*"စား" + 0.111*"မယ်"')
(4, '0.111*"လား" + 0.111*"နေကောင်း" + 0.111*"စား" + 0.111*"သွား"')


### POC DONE

## LDA

In [34]:
# Load the stop-word list, one entry per line (surrounding whitespace removed,
# blank lines skipped). The encoding is explicit because open() otherwise
# falls back to the platform's locale encoding, which is not guaranteed to
# decode non-ASCII entries.
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open('aux/stop-words.txt', encoding='utf-8') as f:
    STOP_WORDS = [line.strip() for line in f if line.strip()]

def tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    Assumes the text is already word-segmented, with the words separated
    by spaces.
    """
    tokens = text.split()
    return tokens

def prepare_text_for_lda(text):
    """Turn one document string into the token list that is fed to LDA.

    Every character in ``string.punctuation`` (ASCII punctuation only) is
    deleted, the remainder is split with ``tokenize``, and tokens listed in
    ``STOP_WORDS`` are dropped. Punctuation outside ``string.punctuation``
    is left untouched.

    Args:
        text: The document as a single string.

    Returns:
        The surviving tokens as a list, in their original order.
    """
    text = text.translate(str.maketrans('', '', en_punc))
    # Hash-based membership test: checking each token against the STOP_WORDS
    # list directly would rescan the whole list for every token.
    stop_words = frozenset(STOP_WORDS)
    return [token for token in tokenize(text) if token not in stop_words]

In [36]:
# Read the title and body columns of each tab-separated source file.
irr, ele, miz, voi, dvb = (
    pd.read_csv(path, sep='\t', usecols=['wb_title', 'wb_body'])
    for path in (
        'data/clean_wb/irr.csv',
        'data/clean_wb/ele.csv',
        'data/clean_wb/miz.csv',
        'data/clean_wb/voi.csv',
        'data/clean_wb/dvb.csv',
    )
)

# Preprocess every article body, keeping the sources in the order above.
text_data = [
    prepare_text_for_lda(body)
    for frame in (irr, ele, miz, voi, dvb)
    for body in frame.wb_body.values
]
print(len(text_data))

250


In [37]:
# Build the token <-> integer-id mapping from all preprocessed documents,
# then encode every document in bag-of-words form with that mapping.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts. The context manager closes the pickle file as soon
# as the dump finishes; passing a bare open() to pickle.dump() leaves the
# handle open until the garbage collector happens to reclaim it.
with open('aux/all_body_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('aux/all_body_dictionary.gensim')

In [40]:
# Train a 10-topic LDA model on the full corpus and print its topics.
# The elapsed time printed at the end covers training, saving and printing.
t0 = time()
NUM_TOPICS = 10
# LDA training is stochastic; pinning the seed makes the topics printed below
# repeatable from one run of this cell to the next.
RANDOM_STATE = 42
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_STATE,
)
# NOTE(review): the file name says "model5" although this model is trained
# with NUM_TOPICS = 10. The path is left unchanged in case other code loads
# it - consider renaming both sides together.
ldamodel.save('aux/all_body_model5.gensim')

# Print the reported topics, six words per topic.
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)
print(time() - t0)

(0, '0.014*"အလုပ်သမား" + 0.006*"ကန်" + 0.006*"ကျောက်ကပ်" + 0.006*"ဘတ်" + 0.006*"ဖို့" + 0.006*"လို့"')
(1, '0.015*"တစ်" + 0.012*"ဆို" + 0.009*"လို့" + 0.008*"ပဲ" + 0.008*"သွား" + 0.007*"နှစ်"')
(2, '0.010*"နိုင်" + 0.009*"ထား" + 0.008*"ပေး" + 0.007*"ဟု" + 0.007*"ကုန်" + 0.007*"အတွက်"')
(3, '0.027*"႔" + 0.021*"တြ" + 0.014*"တယ္" + 0.012*"န" + 0.010*"မွာ" + 0.009*"ၾ"')
(4, '0.016*"ယ" + 0.009*"တ" + 0.008*"လေး" + 0.007*"တစ်" + 0.007*"ဘူး" + 0.007*"ဆို"')
(5, '0.010*"ပါတီ" + 0.010*"တစ်" + 0.009*"ဟု" + 0.009*"နိုင်" + 0.008*"မည်" + 0.007*"ဖြင့်"')
(6, '0.010*"ရာခိုင်နှုန်း" + 0.009*"နှစ်" + 0.009*"ဒသမ" + 0.008*"နှုန်း" + 0.007*"ဆန်" + 0.007*"ခုနှစ်"')
(7, '0.028*"ဆို" + 0.021*"လို့" + 0.017*"ရင်" + 0.017*"ဘူး" + 0.016*"ပဲ" + 0.014*"မယ်"')
(8, '0.023*"တရုတ်" + 0.018*"နိုင်ငံ" + 0.013*"တ" + 0.009*"နိုင်" + 0.009*"မြန်မာနိုင်ငံ" + 0.008*"စီမံကိန်း"')
(9, '0.029*"င" + 0.025*"တ" + 0.023*"ဈ" + 0.023*"သ" + 0.020*"န" + 0.016*"တှ"')
10.42266058921814


In [39]:
# Sample topic extraction for first title
new_doc = irr.wb_title[0]
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc)
print(ldamodel.get_document_topics(new_doc_bow))

['တရုတ်', 'သမ္မတ', 'ခရီးစဉ်', 'သမုဒ္ဒရာ', '၂', 'စင်း', 'သေနင်္ဂဗျူဟာ', 'တွန်းအားပေး', 'ရည်ရွယ်']
[(0, 0.01000337), (1, 0.010003742), (2, 0.90996426), (3, 0.010003752), (4, 0.010003204), (5, 0.010004098), (6, 0.010002954), (7, 0.010004617), (8, 0.010006982), (9, 0.010003059)]


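- Each tuple in the list is `(topic_id, probability)`: a topic index and the probability that the document belongs to that topic. Here topic 2 dominates with a probability of about 0.91, while every other topic receives about 0.01.
- Note: this cell was executed (`In [39]`) before the training cell above was last run (`In [40]`), so these topic ids may refer to an earlier training run rather than to the topic listing printed above.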