In [1]:
from sylseg import syllable_break
import json
from tqdm import tqdm
import pandas as pd

In [2]:
with open('aux/common-words.txt') as f:
    common_words = [l.strip() for l in f]
with open('aux/dict-words.txt') as f:
    dict_words = [l.strip() for l in f]
with open('aux/stop-words.txt') as f:
    stop_words = [l.strip() for l in f]
print(len(common_words), len(dict_words), len(stop_words))

26171 32885 275


In [12]:
# Transpiled from https://github.com/eimg/myanmar-text-breaker/blob/master/word-breaker.js
_input = 'နေကောင်း လား'

# This code is also in `wordseg.py`
# This rule-based word segmentation is functional though it has many weaknesses.
# The algorithm uses maximum matching, where the longest word is
# matched first from the dictionary and the length is lowered 1by1
# until finally length is 0 (not found in dictionary),
# when we declare the syllable to be its own word.
def wb(_input):
    _input = syllable_break(str(_input).replace(' ', ''))

    result = []
    offset = 0
    LIMIT = 7

    while offset < len(_input):
        chunk_end = offset + LIMIT
        chunk_found = False

        for i in range(chunk_end, offset, -1):
            chunk = ''.join(_input[offset:i])

            if chunk in dict_words or chunk in common_words or chunk in stop_words:
                # Found the word in data
                chunk_found = True
                result.append(chunk)

                # Resetting offset to resume
                offset = i
                break


        # Didn't found the word of any
        # long-short combination in the chunk
        if not chunk_found:
            # Now, the current syllable is a word
            result.append(_input[offset])
            offset += 1
    return result

In [5]:
print(wb(_input))

['နေကောင်း', 'လား']


## Fetch News Data, Word Segment and Save for Later Use
- This takes about 30 minutes, so we persist the transformed data.

In [6]:
irr = pd.read_csv('data/clean/irr.csv', sep='\t')
ele = pd.read_csv('data/clean/ele.csv', sep='\t')
miz = pd.read_csv('data/clean/miz.csv', sep='\t')
voi = pd.read_csv('data/clean/voi.csv', sep='\t')
dvb = pd.read_csv('data/clean/dvb.csv', sep='\t')

In [7]:
irr['wb_title'] = [' '.join(wb(t)) for t in tqdm(irr.title)]
irr['wb_body'] = [' '.join(wb(t)) for t in tqdm(irr.body)]

100%|██████████| 50/50 [00:03<00:00, 13.09it/s]
100%|██████████| 50/50 [08:03<00:00,  9.67s/it]


In [8]:
irr.to_csv('data/clean_wb/irr.csv', sep='\t', index=False)

In [9]:
ele['wb_title'] = [' '.join(wb(t)) for t in tqdm(ele.title)]
ele['wb_body'] = [' '.join(wb(t)) for t in tqdm(ele.body)]
ele.to_csv('data/clean_wb/ele.csv', sep='\t', index=False)

100%|██████████| 50/50 [00:08<00:00,  6.19it/s]
100%|██████████| 50/50 [03:34<00:00,  4.29s/it]


In [10]:
miz['wb_title'] = [' '.join(wb(t)) for t in tqdm(miz.title)]
miz['wb_body'] = [' '.join(wb(t)) for t in tqdm(miz.body)]
miz.to_csv('data/clean_wb/miz.csv', sep='\t', index=False)

100%|██████████| 50/50 [00:04<00:00, 11.57it/s]
100%|██████████| 50/50 [03:09<00:00,  3.78s/it]


In [13]:
voi['wb_title'] = [' '.join(wb(t)) for t in tqdm(voi.title)]
voi['wb_body'] = [' '.join(wb(t)) for t in tqdm(voi.body)]
voi.to_csv('data/clean_wb/voi.csv', sep='\t', index=False)


  0%|          | 0/50 [00:00<?, ?it/s][A
  2%|▏         | 1/50 [00:00<00:07,  6.48it/s][A
  4%|▍         | 2/50 [00:00<00:07,  6.22it/s][A
  8%|▊         | 4/50 [00:00<00:06,  7.31it/s][A
 10%|█         | 5/50 [00:00<00:06,  7.10it/s][A
 14%|█▍        | 7/50 [00:00<00:04,  8.72it/s][A
 16%|█▌        | 8/50 [00:00<00:05,  7.80it/s][A
 22%|██▏       | 11/50 [00:01<00:04,  9.66it/s][A
 26%|██▌       | 13/50 [00:01<00:03, 10.91it/s][A
 30%|███       | 15/50 [00:01<00:03, 10.57it/s][A
 34%|███▍      | 17/50 [00:01<00:03, 10.55it/s][A
 38%|███▊      | 19/50 [00:01<00:02, 11.30it/s][A
 42%|████▏     | 21/50 [00:01<00:02, 12.28it/s][A
 46%|████▌     | 23/50 [00:02<00:02, 11.79it/s][A
 50%|█████     | 25/50 [00:02<00:02, 11.84it/s][A
 54%|█████▍    | 27/50 [00:02<00:01, 11.70it/s][A
 58%|█████▊    | 29/50 [00:02<00:01, 12.33it/s][A
 62%|██████▏   | 31/50 [00:02<00:01, 10.07it/s][A
 68%|██████▊   | 34/50 [00:02<00:01, 12.26it/s][A
 72%|███████▏  | 36/50 [00:03<00:01, 12.18it/

In [14]:
dvb['wb_title'] = [' '.join(wb(t)) for t in tqdm(dvb.title)]
dvb['wb_body'] = [' '.join(wb(t)) for t in tqdm(dvb.body)]
dvb.to_csv('data/clean_wb/dvb.csv', sep='\t', index=False)


  0%|          | 0/50 [00:00<?, ?it/s][A
  4%|▍         | 2/50 [00:00<00:03, 15.88it/s][A
  8%|▊         | 4/50 [00:00<00:02, 15.83it/s][A
 12%|█▏        | 6/50 [00:00<00:02, 16.03it/s][A
 16%|█▌        | 8/50 [00:00<00:02, 15.49it/s][A
 22%|██▏       | 11/50 [00:00<00:02, 16.84it/s][A
 26%|██▌       | 13/50 [00:00<00:02, 15.54it/s][A
 30%|███       | 15/50 [00:00<00:02, 16.05it/s][A
 34%|███▍      | 17/50 [00:01<00:01, 16.69it/s][A
 38%|███▊      | 19/50 [00:01<00:01, 17.33it/s][A
 42%|████▏     | 21/50 [00:01<00:01, 16.71it/s][A
 46%|████▌     | 23/50 [00:01<00:01, 15.88it/s][A
 50%|█████     | 25/50 [00:01<00:01, 16.53it/s][A
 56%|█████▌    | 28/50 [00:01<00:01, 16.68it/s][A
 60%|██████    | 30/50 [00:01<00:01, 16.66it/s][A
 64%|██████▍   | 32/50 [00:02<00:01, 13.57it/s][A
 68%|██████▊   | 34/50 [00:02<00:01, 13.44it/s][A
 72%|███████▏  | 36/50 [00:02<00:01, 12.35it/s][A
 76%|███████▌  | 38/50 [00:02<00:00, 12.14it/s][A
 80%|████████  | 40/50 [00:02<00:00, 12.85i