In [1]:
import os, sys
sys.path.append('../')

import datasets
import pandas as pd
import glob
import string
from collections import Counter
from panlex_utils import load_panlex_resources, extract_monolingual_lexicon

In [2]:
# Load Wiki
wiki_dsets = {}
for lang in ['bug', 'mad', 'jv', 'min', 'su', 'map-bms', 'ace', 'gor', 'ban', 'bjn', 'nia']:
    if lang == 'su':
        iso_lang = 'sun'
    elif lang == 'jv':
        iso_lang = 'jav'
    elif lang == 'id':
        iso_lang = 'ind'
    elif lang == 'en':
        iso_lang = 'eng'
    else:
        iso_lang = lang
    print(f'processing {lang} {iso_lang}')
    wiki_dsets[iso_lang] = datasets.load_dataset('olm/wikipedia', language=lang, date="20221220")['train'].to_pandas()

processing bug bug


Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.bug-date=20221220,language=bug/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)


  0%|          | 0/1 [00:00<?, ?it/s]

processing mad mad


Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.mad-date=20221220,language=mad/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)


  0%|          | 0/1 [00:00<?, ?it/s]

processing jv jav


Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.jv-date=20221220,language=jv/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)


  0%|          | 0/1 [00:00<?, ?it/s]

processing min min


Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.min-date=20221220,language=min/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)


  0%|          | 0/1 [00:00<?, ?it/s]

processing su sun


Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.su-date=20221220,language=su/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)


  0%|          | 0/1 [00:00<?, ?it/s]

processing map-bms map-bms


Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.map-bms-date=20221220,language=map-bms/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)


  0%|          | 0/1 [00:00<?, ?it/s]

processing ace ace


Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.ace-date=20221220,language=ace/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)


  0%|          | 0/1 [00:00<?, ?it/s]

processing gor gor


Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.gor-date=20221220,language=gor/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)


  0%|          | 0/1 [00:00<?, ?it/s]

processing ban ban


Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.ban-date=20221220,language=ban/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)


  0%|          | 0/1 [00:00<?, ?it/s]

processing bjn bjn


Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.bjn-date=20221220,language=bjn/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)


  0%|          | 0/1 [00:00<?, ?it/s]

processing nia nia


Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.nia-date=20221220,language=nia/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Load Paragraph
paragraph_dsets = {}
for path in glob.glob('../data/nusa_alinea-paragraph-*.csv'):
    _, _, lang, _ = path[:-4].split('/')[-1].split('-')

    if lang not in paragraph_dsets:
        paragraph_dsets[lang] = []
    paragraph_dsets[lang].append(pd.read_csv(path))

for lang, dfs in paragraph_dsets.items():
    paragraph_dsets[lang] = pd.concat(dfs)

In [6]:
# Load MT
mt_dsets = {}
for path in glob.glob('../data/nusa_kalimat-mt-*.csv'):
    _, _, lang, _ = path[:-4].split('/')[-1].split('-')

    if lang not in mt_dsets:
        mt_dsets[lang] = []
    mt_dsets[lang].append(pd.read_csv(path))

for lang, dfs in mt_dsets.items():
    mt_dsets[lang] = pd.concat(dfs)
    mt_dsets[lang]['text'] = mt_dsets[lang]['tgt_text']

In [11]:
%%time

# Chunk & Count Text
wiki_counters = {}
paragraph_counters = {}
mt_counters = {}

all_langs = list(set(mt_dsets.keys()).union(set(paragraph_dsets.keys())).union(set(wiki_dsets.keys())))
for lang in all_langs:
    # Clean Text
    replacement_rules = str.maketrans('', '', string.punctuation)
    if lang in wiki_dsets:
        wiki_dsets[lang]['clean_text'] = wiki_dsets[lang]['text'].apply(lambda x: x.lower().translate(replacement_rules).replace('\n',' '))
    if lang in paragraph_dsets:
        paragraph_dsets[lang]['clean_text'] = paragraph_dsets[lang]['text'].apply(lambda x: x.lower().translate(replacement_rules).replace('\n',' '))
    if lang in mt_dsets:
        mt_dsets[lang]['clean_text'] = mt_dsets[lang]['text'].apply(lambda x: x.lower().translate(replacement_rules).replace('\n',' '))
        
    # Chunk & Count Text
    wiki_counter = Counter()
    paragraph_counter = Counter()
    mt_counter = Counter()
    
    if lang in wiki_dsets:
        for ct in wiki_dsets[lang]['clean_text']:
            for word in ct.split(' '):
                if len(word) > 0:
                    wiki_counter[word] += 1

    if lang in paragraph_dsets:
        for ct in paragraph_dsets[lang]['clean_text']:
            for word in ct.split(' '):
                if len(word) > 0:
                    paragraph_counter[word] += 1

    if lang in mt_dsets:
        for ct in mt_dsets[lang]['clean_text']:
            for word in ct.split(' '):
                if len(word) > 0:
                    mt_counter[word] += 1
                    
    wiki_counters[lang] = wiki_counter
    paragraph_counters[lang] = paragraph_counter
    mt_counters[lang] = mt_counter

CPU times: user 31.4 s, sys: 303 ms, total: 31.7 s
Wall time: 31.7 s


### General Statistics

In [12]:
# Number of document
for lang in all_langs:
    wiki_doc, para_doc, mt_doc = 0, 0, 0
    
    if lang in wiki_dsets:
        wiki_doc = len(wiki_dsets[lang])
    if lang in paragraph_dsets:
        para_doc = len(paragraph_dsets[lang])
    if lang in mt_dsets:
        mt_doc = len(mt_dsets[lang])
    
    print(f'{lang} {wiki_doc} {para_doc} {mt_doc}')    

mui 0 1474 1574
jav 72737 10188 9449
btk 0 4908 9449
bjn 10122 0 0
gor 14389 0 0
min 226237 8608 9449
bhp 0 0 1579
map-bms 13575 0 0
ace 12829 0 0
bug 15866 1000 0
mad 1002 5211 9449
sun 61331 9594 9449
nia 1620 0 0
ban 17161 0 0
rej 0 1200 1574
abs 0 0 1574
bew 0 9755 9449
mak 0 5471 9449


In [13]:
# Number of tokens
for lang in all_langs:
    wiki_num, para_num, mt_num = 0, 0, 0
    
    if lang in wiki_dsets:
        wiki_num = sum(wiki_counters[lang].values())
    if lang in paragraph_dsets:
        para_num = sum(paragraph_counters[lang].values())
    if lang in mt_dsets:
        mt_num = sum(mt_counters[lang].values())
    print(f'{lang} {wiki_num} {para_num} {mt_num}')

mui 0 182632 36233
jav 8519436 1116678 208034
btk 0 562947 214165
bjn 680176 0 0
gor 603073 0 0
min 12623423 960961 211084
bhp 0 0 32988
map-bms 548796 0 0
ace 489036 0 0
bug 287823 118392 0
mad 106335 573833 211503
sun 5514173 1111848 209492
nia 246643 0 0
ban 1691041 0 0
rej 0 156900 34668
abs 0 0 37816
bew 0 1164932 209867
mak 0 609262 191528


In [14]:
# Number of tokens / document
for lang in all_langs:
    wiki_num, para_num, mt_num = 0, 0, 0
    wiki_doc, para_doc, mt_doc = 1, 1, 1
    
    if lang in wiki_dsets:
        wiki_num = sum(wiki_counters[lang].values())
        wiki_doc = len(wiki_dsets[lang])
    if lang in paragraph_dsets:
        para_num = sum(paragraph_counters[lang].values())
        para_doc = len(paragraph_dsets[lang])
    if lang in mt_dsets:
        mt_num = sum(mt_counters[lang].values())
        mt_doc = len(mt_dsets[lang])
    
    print(f'{lang} {wiki_num / wiki_doc} {para_num / para_doc} {mt_num / mt_doc}')

mui 0.0 123.9023066485753 23.01969504447268
jav 117.12657932001595 109.60718492343933 22.016509683564397
btk 0.0 114.69987775061125 22.665361413906233
bjn 67.19778699861688 0.0 0.0
gor 41.912085620960454 0.0 0.0
min 55.79734084168372 111.63580390334573 22.339295163509366
bhp 0.0 0.0 20.89170360987967
map-bms 40.42696132596685 0.0 0.0
ace 38.119572842778084 0.0 0.0
bug 18.140867263330392 118.392 0.0
mad 106.12275449101796 110.11955478794857 22.383638480262462
sun 89.90841499404868 115.88993120700438 22.17081172610858
nia 152.24876543209876 0.0 0.0
ban 98.53977040964979 0.0 0.0
rej 0.0 130.75 22.025412960609913
abs 0.0 0.0 24.025412960609913
bew 0.0 119.41896463352127 22.21049846544608
mak 0.0 111.36209102540668 20.269658164885172


In [15]:
# Number of Unique Tokens
for lang in all_langs:
    wiki_len, para_len, mt_len = 0, 0, 0
    
    if lang in wiki_dsets:
        wiki_len = len(wiki_counters[lang])
    if lang in paragraph_dsets:
        para_len = len(paragraph_counters[lang])
    if lang in mt_dsets:
        mt_len = len(mt_counters[lang])
    print(f'{lang} {wiki_len} {para_len} {mt_len}')

mui 0 17847 8546
jav 483499 45190 28707
btk 0 37239 33514
bjn 82072 0 0
gor 45223 0 0
min 284476 46955 26980
bhp 0 0 7003
map-bms 57888 0 0
ace 36606 0 0
bug 17542 12909 0
mad 23085 42443 34422
sun 289629 47648 25831
nia 25927 0 0
ban 135264 0 0
rej 0 9895 9061
abs 0 0 5877
bew 0 60834 28681
mak 0 49036 35482


In [16]:
# Percentage of novel words
for lang in all_langs:
    wiki_words, para_words, mt_words = set(), set(), set()

    if lang in wiki_dsets:
        wiki_words = set(wiki_counters[lang].keys())
    if lang in paragraph_dsets:
        para_words = set(paragraph_counters[lang].keys())
    if lang in mt_dsets:
        mt_words = set(mt_counters[lang].keys())

    print('{} {} {} {}'.format(
        lang, len(wiki_words), 
        len(para_words - wiki_words) / (len(para_words) + 1),
        len(mt_words - wiki_words) / (len(mt_words) + 1)
    ))

mui 0 0.9999439713133124 0.9998829998829999
jav 483499 0.410258679825629 0.43667270447262085
btk 0 0.9999731471535983 0.999970162613755
bjn 82072 0.0 0.0
gor 45223 0.0 0.0
min 284476 0.4418817616492035 0.4901597420406953
bhp 0 0.0 0.9998572244431754
map-bms 57888 0.0 0.0
ace 36606 0.0 0.0
bug 17542 0.9061967467079783 0.0
mad 23085 0.8601451324097634 0.8637538854835429
sun 289629 0.38248441730151733 0.44758439145246204
nia 25927 0.0 0.0
ban 135264 0.0 0.0
rej 0 0.9998989490703315 0.9998896490840874
abs 0 0.0 0.9998298741068391
bew 0 0.9999835620941891 0.9999651349278293
mak 0 0.999979607235353 0.9999718174900657


In [17]:
# Percentage of novel words
for lang in all_langs:
    wiki_words, para_words, mt_words = set(), set(), set()

    if lang in wiki_dsets:
        wiki_words = set(wiki_counters[lang].keys())
    if lang in paragraph_dsets:
        para_words = set(paragraph_counters[lang].keys())
    if lang in mt_dsets:
        mt_words = set(mt_counters[lang].keys())

    print('{} {} {} {}'.format(
        lang, len(wiki_words), 
        len(wiki_words - para_words) / (len(wiki_words) + 1),
        len(wiki_words - mt_words) / (len(wiki_words) + 1)
    ))

mui 0 0.0 0.0
jav 483499 0.9448790072388832 0.9665522233712512
btk 0 0.0 0.0
bjn 82072 0.9999878157250253 0.9999878157250253
gor 45223 0.9999778878471608 0.9999778878471608
min 284476 0.9078765594406578 0.9516445969269923
bhp 0 0.0 0.0
map-bms 57888 0.9999827255609874 0.9999827255609874
ace 36606 0.9999726828202257 0.9999726828202257
bug 17542 0.9309696175112581 0.9999429972068631
mad 23085 0.742874469375379 0.7968465736810187
sun 289629 0.8984083140558644 0.9507302420329385
nia 25927 0.999961431656896 0.999961431656896
ban 135264 0.9999926071045725 0.9999926071045725
rej 0 0.0 0.0
abs 0 0.0 0.0
bew 0 0.0 0.0
mak 0 0.0 0.0


In [18]:
# Percentage of Unique Word
for lang in all_langs:
    wiki_words, para_words, mt_words = set(), set(), set()

    if lang in wiki_dsets:
        wiki_words = set(wiki_counters[lang].keys())
    if lang in paragraph_dsets:
        para_words = set(paragraph_counters[lang].keys())
    if lang in mt_dsets:
        mt_words = set(mt_counters[lang].keys())
        
    print('{} {} {} {} {}'.format(
        lang, len(wiki_words), 
        len(wiki_words) / (sum(list(wiki_counters[lang].values())) + 1),
        len(para_words) / (sum(list(paragraph_counters[lang].values())) + 1),
        len(mt_words) / (sum(list(mt_counters[lang].values())) + 1)
    ))

mui 0 0.0 0.0977205652866678 0.23585582601976043
jav 483499 0.05675245911202818 0.040468209754101224 0.13799120340327348
btk 0 0.0 0.06614998188109737 0.15648609022907464
bjn 82072 0.12066270985346461 0.0 0.0
gor 45223 0.07498748080666717 0.0 0.0
min 284476 0.022535565627836 0.04886249404242832 0.1278158087974039
bhp 0 0.0 0.0 0.2122828821728455
map-bms 57888 0.1054816261750702 0.0 0.0
ace 36606 0.07485323196404362 0.0 0.0
bug 17542 0.0609469675913058 0.10903516255184006 0.0
mad 23085 0.21709486909419198 0.07396389896729716 0.1627486950601407
sun 289629 0.05252445787891351 0.04285474016705506 0.1233024492465142
nia 25927 0.10511911905418336 0.0 0.0
ban 135264 0.07998855143751604 0.0 0.0
rej 0 0.0 0.06306524496338456 0.2613574086359572
abs 0 0.0 0.0 0.15540629875452838
bew 0 0.0 0.05222102902055311 0.13666209236281854
mak 0 0.0 0.08048412590293519 0.1852565407849464


In [19]:
# Percentage of Unique Word
for lang in all_langs:
    data = {}
    if lang in wiki_dsets:
        data['wiki_top'] = list(map(lambda x: x[0], wiki_counters[lang].most_common(100)))
        data['wiki_cnt'] = list(map(lambda x: x[1], wiki_counters[lang].most_common(100)))
    if lang in paragraph_dsets:
        data['para_top'] = list(map(lambda x: x[0], paragraph_counters[lang].most_common(100)))
        data['para_cnt'] = list(map(lambda x: x[1], paragraph_counters[lang].most_common(100)))       
    if lang in mt_dsets:
        data['mt_top'] = list(map(lambda x: x[0], mt_counters[lang].most_common(100)))
        data['mt_cnt'] = list(map(lambda x: x[1], mt_counters[lang].most_common(100)))
    pd.DataFrame(data).to_csv(f'{lang}_top100.csv', index=False)

### Lexicon Overlapping

In [21]:
%%time
# Load Lexicon Indonesia & English
if os.path.exists('ind_lexicon.zip') and os.path.exists('eng_lexicon.zip'):
    ind_lexicon = pd.read_pickle('ind_lexicon.zip')
    eng_lexicon = pd.read_pickle('eng_lexicon.zip')
else:
    langvar_df, expr_df, deno_df = load_panlex_resources('../resources/panlex-20230501-csv')
    
    ind_lexicon = extract_monolingual_lexicon('ind', langvar_df, expr_df)
    eng_lexicon = extract_monolingual_lexicon('eng', langvar_df, expr_df)
    
    ind_lexicon.to_pickle('ind_lexicon.zip')
    eng_lexicon.to_pickle('eng_lexicon.zip')

CPU times: user 1min 24s, sys: 4.98 s, total: 1min 29s
Wall time: 1min 29s


In [22]:
ind_words, eng_words = set(), set()
replacement_rules = str.maketrans('', '', string.punctuation)

for word in ind_lexicon['ind'].values:
    ind_words.add(word.lower().translate(replacement_rules))
for word in eng_lexicon['eng'].values:
    eng_words.add(word.lower().translate(replacement_rules))

In [24]:
min_threshold = 5
for lang in all_langs:
    wiki_words, para_words, mt_words = set(), set(), set()

    if lang in wiki_dsets:
        wiki_counter = {x: count for x, count in wiki_counters[lang].items() if count >= min_threshold}
        wiki_words = set(wiki_counter.keys())
    if lang in paragraph_dsets:
        paragraph_counter = {x: count for x, count in paragraph_counters[lang].items() if count >= min_threshold}
        para_words = set(paragraph_counter.keys())
    if lang in mt_dsets:
        mt_counter = {x: count for x, count in mt_counters[lang].items() if count >= min_threshold}
        mt_words = set(mt_counter.keys())

    print('{} {} {} {} {}'.format(
        lang, 'ind',
        len(wiki_words.intersection(ind_words)) / len(wiki_words) if len(wiki_words) > 0 else 0,
        len(para_words.intersection(ind_words)) / len(para_words) if len(para_words) > 0 else 0,
        len(mt_words.intersection(ind_words)) / len(mt_words) if len(mt_words) > 0 else 0
    ))

    print('{} {} {} {} {}'.format(
        lang, 'eng',
        len(wiki_words.intersection(eng_words)) / len(wiki_words) if len(wiki_words) > 0 else 0,
        len(para_words.intersection(eng_words)) / len(para_words) if len(para_words) > 0 else 0,
        len(mt_words.intersection(eng_words)) / len(mt_words) if len(mt_words) > 0 else 0
    ))

mui ind 0 0.5640020898641588 0.5984481086323957
mui eng 0 0.3014629049111808 0.4248302618816683
jav ind 0.23096331608627405 0.41798082869511444 0.4376122082585278
jav eng 0.43225393872449275 0.2627551020408163 0.3256283662477558
btk ind 0 0.4394136422745005 0.4337637494021999
btk eng 0 0.28915947511526185 0.3703969392635103
bjn ind 0.4835850801479655 0 0
bjn eng 0.39603884093711467 0 0
gor ind 0.5395945945945946 0 0
gor eng 0.41175675675675677 0 0
min ind 0.2435402918486848 0.4165727170236753 0.44605858537747084
min eng 0.33423180592991913 0.25696569495893057 0.3362705406049059
bhp ind 0 0 0.41485714285714287
bhp eng 0 0 0.5497142857142857
map-bms ind 0.5681731137495213 0 0
map-bms eng 0.3701646878590578 0 0
ace ind 0.36175548589341694 0 0
ace eng 0.42424242424242425 0 0
bug ind 0.3902439024390244 0.31163130943672274 0
bug eng 0.48267008985879334 0.2571324067300658 0
mad ind 0.45634534242129177 0.30735489135978783 0.33533057851239667
mad eng 0.4118792599805258 0.25563602978679995 0.297