In [36]:
import pandas as pd, json, os, math
from pathlib import Path
import re
from collections import Counter

mother_path = "/home/hyohyeongjang/2024aut_comstudy1/data"
test1ing_data = "NLRW1900000011.json"
test_data1 = "NLRW1900000020.json"
test_data2 = "WARW1900003745.json"

def extract_form(mother_path, data_path):
    
    korean = re.compile(r"[가-힣]")
    with open(f"{mother_path}/{data_path}", "r") as f:
        x = json.load(f)

    # extract form
    forms = [k['form'] for j in [i['paragraph'] for i in x['document']] for k in j]
    
    # extract only korean characters
    return [re.sub(r"[^가-힣\s]", "", i) for i in forms]

# list of sentences
train_sent = extract_form(mother_path, training_data)
test_sent1 = extract_form(mother_path, test_data1)
test_sent2 = extract_form(mother_path, test_data2)


(5527409, 15.417394991347436)

In [47]:
def get_uni(chars):
    
    # unilst and count
    uni = [j for k in chars for j in k]
    uni_counter = Counter(uni)

    return uni_counter

def get_bi(chars):

    # make bigram and unlist and count
    bi = [[i+j for i, j in zip(k, k[1:])] for k in chars]
    bi = [j for k in bi for j in k]
    bi_counter = Counter(bi)
    
    return bi_counter

def get_tri(chars):

    # make trigran, unlist, count
    tri = [[i+j+l for i, j, l in zip(k, k[1:], k[2:])] for k in char]
    tri = [j for k in tri for j in k]
    tri_counter = Counter(tri)

    return tri_counter

## unigram, bigram, trigram for training_data
# split sentence by syllables
char_test1 = [list(i) for i in train_sent]
char_test1 = [list(i) for i in test_sent1]
char_test2 = [list(i) for i in test_sent2]

uni_counter_train = get_uni(char_train)
bi_counter_train = get_bi(char_train)
tri_counter_train = get_tri(char_train)

uni_counter_test1 = get_uni(char_test1)
bi_counter_test1 = get_bi(char_test1)
tri_counter_test1 = get_tri(char_test1)

uni_counter_test2 = get_uni(char_test2)
bi_counter_test2 = get_bi(char_test2)
tri_counter_test2 = get_tri(char_test2)


In [49]:

def get_stats(counter):

    tot_count = sum([counter[key] for key in counter])
    prob = {key: counter[key] / tot_count for key in counter}
    E = sum([-prob[key] * math.log2(prob[key]) for key in prob])

    return prob, E

uni_prob_train, uni_E_train = get_stats(uni_counter_train)
bi_prob_train, bi_E_train = get_stats(bi_counter_train)
tri_prob_train, tri_E_train = get_stats(tri_counter_train)

_, uni_E_test1 = get_stats(uni_counter_test1)
_, bi_E_test1 = get_stats(bi_counter_test1)
_, tri_E_test1 = get_stats(tri_counter_test1)

_, uni_E_test2 = get_stats(uni_counter_test2)
_, bi_E_test2 = get_stats(bi_counter_test2)
_, tri_E_test2 = get_stats(tri_counter_test2)



In [69]:
def get_CE_stats(counter, gold_probs):

    # add-1 smoothing
    not_existing_keys = set(gold_probs.keys()) - set(counter.keys())
    for i in not_existing_keys:
        counter[i] = 1
    tot_count = sum([counter[key] for key in counter])
    
    prob = {key: counter[key] / tot_count for key in counter}
    CE = sum([-gold_probs.get(key, 0) * math.log2(prob[key]) for key in prob])

    return CE
    
uni_CE_train = get_CE_stats(uni_counter_train, uni_prob_train)
bi_CE_train = get_CE_stats(bi_counter_train, bi_prob_train)
tri_CE_train = get_CE_stats(tri_counter_train, tri_prob_train)

uni_CE_test1 = get_CE_stats(uni_counter_test1, uni_prob_train)
bi_CE_test1 = get_CE_stats(bi_counter_test1, bi_prob_train)
tri_CE_test1 = get_CE_stats(tri_counter_test1, tri_prob_train)

uni_CE_test2 = get_CE_stats(uni_counter_test2, uni_prob_train)
bi_CE_test2 = get_CE_stats(bi_counter_test2, bi_prob_train)
tri_CE_test2 = get_CE_stats(tri_counter_test2, tri_prob_train)

In [85]:
#print result

print(
f"""
\t\t\t\tEntropy\t\t\tCross-entropoy\t\tDifference
training_data\tunigram\t\t{uni_E_train}\t{uni_CE_train}\t{uni_CE_train - uni_E_train}
\t\tbigram\t\t{bi_E_train}\t{bi_CE_train}\t{bi_CE_train - bi_E_train}
\t\ttrigram\t\t{tri_E_train}\t{tri_CE_train}\t{tri_CE_train - tri_E_train}

test_data\tunigram\t\t{uni_E_test1}\t{uni_CE_test1}\t{uni_CE_test1 - uni_E_test1}
\t\tbigram\t\t{bi_E_test1}\t{bi_CE_test1}\t{bi_CE_test1 - bi_E_test1}
\t\ttrigram\t\t{tri_E_test1}\t{tri_CE_test1}\t{tri_CE_test1 -tri_E_test1}

test_data2\tunigram\t\t{uni_E_test2}\t{uni_CE_test2}\t{uni_CE_test2 - uni_E_test2}
\t\tbigram\t\t{bi_E_test2}\t{bi_CE_test2}\t{bi_CE_test2 - bi_E_test2}
\t\ttrigram\t\t{tri_E_test2}\t{tri_CE_test2}\t{tri_CE_test2 - tri_E_test2}
"""
)



				Entropy			Cross-entropoy		Difference
training_data	unigram		6.898936302224395	6.898936302224395	0.0
		bigram		11.790961373288066	11.790961373288066	0.0
		trigram		15.417394991347436	15.417394991347436	0.0

test_data	unigram		6.914437314870924	6.930033575129791	0.015596260258866401
		bigram		11.689952196046649	12.022411853262458	0.33245965721580895
		trigram		15.417394991347436	15.417394991347436	0.0

test_data2	unigram		6.731913420946825	7.451821905575859	0.719908484629034
		bigram		11.063790641412984	13.525015586860407	2.461224945447423
		trigram		15.417394991347436	15.417394991347436	0.0

