In [33]:
import math

In [57]:
def load_text(filename, data_dir="../Data"):
    """Read a whole text file from the data directory and return its contents.

    The file is opened with the ``utf-8-sig`` codec, so a leading UTF-8
    byte-order mark, if present, is not part of the returned string.

    Args:
        filename: Name of the file inside ``data_dir``.
        data_dir: Directory that holds the file. Defaults to ``"../Data"``,
            which is resolved relative to the current working directory, so
            existing single-argument calls behave exactly as before.

    Returns:
        The complete file contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(f"{data_dir}/{filename}", 'r', encoding='utf-8-sig') as handle:
        return handle.read()

In [84]:
# Load every corpus once via load_text; each variable holds a file's full text.
czech = load_text('czech.txt')
english = load_text('english.txt')
french = load_text('french.txt')
german = load_text('german.txt')
hungarian = load_text('hungarian.txt')
# The unknownN files are loaded here but are not added to `texts` below.
# NOTE(review): presumably these are samples whose language is to be
# identified later — confirm against the cells that use them.
unknown0 = load_text('unknown0.txt')
unknown1 = load_text('unknown1.txt')
unknown2 = load_text('unknown2.txt')

# (display name, text) pairs for the five named-language corpora.
texts = [
    ('Czech', czech),
    ('English', english),
    ('French', french),
    ('German', german),
    ('Hungarian', hungarian)
]

In [154]:
class Node:
    """Tree node carrying a key, a visit counter and links to its neighbours.

    Attributes are set in ``__init__``:
        key: The value passed to the constructor.
        count: Integer counter, starting at 0.
        children: Dict of child nodes, starting empty (a fresh dict per node).
        parent: Link to the parent node, starting as ``None``.
    """

    def __init__(self, key):
        # Payload first, then the tree links; every node gets its own dict.
        self.key, self.count = key, 0
        self.children, self.parent = {}, None
        
def create_model(text, k):
    """Build a tree of all length-``k`` substrings of ``text``.

    Every window ``text[i:i+k]`` for ``i`` in ``range(len(text) - k + 1)`` is
    walked from the root one character at a time, creating missing child
    nodes on the way. The root's ``count`` is incremented once per window and
    each node on the walk has its ``count`` incremented once per visit.

    Args:
        text: The string to scan.
        k: Window length.

    Returns:
        A tuple ``(root, lists, node_count)`` where ``root`` is the root
        ``Node`` (key ``None``), ``lists`` maps each distinct window to the
        node reached at the end of its walk, and ``node_count`` is the number
        of nodes in the tree including the root.
    """
    root = Node(None)
    window_nodes = {}
    total_nodes = 1  # the root is counted too
    window_total = len(text) - k + 1

    for start in range(window_total):
        window = text[start:start + k]

        cursor = root
        cursor.count += 1
        for symbol in window:
            child = cursor.children.get(symbol)
            if child is None:
                # First time this symbol follows the current prefix.
                child = Node(symbol)
                child.parent = cursor
                cursor.children[symbol] = child
                total_nodes += 1
            cursor = child
            cursor.count += 1

        # Later occurrences of the same window re-store the same node.
        window_nodes[window] = cursor

    return (root, window_nodes, total_nodes)

def model_entropy(model, k, text, lists):
    """Return the entropy, in bits, described by the nodes in ``lists``.

    With ``n = len(text) - k + 1``, the result is the negated sum over all
    nodes of ``(node.count / n) * log2(node.count / node.parent.count)``.
    When ``lists`` comes from ``create_model(text, k)`` this presumably is the
    conditional entropy of the last character of a window given the
    preceding ``k - 1`` characters — confirm against the intended definition.

    Args:
        model: Unused; kept so existing call sites keep working.
        k: Window length, used only to compute ``n``.
        text: The source text, used only for its length.
        lists: Mapping whose values are nodes exposing ``count`` and
            ``parent`` (which must itself expose ``count``). Keys are ignored.

    Returns:
        The entropy as a ``float``. A zero result (including the case of an
        empty ``lists``) is returned as ``0.0``, never ``-0.0``.

    Raises:
        AttributeError: If a node's ``parent`` is ``None``.
        ZeroDivisionError: If ``lists`` is non-empty while ``n`` is 0.
    """
    n = len(text) - k + 1
    entropy = 0.0
    for node in lists.values():
        weight = node.count / n
        log_ratio = math.log2(node.count / node.parent.count)
        # Subtract each term instead of negating the total at the end:
        # negating a 0.0 total yields -0.0, which prints as "-0.00".
        # Non-zero results are unchanged because rounding is sign-symmetric.
        entropy -= weight * log_ratio

    return entropy

# Sanity check on a tiny input: build the k = 1 model of 'abracadabra',
# then print the distinct windows found and the resulting entropy.
k = 1
text = 'abracadabra'
model, lists, node_count = create_model(text, k)
print(lists.keys())
entropy = model_entropy(model, k, text, lists)
print(entropy)

dict_keys(['a', 'b', 'r', 'c', 'd'])
2.0403733936884962


In [170]:
# Print a table with one row per (language, k): the node count of the model
# and its entropy, for k = 1..6. A fresh model is built for every row.
print(f"{'File':<10} {'Order':5} {'Count':>10} {'Entropy':>10}")
for name, text in texts:
    for k in [1, 2, 3, 4, 5, 6]:
        model, lists, node_count = create_model(text, k)
        entropy = model_entropy(model, k, text, lists)
        # The "Order" column shows k-1, not k.
        print(f"{name:<10} {k-1:^5} {node_count:>10} {entropy:>10.2f}")
    # Blank line between languages.
    print()

File       Order      Count    Entropy
Czech        0          111       5.07
Czech        1         2231       3.86
Czech        2        13956       2.76
Czech        3        46551       1.79
Czech        4       102990       1.10
Czech        5       178579       0.70

English      0           88       4.65
English      1         1538       3.51
English      2         9260       2.50
English      3        31302       1.83
English      4        73739       1.33
English      5       137313       0.96

French       0           99       4.91
French       1         1714       3.57
French       2         9929       2.59
French       3        32752       1.82
French       4        76061       1.26
French       5       140826       0.86

German       0           94       4.73
German       1         1838       3.53
German       2        11864       2.66
German       3        42292       1.99
German       4       103998       1.45
German       5       200037       1.05

Hungarian    0       