In [17]:
from collections import Counter
from math import log2
from pathlib import Path
import pandas as pd

In [18]:
# Base names of the corpus files analysed below.
files = [
    "bib", "book1", "book2", "geo", "news", "obj1",
    "obj2", "paper1", "paper2", "paper3", "paper4", "paper5",
    "paper6", "pic", "progc", "progl", "progp", "trans",
]
# Each corpus file has a counterpart carrying a ".coded" suffix.
coded_files = [f"{name}.coded" for name in files]

In [19]:
def entropy(text):
    """Return the empirical entropy of ``text`` in bits per symbol.

    Each distinct symbol's probability is its relative frequency in ``text``
    (any sized iterable of hashable symbols, e.g. ``bytes`` or ``str``).
    An empty ``text`` yields 0 because there are no symbols to sum over.
    """
    length = len(text)
    probabilities = [occurrences / length for occurrences in Counter(text).values()]
    return -sum(p * log2(p) for p in probabilities)

In [20]:
def conditional_entropy_1(text):
    """Return the empirical conditional entropy H(X | previous symbol) in bits.

    Probabilities are estimated from the counts of adjacent symbol pairs in
    ``text`` (a sliceable sequence such as ``bytes`` or ``str``). Inputs with
    fewer than two symbols contain no pairs and give 0.0.
    """
    # zip stops at the shorter argument, so this yields every adjacent pair.
    bigram_counts = Counter(zip(text, text[1:]))

    # How often each symbol appears as the first element of a pair.
    context_counts = Counter()
    for (prev, _), occurrences in bigram_counts.items():
        context_counts[prev] += occurrences

    n_pairs = len(text) - 1
    result = 0.0
    for (prev, _), occurrences in bigram_counts.items():
        joint_probability = occurrences / n_pairs
        conditional_probability = occurrences / context_counts[prev]
        result -= joint_probability * log2(conditional_probability)
    return result

In [21]:
def conditional_entropy_2(text):
    """Return the empirical conditional entropy H(X | two previous symbols) in bits.

    Probabilities are estimated from the counts of consecutive symbol triplets
    in ``text`` (a sliceable sequence such as ``bytes`` or ``str``). Inputs
    with fewer than three symbols contain no triplets and give 0.0.
    """
    # zip stops at the shortest argument, so this yields every consecutive triplet.
    trigram_counts = Counter(zip(text, text[1:], text[2:]))

    # How often each symbol pair appears as the leading two elements of a triplet.
    context_counts = Counter()
    for (first, second, _), occurrences in trigram_counts.items():
        context_counts[(first, second)] += occurrences

    n_triplets = len(text) - 2
    result = 0.0
    for (first, second, _), occurrences in trigram_counts.items():
        joint_probability = occurrences / n_triplets
        conditional_probability = occurrences / context_counts[(first, second)]
        result -= joint_probability * log2(conditional_probability)
    return result

In [22]:
# Build one row of statistics per corpus file: the empirical entropies of the
# original bytes, plus the size and bits-per-symbol rate of its ".coded" file.
corpus_dir = Path("test")
size_columns = ['Original Size (bytes)', 'Compressed Size (bytes)']

data = []
for file, coded_file in zip(files, coded_files):
    text_path = corpus_dir / file
    coded_path = corpus_dir / coded_file
    # read_bytes() opens and closes the file itself and avoids binding the
    # handle to the name `input`, which would shadow the builtin in the kernel.
    text = text_path.read_bytes()

    h_x = entropy(text)
    h_x_given_x = conditional_entropy_1(text)
    h_x_given_xx = conditional_entropy_2(text)
    compressed_size_bytes = coded_path.stat().st_size
    original_size_bytes = text_path.stat().st_size
    # An empty file has no symbols, so a per-symbol rate is undefined; record
    # NaN instead of raising ZeroDivisionError.
    if text:
        avg_bits_per_symbol = (compressed_size_bytes * 8) / len(text)
    else:
        avg_bits_per_symbol = float("nan")

    data.append([file, h_x, h_x_given_x, h_x_given_xx, avg_bits_per_symbol, original_size_bytes, compressed_size_bytes])

df = pd.DataFrame(data, columns=['File', 'H(X)', 'H(X|X)', 'H(X|XX)', 'Avg Bits per Symbol', 'Original Size (bytes)', 'Compressed Size (bytes)'])

# Compute both totals before the 'Total' row exists, so neither sum depends on
# NaN-skipping of a half-filled row.
total_compressed_bytes = df['Compressed Size (bytes)'].sum()
total_original_bytes = df['Original Size (bytes)'].sum()
df.loc['Total', 'Compressed Size (bytes)'] = total_compressed_bytes
df.loc['Total', 'Original Size (bytes)'] = total_original_bytes

# Adding the row by label fills the other columns with NaN, which upcasts the
# byte-count columns to float. Both are fully populated again at this point,
# so restore the integer dtype (byte counts should not render as 111261.0).
df = df.astype({column: 'int64' for column in size_columns})
df

Unnamed: 0,File,H(X),H(X|X),H(X|XX),Avg Bits per Symbol,Original Size (bytes),Compressed Size (bytes)
0,bib,5.200676,3.364127,2.307505,2.400122,111261.0,33380.0
1,book1,4.527149,3.584518,2.814074,2.781957,768771.0,267336.0
2,book2,4.792633,3.745216,2.735674,2.451484,610856.0,187188.0
3,geo,5.646376,4.264226,3.457736,5.468281,102400.0,69994.0
4,news,5.189632,4.091893,2.922759,2.836633,377109.0,133715.0
5,obj1,5.948171,3.463658,1.40044,4.544643,21504.0,12216.0
6,obj2,6.260381,3.870373,2.265427,2.890079,246814.0,89164.0
7,paper1,4.982983,3.646085,2.331768,2.76985,53161.0,18406.0
8,paper2,4.601435,3.522351,2.513645,2.756919,82199.0,28327.0
9,paper3,4.665104,3.554845,2.559874,2.99583,46526.0,17423.0
