# LZW Compression

The goal of this task is to implement the LZW algorithm for data compression. Let's begin by importing all the necessary libraries in Python.


In [28]:
import struct
from PIL import Image
import os

Then, let's write the basic functions for compression and decompression and check if they work correctly.


In [29]:
def lzw_compress(data):
    """Compress a string into a list of integer LZW codes.

    The dictionary is seeded with the 256 single-character strings
    ``chr(0)`` .. ``chr(255)``; longer phrases receive codes 256, 257, ...
    in the order they are first seen.

    Args:
        data: The string to compress. Every character must have a code
            point in the range 0-255 (for example text produced by
            ``bytes.decode('latin1')``).

    Returns:
        The list of integer codes; an empty list for empty input.

    Raises:
        ValueError: If ``data`` contains a character with a code point
            above 255, which has no entry in the initial dictionary.
    """
    dictionary = {chr(i): i for i in range(256)}
    dict_size = 256
    result = []
    w = ""

    for c in data:
        wc = w + c
        if wc in dictionary:
            # Keep extending the current phrase while it is still known.
            w = wc
            continue

        # Single-character keys exist only for code points 0-255, so a miss
        # here means the character cannot be encoded at all. Fail with a
        # clear message instead of an opaque KeyError on a later lookup.
        if c not in dictionary:
            raise ValueError(
                "Character %r is outside the 0-255 range supported by "
                "the LZW dictionary" % (c,)
            )

        # Emit the longest known phrase and register the extended one.
        result.append(dictionary[w])
        dictionary[wc] = dict_size
        dict_size += 1
        w = c

    # Flush the phrase that was still being built when the input ended.
    if w:
        result.append(dictionary[w])

    return result

def lzw_decompress(compressed):
    """Decompress a sequence of LZW codes back into a string.

    The dictionary is rebuilt on the fly, mirroring the compressor: it
    starts with codes 0-255 mapped to ``chr(code)`` and gains one entry per
    code read after the first.

    Args:
        compressed: An iterable of integer codes. It is only read, never
            modified.

    Returns:
        The decompressed string; an empty string for empty input.

    Raises:
        ValueError: If a code cannot be resolved against the dictionary
            built so far (corrupt or truncated stream).
    """
    # Iterate instead of popping so the caller's list is left untouched.
    codes = iter(compressed)
    try:
        first = next(codes)
    except StopIteration:
        return ""

    dictionary = {i: chr(i) for i in range(256)}
    dict_size = 256

    # The first code of a valid stream is always a single-character code.
    if first not in dictionary:
        raise ValueError("Bad compressed k: %s" % first)

    w = dictionary[first]
    result = [w]

    for k in codes:
        if k in dictionary:
            entry = dictionary[k]
        elif k == dict_size:
            # The code refers to the phrase that is about to be defined
            # (pattern cScSc): it must be the previous phrase plus its own
            # first character.
            entry = w + w[0]
        else:
            raise ValueError("Bad compressed k: %s" % k)

        result.append(entry)

        # Register the phrase the compressor added at this same step.
        dictionary[dict_size] = w + entry[0]
        dict_size += 1

        w = entry

    return "".join(result)

# Round-trip sanity check: compress a short sample string, decompress the
# resulting codes, and print both texts so they can be compared by eye.
text_original = "Sample text for LZW compression"
print(f"Original text: {text_original}")
compressed = lzw_compress(text_original)
print(f"Text after compression and decompression: {lzw_decompress(compressed)}")

Original text: Sample text for LZW compression
Text after compression and decompression: Sample text for LZW compression


The task is to apply LZW compression to `.txt` and `.bmp` files, so we also need helper functions for reading the input data and for saving and loading the compressed output.


In [30]:
def read_text_file(filepath):
    """Return the entire contents of the UTF-8 text file at ``filepath``."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def save_compressed_file(filepath, compressed_data):
    """Write the codes to ``filepath`` as consecutive 4-byte values.

    Each code is stored as a big-endian 32-bit unsigned integer, in the
    order given by ``compressed_data``.
    """
    pack_code = struct.Struct('>I').pack
    with open(filepath, 'wb') as sink:
        sink.writelines(pack_code(code) for code in compressed_data)

def read_compressed_file(filepath):
    """Read a file of big-endian 32-bit unsigned integers into a list.

    The file is consumed four bytes at a time until a read returns no
    data; each chunk is decoded into one integer code.
    """
    decode_code = struct.Struct('>I').unpack
    with open(filepath, 'rb') as source:
        # iter() with a sentinel keeps calling read(4) until it yields b''.
        chunks = iter(lambda: source.read(4), b'')
        codes = [decode_code(chunk)[0] for chunk in chunks]
    return codes

def decompress_file(input_binary_path):
    """Load the codes stored in ``input_binary_path`` and return the text they decode to."""
    codes = read_compressed_file(input_binary_path)
    return lzw_decompress(codes)


def compress_image(input_image_path, output_binary_path):
    """LZW-compress the raw bytes of a file and save the codes.

    The input file is read in binary mode, compressed with
    ``lzw_compress`` and written to ``output_binary_path`` with
    ``save_compressed_file``.
    """
    with open(input_image_path, 'rb') as source:
        raw_bytes = source.read()

    # latin1 maps every byte value 0-255 to the character with the same
    # code point, giving lzw_compress a string it can process.
    as_text = raw_bytes.decode('latin1')
    codes = lzw_compress(as_text)
    save_compressed_file(output_binary_path, codes)


Finally, compress all the files specified in the task description.


In [31]:
def get_file_size_in_mb(file_path):
    """Return the size of ``file_path`` expressed in units of 1024 * 1024 bytes."""
    bytes_per_mb = 1024 * 1024
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes / bytes_per_mb

# Text files to benchmark; each is compressed, written to disk, read back and
# checked against the original text.
text_files = ["norm_wiki_sample.txt", "wiki_sample.txt"]
for text_file in text_files:
    print(f"\nTesting LZW on {text_file}:")
    
    # Size of the input file as stored on disk.
    original_size = get_file_size_in_mb(text_file)
    print(f"Original size: {original_size:.2f} MB")
    
    data = read_text_file(text_file)
    compressed = lzw_compress(data)
    # Output name is derived from the input name, e.g. "x.txt" -> "x_compressed.bin".
    compressed_binary_file = text_file.replace(".txt", "_compressed.bin")
    save_compressed_file(compressed_binary_file, compressed)
    
    # Size of the file holding the compressed codes.
    compressed_size = get_file_size_in_mb(compressed_binary_file)
    print(f"Compressed size: {compressed_size:.2f} MB")
    # Decode from the file on disk so the save/load path is exercised too.
    decompressed = decompress_file(compressed_binary_file)

    # Round-trip check: the decoded text must equal the text that was read.
    assert data == decompressed, "Decompression failed from compressed binary file!"

# Image benchmark: only the sizes before and after compression are reported;
# the compressed image is not decompressed or verified here.
print("\nTesting LZW on lena.bmp:")
image_size = get_file_size_in_mb("lena.bmp")
print(f"Original image size: {image_size:.2f} MB")
compress_image("lena.bmp", "lena_compressed.lzw")
compressed_image_size = get_file_size_in_mb("lena_compressed.lzw")
print(f"Compressed image size: {compressed_image_size:.2f} MB")



Testing LZW on norm_wiki_sample.txt:
Original size: 10.29 MB
Compressed size: 6.03 MB

Testing LZW on wiki_sample.txt:
Original size: 11.36 MB
Compressed size: 6.84 MB

Testing LZW on lena.bmp:
Original image size: 10.99 MB
Compressed image size: 10.24 MB
