In [2]:
import heapq
from collections import defaultdict

class HuffmanNode:
    """A Huffman-tree node that orders itself by frequency.

    Attributes:
        char: The symbol held by the node. Merged (internal) nodes are
            created elsewhere in this cell with ``None``.
        freq: Occurrence count of the symbol, or the combined count of the
            children for a merged node.
        left: Left child, ``None`` until a tree builder attaches one.
        right: Right child, ``None`` until a tree builder attaches one.
    """

    def __init__(self, char, freq):
        self.char, self.freq = char, freq
        # Children start empty; the tree builder wires them up afterwards.
        self.left = self.right = None

    def __lt__(self, other):
        """Return True when this node's frequency is strictly lower.

        ``heapq`` relies on ``<`` only, so this is all a min-heap needs.
        """
        return self.freq < other.freq

def huffman_encoding(data):
    """Huffman-encode ``data`` and return the bit string with its tables.

    Args:
        data: An iterable of hashable symbols (typically a ``str``).

    Returns:
        A 3-tuple ``(encoded_data, code_map, freq_map)``:
        the concatenated code strings for every symbol of ``data``, the
        symbol -> code mapping, and the symbol -> count mapping.
        Empty input yields ``("", {}, {})``.
    """
    if not data:
        return "", {}, {}

    # Count how often each symbol occurs.
    freq_map = defaultdict(int)
    for symbol in data:
        freq_map[symbol] += 1

    # Seed a min-heap with one leaf per distinct symbol.
    pending = []
    for symbol, count in freq_map.items():
        heapq.heappush(pending, HuffmanNode(symbol, count))

    single_symbol = len(pending) == 1

    # Repeatedly merge the two least frequent nodes until one tree remains.
    while len(pending) > 1:
        left_node = heapq.heappop(pending)
        right_node = heapq.heappop(pending)
        parent = HuffmanNode(None, left_node.freq + right_node.freq)
        parent.left, parent.right = left_node, right_node
        heapq.heappush(pending, parent)

    root = heapq.heappop(pending)
    if single_symbol:
        # A lone leaf would receive the empty code; hang it under a dummy
        # root so that it is reached through a '0' branch instead.
        only_leaf = root
        root = HuffmanNode(None, only_leaf.freq)
        root.left = only_leaf

    # Derive the code table by walking the tree.
    code_map = {}
    generate_code_map(root, "", code_map)

    # Encode the input symbol by symbol.
    encoded_data = "".join(code_map[symbol] for symbol in data)

    return encoded_data, code_map, freq_map

def generate_code_map(node, current_code, code_map):
    """Record the bit-string code of every leaf below ``node``.

    Args:
        node: Root of the (sub)tree to walk; ``None`` is ignored.
        current_code: Code accumulated on the path from the root to ``node``.
        code_map: Dict updated in place with ``symbol -> code`` entries.

    Returns:
        None. Results are written into ``code_map``.
    """
    if node is None:
        return
    # A leaf is a node without children. Testing ``node.char`` for truthiness
    # would misclassify falsy symbols (0 when iterating bytes, or "") as
    # internal nodes and silently drop them from the table.
    if node.left is None and node.right is None:
        code_map[node.char] = current_code
        return
    generate_code_map(node.left, current_code + "0", code_map)   # left edge -> '0'
    generate_code_map(node.right, current_code + "1", code_map)  # right edge -> '1'

# Smoke test: encode the sample file with huffman_encoding (defined above in
# this cell) and report the space saved relative to 8 bits per character.
# The previous version called compress_file, which this cell never defines,
# so it only ran when a later cell had already been executed.
file_path = "orignal.txt"
with open(file_path, "r") as source_file:
    source_text = source_file.read()

encoded_data, code_map, freq_map = huffman_encoding(source_text)

original_bits = len(source_text) * 8
# An empty file has nothing to compress; report 0% instead of dividing by zero.
compression_ratio = (1 - len(encoded_data) / original_bits) * 100 if original_bits else 0.0
print(f"压缩率: {compression_ratio:.2f}%")


In [12]:
import heapq
import os
import string

class HuffmanNode:
    """Node of a Huffman tree, comparable by frequency for use in a heap.

    Attributes:
        char: Symbol stored in the node; the tree builder in this cell
            passes ``None`` for merged nodes.
        frequency: Count of the symbol, or the sum of both children's
            counts for a merged node.
        left: Left child or ``None``.
        right: Right child or ``None``.
    """

    def __init__(self, char, frequency):
        self.char, self.frequency = char, frequency
        self.left = self.right = None  # attached later by the tree builder

    def __lt__(self, other):
        """Order nodes by ascending frequency (the only comparison heapq uses)."""
        return self.frequency < other.frequency

def count_frequencies(file_path):
    """Count the alphanumeric and punctuation characters in a text file.

    Characters that are neither ``str.isalnum()`` nor members of
    ``string.punctuation`` (for example spaces and newlines) are skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A plain dict mapping each counted character to its number of
        occurrences, in order of first appearance.
    """
    tally = {}
    with open(file_path, 'r') as source:
        for text_line in source:
            counted = (c for c in text_line if c.isalnum() or c in string.punctuation)
            for symbol in counted:
                tally[symbol] = tally.get(symbol, 0) + 1
    return tally

def build_huffman_tree(frequencies):
    """Build a Huffman tree from a symbol -> count mapping.

    Args:
        frequencies: Mapping of symbols to their occurrence counts.

    Returns:
        The root ``HuffmanNode``. With a single symbol the root is that
        symbol's leaf itself.

    Raises:
        IndexError: If ``frequencies`` is empty (there is no root to return).
    """
    pending = []
    for symbol, count in frequencies.items():
        heapq.heappush(pending, HuffmanNode(symbol, count))

    # Merge the two lightest nodes until a single tree is left.
    while len(pending) > 1:
        lighter = heapq.heappop(pending)
        heavier = heapq.heappop(pending)
        parent = HuffmanNode(None, lighter.frequency + heavier.frequency)
        parent.left, parent.right = lighter, heavier
        heapq.heappush(pending, parent)

    return pending[0]

def generate_huffman_codes(node, current_code, codes):
    """Recursively fill ``codes`` with the bit string of every leaf.

    Args:
        node: Root of the (sub)tree to walk; ``None`` is ignored.
        current_code: Code accumulated on the path from the root to ``node``.
        codes: Dict updated in place with ``symbol -> code`` entries.

    Returns:
        None. Results are written into ``codes``.
    """
    if node is None:
        return
    # Identify leaves by their missing children rather than by a truthy
    # ``char``, so that a falsy symbol is still treated as a leaf.
    if node.left is None and node.right is None:
        # When the whole tree is one leaf the path is empty; give that symbol
        # the one-bit code '0' so it does not encode to zero bits.
        codes[node.char] = current_code or '0'
        return
    generate_huffman_codes(node.left, current_code + '0', codes)
    generate_huffman_codes(node.right, current_code + '1', codes)

def write_huffman_codes(file_path, frequencies, codes):
    """Write the code table to ``file_path`` and return the space saving.

    The file gets a header line followed by one ``char frequency code`` line
    per symbol. The saving compares the encoded size with a fixed 8 bits per
    counted character.

    Args:
        file_path: Destination of the table file (overwritten).
        frequencies: Mapping of symbols to occurrence counts.
        codes: Mapping of symbols to their code strings; must contain every
            key of ``frequencies``.

    Returns:
        ``(1 - compressed_bits / total_bits) * 100`` as a float, or ``0.0``
        when ``frequencies`` contributes no bits at all.

    Raises:
        KeyError: If a symbol in ``frequencies`` has no entry in ``codes``.
    """
    total_bits = 0
    compressed_bits = 0
    with open(file_path, 'w') as file:
        file.write("字符 出现频率 编码\n")
        for char, frequency in frequencies.items():
            code = codes[char]
            file.write(f"{char} {frequency} {code}\n")
            total_bits += frequency * 8
            compressed_bits += frequency * len(code)

    # Nothing was counted (e.g. an empty input file): there is no saving to
    # report, and dividing by zero would crash the caller.
    if total_bits == 0:
        return 0.0

    return (1 - compressed_bits / total_bits) * 100

def compress_file(file_path, table_path="table.txt"):
    """Run the whole pipeline for one file and return the space saving.

    Counts characters in ``file_path``, builds the Huffman tree, derives the
    code table, and writes that table to ``table_path``.

    Args:
        file_path: Path of the text file to analyse.
        table_path: Where the code table is written. Defaults to
            ``"table.txt"``, the location that used to be hard-coded.

    Returns:
        The percentage returned by ``write_huffman_codes``.
    """
    frequencies = count_frequencies(file_path)
    huffman_tree = build_huffman_tree(frequencies)

    codes = {}
    generate_huffman_codes(huffman_tree, '', codes)

    return write_huffman_codes(table_path, frequencies, codes)

# Smoke test: run compress_file on the sample file and print the value it
# returns, formatted as a percentage with two decimals.
file_path = "orignal.txt"
compression_ratio = compress_file(file_path)
print(f"压缩率: {compression_ratio:.2f}%")


压缩率: 43.89%


In [18]:
import heapq
import os
from collections import defaultdict


class HuffmanNode:
    """Huffman-tree node with optional symbol, ordered by frequency.

    Attributes:
        char: Symbol held by the node; defaults to ``None``, which the code
            table builder in this cell treats as "not a leaf".
        freq: Occurrence count (defaults to 0); merged nodes are created
            with the sum of their children's counts.
        left: Left child or ``None``.
        right: Right child or ``None``.
    """

    def __init__(self, char=None, freq=0):
        self.char, self.freq = char, freq
        self.left = self.right = None  # set by the tree builder

    def __lt__(self, other):
        """Return True when this node has the strictly smaller frequency."""
        return self.freq < other.freq


def get_character_frequency(file_path):
    """Count the alphanumeric and punctuation characters in a text file.

    Characters that are neither ``str.isalnum()`` nor in
    ``string.punctuation`` (such as whitespace) are ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``defaultdict(int)`` mapping each counted character to its count.
    """
    # Imported here because this cell's import block does not provide
    # ``string``; an import inside another function (such as main) only binds
    # a local name there and would leave this function with a NameError on a
    # fresh kernel.
    import string

    frequency = defaultdict(int)
    with open(file_path, 'r') as file:
        for line in file:
            for char in line:
                if char.isalnum() or char in string.punctuation:
                    frequency[char] += 1
    return frequency


def build_huffman_tree(frequency):
    """Assemble a Huffman tree from a symbol -> count mapping.

    Args:
        frequency: Mapping of symbols to occurrence counts.

    Returns:
        The root ``HuffmanNode``; for a single symbol this is its leaf.

    Raises:
        IndexError: If ``frequency`` is empty, because the final pop has
            nothing to return.
    """
    queue = []
    for symbol, count in frequency.items():
        heapq.heappush(queue, HuffmanNode(symbol, count))

    # Keep combining the two least frequent nodes under a new parent.
    while len(queue) > 1:
        first, second = heapq.heappop(queue), heapq.heappop(queue)
        parent = HuffmanNode(freq=first.freq + second.freq)
        parent.left, parent.right = first, second
        heapq.heappush(queue, parent)

    return heapq.heappop(queue)


def build_huffman_codes(node, current_code, codes):
    """Recursively fill ``codes`` with the bit string of every symbol node.

    Args:
        node: Root of the (sub)tree to walk; ``None`` is ignored.
        current_code: Code accumulated on the path from the root to ``node``.
        codes: Dict updated in place with ``symbol -> code`` entries.

    Returns:
        None. Results are written into ``codes``.
    """
    if node is None:
        return

    if node.char is not None:
        # A tree made of one leaf is reached with an empty path; assign '0'
        # so the only symbol still costs one bit instead of encoding to
        # nothing.
        codes[node.char] = current_code or '0'
        return

    build_huffman_codes(node.left, current_code + '0', codes)
    build_huffman_codes(node.right, current_code + '1', codes)


def compress_file(file_path, codes):
    """Encode a text file into a string of code bits.

    Characters without an entry in ``codes`` are skipped.

    Args:
        file_path: Path of the text file to read.
        codes: Mapping of characters to their code strings.

    Returns:
        The concatenation of the codes of all encodable characters, in file
        order.
    """
    with open(file_path, 'r') as source:
        pieces = [
            codes[symbol]
            for text_line in source
            for symbol in text_line
            if symbol in codes
        ]
    return "".join(pieces)


def get_file_size(file_path):
    """Return the size of ``file_path`` in bytes, as reported by the OS."""
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes


def write_table_file(codes, file_path):
    """Write the code table to ``file_path``, one ``char code`` pair per line.

    Args:
        codes: Mapping of characters to their code strings.
        file_path: Destination file (overwritten).
    """
    with open(file_path, 'w') as table:
        table.writelines(f"{symbol} {bits}\n" for symbol, bits in codes.items())


def calculate_compression_ratio(original_size, compressed_size):
    """Return the space saved by compression, as a percentage.

    Args:
        original_size: Size before compression (same unit as
            ``compressed_size``).
        compressed_size: Size after compression.

    Returns:
        ``(1 - compressed_size / original_size) * 100``, or ``0.0`` when
        ``original_size`` is 0 (an empty input has nothing to save).
    """
    if original_size == 0:
        # Avoid ZeroDivisionError for empty input.
        return 0.0
    return (1 - compressed_size / original_size) * 100


def main():
    """Compress ``orignal.txt``, write its code table, and print the saving.

    Reads the input file, builds the Huffman codes, writes them to
    ``table.txt`` and prints the percentage computed by
    ``calculate_compression_ratio``.
    """
    # The former function-local ``import string`` was removed: nothing in this
    # function reads it, and a local import is not visible to the helpers.
    input_file = "orignal.txt"
    output_file = "table.txt"

    # Count characters.
    character_frequency = get_character_frequency(input_file)

    # Build the Huffman tree.
    huffman_tree = build_huffman_tree(character_frequency)

    # Derive the code table.
    huffman_codes = {}
    build_huffman_codes(huffman_tree, "", huffman_codes)

    # Encode the file contents.
    compressed_data = compress_file(input_file, huffman_codes)

    # Persist the code table.
    write_table_file(huffman_codes, output_file)

    # Original size in bits (bytes on disk * 8) versus encoded bit count.
    # NOTE(review): the on-disk size includes characters the frequency filter
    # skips (e.g. whitespace), so the two sizes do not cover the same
    # characters; confirm this is the intended metric.
    original_size = get_file_size(input_file) * 8
    compressed_size = len(compressed_data)

    # Compute the ratio.
    compression_ratio = calculate_compression_ratio(original_size, compressed_size)

    # Report it.
    print(f"Compression ratio: {compression_ratio}%.")


# Call main() only when __name__ is '__main__', i.e. when this code is the
# top-level program rather than an imported module.
if __name__ == '__main__':
    main()


Compression ratio: 54.76248768573411%.


In [19]:
import heapq
from collections import defaultdict


class HuffmanNode:
    """Binary-tree node used while building a Huffman code.

    Attributes:
        char: Symbol stored in the node; merged nodes are created in this
            cell with ``None``.
        freq: Count of the symbol, or the children's combined count.
        left: Left child or ``None``.
        right: Right child or ``None``.
    """

    def __init__(self, char, freq):
        self.char, self.freq = char, freq
        self.left = self.right = None  # filled in when nodes are merged

    def __lt__(self, other):
        """Compare by frequency so nodes can be kept in a ``heapq`` min-heap."""
        return self.freq < other.freq


def get_character_frequency(data):
    """Count how often each symbol occurs in ``data``.

    Args:
        data: Iterable of hashable symbols (a ``str`` in this cell).

    Returns:
        A ``defaultdict(int)`` mapping each symbol to its count, in order of
        first appearance.
    """
    tally = defaultdict(int)
    for symbol in data:
        tally[symbol] = tally[symbol] + 1
    return tally


def build_huffman_tree(frequency):
    """Build a Huffman tree from a symbol -> count mapping.

    Args:
        frequency: Mapping of symbols to occurrence counts.

    Returns:
        The root ``HuffmanNode``; with one symbol the root is its leaf.

    Raises:
        IndexError: If ``frequency`` is empty, since the heap has no root to
            pop.
    """
    forest = []
    for symbol, count in frequency.items():
        heapq.heappush(forest, HuffmanNode(symbol, count))

    # Join the two smallest trees until only one remains.
    while len(forest) > 1:
        smaller = heapq.heappop(forest)
        larger = heapq.heappop(forest)
        joined = HuffmanNode(None, smaller.freq + larger.freq)
        joined.left, joined.right = smaller, larger
        heapq.heappush(forest, joined)

    return heapq.heappop(forest)


def build_huffman_codes(node, current_code, codes):
    """Recursively fill ``codes`` with the bit string of every symbol node.

    Args:
        node: Root of the (sub)tree to walk; ``None`` is ignored.
        current_code: Code accumulated on the path from the root to ``node``.
        codes: Dict updated in place with ``symbol -> code`` entries.

    Returns:
        None. Results are written into ``codes``.
    """
    # Guard against a missing child or an absent tree instead of failing
    # with AttributeError on ``None.char``.
    if node is None:
        return

    if node.char is not None:
        # If the tree is a single leaf the path is empty; use '0' so the
        # lone symbol is encoded with one bit rather than none.
        codes[node.char] = current_code or '0'
        return

    build_huffman_codes(node.left, current_code + '0', codes)
    build_huffman_codes(node.right, current_code + '1', codes)


def compress_data(data, codes):
    """Encode ``data`` into a string of code bits.

    Symbols without an entry in ``codes`` are skipped.

    Args:
        data: Iterable of symbols to encode.
        codes: Mapping of symbols to their code strings.

    Returns:
        The concatenated codes of all encodable symbols, in input order.
    """
    return "".join(codes[symbol] for symbol in data if symbol in codes)


def calculate_compression_ratio(original_data, compressed_data):
    """Return the compressed size as a percentage of the original size.

    The original is costed at 8 bits per element of ``original_data``; the
    compressed size is the length of ``compressed_data`` (one element per
    bit). A smaller result means better compression.

    Args:
        original_data: The uncompressed sequence.
        compressed_data: The encoded bit string.

    Returns:
        ``compressed_bits / original_bits * 100`` as a float, or ``0.0`` when
        ``original_data`` is empty.
    """
    original_bits = len(original_data) * 8
    if original_bits == 0:
        # Empty input: nothing to compare, and division by zero would crash.
        return 0.0
    compressed_bits = len(compressed_data)
    return (compressed_bits / original_bits) * 100


def main():
    """Compress the text of ``orignal.txt`` and report the result.

    Reads the file with newlines removed, builds the Huffman codes, writes
    the ``char code`` table to ``table.txt`` and prints the value returned by
    ``calculate_compression_ratio``.
    """
    # Load the input, dropping line breaks.
    with open("orignal.txt", "r") as source:
        original_data = source.read().replace('\n', '')

    # Frequencies -> tree -> code table.
    huffman_tree = build_huffman_tree(get_character_frequency(original_data))
    huffman_codes = {}
    build_huffman_codes(huffman_tree, "", huffman_codes)

    # Encode and measure.
    compressed_data = compress_data(original_data, huffman_codes)
    compression_ratio = calculate_compression_ratio(original_data, compressed_data)

    # Persist the code table, one "char code" pair per line.
    with open("table.txt", "w") as table:
        table.writelines(f"{symbol} {bits}\n" for symbol, bits in huffman_codes.items())

    # Report the ratio.
    print(f"Compression ratio: {compression_ratio}%.")


# Call main() only when __name__ is '__main__', i.e. when this code is the
# top-level program rather than an imported module.
if __name__ == '__main__':
    main()


Compression ratio: 55.06772645326227%.
