In [1]:
"""
    readFile(path, codelen=8) -> Vector{UInt16}

Read the file at `path` and unpack it into codewords of `codelen` bits each.

The file is treated as one big-endian bit stream (the most significant bit of the
first byte comes first). Every consecutive group of `codelen` bits becomes one
`UInt16` codeword. Trailing bits that do not fill a whole codeword are treated as
zero padding and dropped.

With the default `codelen == 8` every byte is simply widened to 16 bits.

Note that when `codelen < 8` the padding of the last byte can itself be long enough
to be read as one or more extra zero codewords.

# Arguments
- `path`: file to read.
- `codelen`: codeword width in bits, between 1 and 16.

# Throws
- `ArgumentError` if `codelen` is outside `1:16` (wider codewords do not fit `UInt16`).
"""
function readFile(path, codelen::Int64=8)::Vector{UInt16}
    1 <= codelen <= 16 ||
        throw(ArgumentError("codelen must be between 1 and 16, got $codelen"))
    bytes = read(path)
    # Byte-aligned symbols: just widen each byte to 16 bits.
    codelen == 8 && return UInt16.(bytes)

    ncodes = (length(bytes) * 8) ÷ codelen
    carray = Vector{UInt16}(undef, ncodes)
    mask = (UInt32(1) << codelen) - UInt32(1)
    # `acc` holds the bits not yet consumed; at most (codelen - 1) + 8 <= 23 bits,
    # so a UInt32 can never overflow.
    acc = UInt32(0)
    nbits = 0
    k = 0
    for b in bytes
        acc = (acc << 8) | b
        nbits += 8
        while nbits >= codelen
            nbits -= codelen
            k += 1
            carray[k] = ((acc >> nbits) & mask) % UInt16
        end
        # Keep only the bits that are still pending.
        acc &= (UInt32(1) << nbits) - UInt32(1)
    end
    return carray
end

readFile (generic function with 2 methods)

In [2]:
"""
    writeFile(path, carray, codelen=8) -> Vector{UInt8}

Pack the codewords in `carray` into a byte stream and write it to `path`.

This is the inverse of reading with the same `codelen`: each codeword contributes
exactly `codelen` bits, most significant bit first, and the bits are concatenated
without gaps. If the total number of bits is not a multiple of 8, the last byte is
padded with zero bits at the end.

# Arguments
- `path`: file to create or overwrite.
- `carray`: codewords to store.
- `codelen`: number of bits stored per codeword, between 1 and 16.

# Returns
The bytes that were written to the file.

# Throws
- `ArgumentError` if `codelen` is outside `1:16`, or if a codeword needs more than
  `codelen` bits. Storing such a codeword would shift every following bit and
  silently corrupt the stream. Nothing is written in that case.
"""
function writeFile(path, carray::Vector{UInt16}, codelen::Int64=8)
    1 <= codelen <= 16 ||
        throw(ArgumentError("codelen must be between 1 and 16, got $codelen"))
    limit = UInt32(1) << codelen

    array = Vector{UInt8}() # bytes to be written
    sizehint!(array, cld(length(carray) * codelen, 8))
    # `acc` holds the bits not yet flushed; at most 7 + codelen <= 23 bits.
    acc = UInt32(0)
    nbits = 0
    for code in carray
        code < limit ||
            throw(ArgumentError("codeword $(Int(code)) does not fit in $codelen bits"))
        acc = (acc << codelen) | code
        nbits += codelen
        while nbits >= 8
            nbits -= 8
            push!(array, ((acc >> nbits) & 0xff) % UInt8)
        end
        # Keep only the bits that are still pending.
        acc &= (UInt32(1) << nbits) - UInt32(1)
    end
    if nbits > 0
        # Left-align the leftover bits; the low bits of the last byte are zero padding.
        push!(array, ((acc << (8 - nbits)) & 0xff) % UInt8)
    end

    open(path, "w") do io
        write(io, array)
    end
    return array
end

writeFile (generic function with 2 methods)

In [3]:
"""
    LZWEncode(array, codelen, initiallen=8) -> (codes, dictsize)

LZW-encode `array` with a dictionary of fixed capacity `2^codelen`.

The dictionary starts with the `2^initiallen` single symbols `0:2^initiallen-1`;
symbol `s` has the 1-based index `s + 1`. Every longer entry is stored as
"index of its prefix + one extension symbol". Emitted codes are these 1-based
dictionary indices.

When a new entry would have to be added while the dictionary already holds
`2^codelen` entries, the dictionary is reset to the initial single symbols instead
(the new entry is dropped).

# Arguments
- `array`: input symbols, each smaller than `2^initiallen`.
- `codelen`: base-2 logarithm of the dictionary capacity.
- `initiallen`: number of bits per input symbol.

# Returns
A tuple of the code sequence (`Vector{UInt16}`) and the largest number of entries
the dictionary reached (`2^codelen` if it was ever reset). Empty input gives an
empty code sequence and `2^initiallen`.

# Throws
- `ArgumentError` if a symbol is outside the initial alphabet.

NOTE: codes are 1-based, so the largest possible code is `2^codelen`, which needs
`codelen + 1` bits. With `codelen == 16` that code does not fit into `UInt16` and
`push!` raises `InexactError`. This is kept as is because changing the numbering
would change the code stream format.
"""
function LZWEncode(array::Vector{UInt16}, codelen::Int64, initiallen::Int64=8)
    nsymbols = 2^initiallen
    capacity = 2^codelen
    carray = Vector{UInt16}()
    isempty(array) && return carray, nsymbols

    checksymbol(s) = s < nsymbols ||
        throw(ArgumentError("symbol $(Int(s)) is outside the $initiallen-bit alphabet"))

    # Only multi-symbol entries are stored: (prefix index, extension symbol) => index.
    # Single symbols are implicit (index = symbol + 1), so a lookup is O(1) instead of
    # a linear scan over the whole dictionary.
    table = Dict{Tuple{Int,Int},Int}()
    nentries = nsymbols # current dictionary size, single symbols included
    flag = false        # whether the dictionary was ever reset

    checksymbol(array[1])
    ωindex = Int(array[1]) + 1 # index of the current prefix ω
    for symbol in Iterators.drop(array, 1)
        checksymbol(symbol)
        K = Int(symbol)
        found = get(table, (ωindex, K), 0)
        if found != 0
            # ωK is already known: keep extending the prefix.
            ωindex = found
        else
            push!(carray, ωindex)
            if nentries == capacity
                # Dictionary full: start over with the single symbols only.
                flag = true
                empty!(table)
                nentries = nsymbols
            else
                nentries += 1
                table[(ωindex, K)] = nentries
            end
            ωindex = K + 1
        end
    end
    push!(carray, ωindex)
    # Code sequence and the maximum number of dictionary entries reached.
    return carray, flag ? capacity : nentries
end

LZWEncode (generic function with 2 methods)

In [4]:
"""
    constructstr(ωK, maplist) -> Vector{Int64}

Expand the dictionary entry `ωK` into the full symbol sequence it stands for.

An entry is a pair "index of the prefix entry in `maplist`" (`prev`) plus one
extension symbol (`val`). The chain of `prev` links is followed until an entry with
`prev == 0` is reached, and the symbols met on the way are returned first-to-last.
"""
function constructstr(ωK::NamedTuple{(:prev,:val),Tuple{Int64,Int64}}, maplist::Vector{NamedTuple{(:prev,:val),Tuple{Int64,Int64}}})
    # Walk the chain from the last symbol back to the first, then flip the result.
    symbols = Int64[ωK.val]
    parent = ωK.prev
    while parent > 0
        entry = maplist[parent]
        push!(symbols, entry.val)
        parent = entry.prev
    end
    return reverse!(symbols)
end

constructstr (generic function with 1 method)

In [5]:
"""
    LZWDecode(carray, codelen, initiallen=8) -> (symbols, dictsize)

Decode an LZW code sequence produced with a dictionary of fixed capacity `2^codelen`.

The dictionary starts with the `2^initiallen` single symbols; codes are 1-based
dictionary indices. For every code after the first, one entry
"previous code + first symbol of the current phrase" is added, unless the dictionary
already holds `2^codelen` entries, in which case it is reset to the single symbols.

# Arguments
- `carray`: code sequence to decode.
- `codelen`: base-2 logarithm of the dictionary capacity.
- `initiallen`: number of bits per decoded symbol.

# Returns
A tuple of the decoded symbols (`Vector{UInt16}`) and the largest number of entries
the dictionary reached (`2^codelen` if it was ever reset). Empty input gives an
empty symbol sequence and `2^initiallen`.

# Throws
- `ArgumentError` if a code is 0 or lies more than one position past the end of the
  dictionary; such a code cannot occur in a well-formed stream.
"""
function LZWDecode(carray::Vector{UInt16}, codelen::Int64, initiallen::Int64=8)
    freshdict() = [(prev=0, val=s) for s in 0:2^initiallen-1]
    rmaplist = freshdict()
    isempty(carray) && return UInt16[], length(rmaplist)
    capacity = 2^codelen

    c = Int(carray[1])
    1 <= c <= length(rmaplist) ||
        throw(ArgumentError("invalid first LZW code $c"))
    array = [rmaplist[c].val]
    flag = false # whether the dictionary was ever reset
    for code in Iterators.drop(carray, 1)
        p = c
        c = Int(code)
        n = length(rmaplist)
        1 <= c <= n + 1 ||
            throw(ArgumentError("invalid LZW code $c (dictionary holds $n entries)"))
        if c <= n
            # Known code: expand it directly.
            phrase = constructstr(rmaplist[c], rmaplist)
        else
            # The code refers to the entry that is only being created by this step,
            # so it must be the previous phrase followed by its own first symbol.
            phrase = constructstr(rmaplist[p], rmaplist)
            push!(phrase, phrase[1])
        end
        append!(array, phrase)
        if n == capacity
            # Dictionary full: start over with the single symbols only.
            rmaplist = freshdict()
            flag = true
        else
            push!(rmaplist, (prev=p, val=phrase[1]))
        end
    end
    return UInt16.(array), flag ? capacity : length(rmaplist)
end

LZWDecode (generic function with 2 methods)

In [24]:
"""
    gettestinfo(files, type, codelen, readlen=8) -> Vector

Run the fixed-capacity LZW compression benchmark on every file of one data-set folder.

For each name in `files` (relative to the folder `type`) the file is read as
`readlen`-bit symbols, encoded with `codelen`-bit codewords, written to
`<file>.lz`, read back, decoded, and the decoded data is written to `lz.<file>`.
Names ending in `.lz` or starting with `lz.` are skipped, because those are the
artifacts this function writes itself.

# Returns
One named tuple per processed file with the fields
- `name`: the file name;
- `η`: number of compressed bits divided by number of original bits;
- `dicmemory`: largest dictionary size reached times `codelen + 1`;
- `check`: `true` when decoding restores the original symbols and the codewords
  read back from the `.lz` file equal the ones that were written.
"""
function gettestinfo(files, type, codelen::Int64, readlen::Int64=8)
    info = Any[]
    folder = string(type)
    for file in files
        # Skip the artifacts of earlier runs. A plain prefix/suffix test is used on
        # purpose: in a regex the unescaped `.` would match any character.
        (endswith(file, ".lz") || startswith(file, "lz.")) && continue

        # Read the file as readlen-bit symbols (whole bytes by default).
        array = readFile(joinpath(folder, file), readlen)
        # Encode and store the code stream.
        codearray, dictsize = LZWEncode(array, codelen, readlen)
        encodedpath = joinpath(folder, "$file.lz")
        writeFile(encodedpath, codearray, codelen)
        # Read the stored stream back. For codelen < 8 the zero padding of the last
        # byte can show up as extra codewords, so only the written count is compared.
        rarray = readFile(encodedpath, codelen)
        stored = rarray[1:min(end, length(codearray))]
        # Decode and verify both the codec and the on-disk round trip.
        dcodearray, = LZWDecode(codearray, codelen, readlen)
        check = isequal(array, dcodearray) && stored == codearray
        writeFile(joinpath(folder, "lz.$file"), dcodearray, readlen)
        push!(info, (name=file,
                     η=codelen*length(codearray)/length(array)/readlen,
                     dicmemory=dictsize*(codelen+1), check=check))
    end
    return info
end

gettestinfo (generic function with 2 methods)

In [7]:
# Data-set folders, one per media category; each benchmark cell loops over these.
type = ["Text", "Audio", "Image", "Video"];

In [17]:
# Benchmark with a fixed-capacity dictionary: 12-bit codewords, byte-wise input.
testgroup1 = [];
for folder in type
    results = gettestinfo(readdir(folder), "$folder/", 12);
    push!(testgroup1, results);
    foreach(println, results);
end

(name = "Lorem Ipsum 10.docx", η = 1.2318113547611895, dicmemory = 53248, check = true)
(name = "Lorem Ipsum 10.txt", η = 0.5214736842105263, dicmemory = 35516, check = true)
(name = "Lorem Ipsum 100.docx", η = 1.363970647921165, dicmemory = 53248, check = true)
(name = "Lorem Ipsum 100.txt", η = 0.4435090049615189, dicmemory = 53248, check = true)
(name = "a100.docx", η = 1.183456038510548, dicmemory = 53248, check = true)
(name = "a100.txt", η = 0.21, dicmemory = 3497, check = true)
(name = "a1000.txt", η = 0.0212978702129787, dicmemory = 5161, check = true)
(name = "1-17367-A-10.mp3", η = 1.3934766763848396, dicmemory = 53248, check = true)
(name = "1-17367-A-10.ogg", η = 1.3874497014964626, dicmemory = 53248, check = true)
(name = "1-17367-A-10.wav", η = 1.3658535202836906, dicmemory = 53248, check = true)
(name = "1-23996-A-35.wav", η = 1.4038633333635646, dicmemory = 53248, check = true)
(name = "5-198411-F-20.wav", η = 1.3091440543371744, dicmemory = 53248, check = true)
(name =

In [18]:
# Same benchmark with a different dictionary capacity: 10-bit codewords.
testgroup2 = [];
for folder in type
    results = gettestinfo(readdir(folder), "$folder/", 10);
    push!(testgroup2, results);
    foreach(println, results);
end

(name = "Lorem Ipsum 10.docx", η = 1.0649594472814659, dicmemory = 11264, check = true)
(name = "Lorem Ipsum 10.txt", η = 0.6214035087719298, dicmemory = 11264, check = true)
(name = "Lorem Ipsum 100.docx", η = 1.1765995097163824, dicmemory = 11264, check = true)
(name = "Lorem Ipsum 100.txt", η = 0.6129950124363532, dicmemory = 11264, check = true)
(name = "a100.docx", η = 1.0290421917032422, dicmemory = 11264, check = true)
(name = "a100.txt", η = 0.175, dicmemory = 2959, check = true)
(name = "a1000.txt", η = 0.01774822517748225, dicmemory = 4367, check = true)
(name = "1-17367-A-10.mp3", η = 1.2090925655976676, dicmemory = 11264, check = true)
(name = "1-17367-A-10.ogg", η = 1.1994056229124477, dicmemory = 11264, check = true)
(name = "1-17367-A-10.wav", η = 1.2232878125538496, dicmemory = 11264, check = true)
(name = "1-23996-A-35.wav", η = 1.2310789853166577, dicmemory = 11264, check = true)
(name = "5-198411-F-20.wav", η = 1.1753976165534643, dicmemory = 11264, check = true)
(na

In [19]:
# Same benchmark with a different dictionary capacity: 14-bit codewords.
testgroup3 = [];
for folder in type
    results = gettestinfo(readdir(folder), "$folder/", 14);
    push!(testgroup3, results);
    foreach(println, results);
end

(name = "Lorem Ipsum 10.docx", η = 1.3370231300690898, dicmemory = 194580, check = true)
(name = "Lorem Ipsum 10.txt", η = 0.6083859649122807, dicmemory = 40980, check = true)
(name = "Lorem Ipsum 100.docx", η = 1.4649292985047973, dicmemory = 245760, check = true)
(name = "Lorem Ipsum 100.txt", η = 0.32727468062663595, dicmemory = 219240, check = true)
(name = "a100.docx", η = 1.3050049554013876, dicmemory = 161835, check = true)
(name = "a100.txt", η = 0.245, dicmemory = 4035, check = true)
(name = "a1000.txt", η = 0.02484751524847515, dicmemory = 5955, check = true)
(name = "1-17367-A-10.mp3", η = 1.4879251700680272, dicmemory = 245760, check = true)
(name = "1-17367-A-10.ogg", η = 1.4854956210714567, dicmemory = 245760, check = true)
(name = "1-17367-A-10.wav", η = 1.3264192461523114, dicmemory = 245760, check = true)
(name = "1-23996-A-35.wav", η = 1.4148191563653514, dicmemory = 245760, check = true)
(name = "5-198411-F-20.wav", η = 1.350933216188513, dicmemory = 245760, check = 

In [20]:
# Same benchmark with 16-bit codewords.
testgroup4 = [];
for folder in type
    results = gettestinfo(readdir(folder), "$folder/", 16);
    push!(testgroup4, results);
    foreach(println, results);
end

(name = "Lorem Ipsum 10.docx", η = 1.5280264343646741, dicmemory = 220524, check = true)
(name = "Lorem Ipsum 10.txt", η = 0.6952982456140351, dicmemory = 46444, check = true)
(name = "Lorem Ipsum 100.docx", η = 1.4605581440654578, dicmemory = 769046, check = true)
(name = "Lorem Ipsum 100.txt", η = 0.37402820643044105, dicmemory = 248472, check = true)
(name = "a100.docx", η = 1.491434234744443, dicmemory = 183413, check = true)
(name = "a100.txt", η = 0.28, dicmemory = 4573, check = true)
(name = "a1000.txt", η = 0.028397160283971604, dicmemory = 6749, check = true)
(name = "1-17367-A-10.mp3", η = 1.547424684159378, dicmemory = 545717, check = true)
(name = "1-17367-A-10.ogg", η = 1.5792020619098965, dicmemory = 514726, check = true)
(name = "1-17367-A-10.wav", η = 1.1829613371908472, dicmemory = 1114112, check = true)
(name = "1-23996-A-35.wav", η = 1.2766390654900646, dicmemory = 1114112, check = true)
(name = "5-198411-F-20.wav", η = 1.2955541287917187, dicmemory = 1114112, check 

In [22]:
# Compression with bit-wise input: 1-bit symbols, 12-bit codewords.
testgroupb1 = [];
for folder in type
    results = gettestinfo(readdir(folder), "$folder/", 12, 1);
    push!(testgroupb1, results);
    foreach(println, results);
end

(name = "Lorem Ipsum 10.docx", η = 1.1669270051066387, dicmemory = 53248, check = true)
(name = "Lorem Ipsum 10.txt", η = 1.0968421052631578, dicmemory = 53248, check = true)
(name = "Lorem Ipsum 100.docx", η = 1.2473740604250207, dicmemory = 53248, check = true)
(name = "Lorem Ipsum 100.txt", η = 1.0354533734422002, dicmemory = 53248, check = true)
(name = "a100.docx", η = 1.1414059181650857, dicmemory = 53248, check = true)
(name = "a100.txt", η = 1.575, dicmemory = 1378, check = true)
(name = "a1000.txt", η = 0.16873312668733126, dicmemory = 14638, check = true)
(name = "lz.a100.txt", η = 1.575, dicmemory = 1378, check = true)
(name = "1-17367-A-10.mp3", η = 1.2756195335276967, dicmemory = 53248, check = true)
(name = "1-17367-A-10.ogg", η = 1.2747810535728374, dicmemory = 53248, check = true)
(name = "1-17367-A-10.wav", η = 1.2526403261352608, dicmemory = 53248, check = true)
(name = "1-23996-A-35.wav", η = 1.2704548752505418, dicmemory = 53248, check = true)
(name = "5-198411-F-20

In [25]:
# Compression with bit-wise input: 1-bit symbols, 8-bit codewords.
testgroupb2 = [];
for folder in type
    results = gettestinfo(readdir(folder), "$folder/", 8, 1);
    push!(testgroupb2, results);
    foreach(println, results);
end

(name = "Lorem Ipsum 10.docx", η = 1.3192550315410032, dicmemory = 2304, check = true)
(name = "Lorem Ipsum 10.docx.lz", η = 1.4674165490231796, dicmemory = 2304, check = true)
(name = "Lorem Ipsum 10.txt", η = 1.415859649122807, dicmemory = 2304, check = true)
(name = "Lorem Ipsum 10.txt.lz", η = 1.4654044409199047, dicmemory = 2304, check = true)
(name = "Lorem Ipsum 100.docx", η = 1.4269201422147182, dicmemory = 2304, check = true)
(name = "Lorem Ipsum 100.docx.lz", η = 1.4650146767697454, dicmemory = 2304, check = true)
(name = "Lorem Ipsum 100.txt", η = 1.4157127788412704, dicmemory = 2304, check = true)
(name = "Lorem Ipsum 100.txt.lz", η = 1.4663888735581434, dicmemory = 2304, check = true)
(name = "a100.docx", η = 1.290457312756619, dicmemory = 2304, check = true)
(name = "a100.docx.lz", η = 1.4685939985737013, dicmemory = 2304, check = true)
(name = "a100.txt", η = 1.05, dicmemory = 954, check = true)
(name = "a100.txt.lz", η = 1.619047619047619, dicmemory = 1539, check = true

In [39]:
# Plotting and statistics packages (`mean` in the figure cells comes from Statistics)
using Plots, Statistics;
# Switch Plots to its Plotly backend for the figures that follow
plotly();

└ @ Plots D:\Scoop\apps\julia\.julia\packages\Plots\dNEbX\src\Plots.jl:28


In [42]:
# Figure for conclusion 1: mean compression ratio η of the .txt files versus
# codeword length. One inner vector per codeword length (10, 12, 14, 16 bits),
# holding the rounded η values printed for the four .txt files.
plot(collect(10:2:16),
    map(mean, [[.621,.613,.175,.018],[.521,.444,.21,.021],[.608,.327,.245,.025],[.695,.374,.28,.028]]),
    ylabel="η", label="对txt文件的平均压缩率", xlabel="码字长度"
)

In [48]:
# Figure for conclusion 2: mean compression ratio per media category versus
# codeword length. Each row of `y` is one plotted series, in the order of the
# labels; each column is one codeword length (10, 12, 14, 16 bits). The first row is
# the mean of two text-file ratios per codeword length.
x = collect(10:2:16);
y = vcat(
    map(mean, [[.175,.018],[.21,.021],[.245,.025],[.28,.028]])',
    [.331 .27 .24 .22; .0036 .0043 .005 .0058; .657 .734 .755 .747],
);
plot(x, y', ylabel="平均压缩率", label=["文本" "音频" "图像" "视频"], xlabel="码字长度")

In [49]:
# Figure for conclusion 3: mean compression ratio per media category versus
# codeword length. Each row of `y` is one plotted series, in the order of the
# labels; each column is one codeword length (10, 12, 14, 16 bits).
x = collect(10:2:16);
y = [
    1.09  1.26  1.369 1.493
    1.207 1.372 1.413 1.376
    1.167 1.277 1.294 1.327
    1.174 1.362 1.442 1.38
];
plot(x, y', ylabel="平均压缩率", label=["文本" "音频" "图像" "视频"], xlabel="码字长度")