In [1]:
import numpy as np
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import subprocess
import time
import pandas as pd
import os

### Read the data

In [2]:
with open("webpacks/webpack_bodies.json") as file:
    data = file.read()
    
splitted_data = data.split('{"page"')[1:]
splitted_data = [json.loads('{"page"' + line)["body"] for line in splitted_data]

for i in range(50):
    name = "0" * (12 - len(str(i))) + str(i)
    with open("webpacks/webpack_bodies_" + name + ".json") as file:
        data = file.read().split('{"page"')[1:]
        splitted_data += [json.loads('{"page"' + line)["body"] for line in data]

## Perform compression for bundled approach

In [17]:
def log(file, msg):
    f = open(file, 'a+')
    f.write(msg + '\n')
    f.close()
    
def get_seconds(time): 
    min_ind = time.find('m')
    mins = int(time[:min_ind])
    second = float(time[min_ind + 1:-1])
    return mins * 60 + second

In [219]:
rates_gzip_bundled = []
rates_brotli_bundled = []
times_gzip_bundled = []
times_brotli_bundled = []
speed_gzip_bundled = []
speed_brotli_bundled = []
init_sizes_bundled = []
for i in range(4000):
    rates_gzip_compressed = []
    rates_brotli_compressed = []
    times_gzip_compressed = []
    times_brotli_compressed = []
    speed_gzip_compressed = []
    speed_brotli_compressed = []

    # write the text of a bundle to file to use it for compression later
    with open("example.txt", "w") as file:
        file.write(splitted_data[i])
    size_non_compressed = os.stat("example.txt").st_size
    init_sizes_bundled.append(size_non_compressed)

    # do the gzip compression with different levels
    for level in range(4, 10):
        result = subprocess.run(["bash", "gzip_compress.sh", str(level), "time.txt", 
                                 "example_gzip.txt.gz", "example.txt"])
        #previous script saves the time into the file
        with open("time.txt") as file:
            user_sys = file.read().strip().split('\n')[1:]
        time = get_seconds(user_sys[0].split('\t')[1]) + get_seconds(user_sys[1].split('\t')[1])
        size_gzip_compressed = os.stat("example_gzip.txt.gz").st_size
        rates_gzip_compressed.append(size_non_compressed / size_gzip_compressed)
        times_gzip_compressed.append(time)
        speed_gzip_compressed.append(size_non_compressed / time)

    # do the brotli compression with different levels
    for level in range(4, 12):
        result = subprocess.run(["bash", "brotli_compress.sh", str(level), "time.txt", 
                                 "example_brotli.txt.br", "example.txt"])
        with open("time.txt") as file:
            user_sys = file.read().strip().split('\n')[1:]
        time = get_seconds(user_sys[0].split('\t')[1]) + get_seconds(user_sys[1].split('\t')[1])
        size_br_compressed = os.stat("example_brotli.txt.br").st_size
        rates_brotli_compressed.append(size_non_compressed / size_br_compressed)
        times_brotli_compressed.append(time)
        speed_brotli_compressed.append(size_non_compressed / time)
    
    rates_gzip_bundled.append(rates_gzip_compressed)
    rates_brotli_bundled.append(rates_brotli_compressed)
    times_gzip_bundled.append(times_gzip_compressed)
    times_brotli_bundled.append(times_brotli_compressed)
    speed_gzip_bundled.append(speed_gzip_compressed)
    speed_brotli_bundled.append(speed_brotli_compressed)
    
    if i != 0 and i % 50 == 0:
        log("logs.txt", "rates_gzip: " + str(np.mean(rates_gzip_bundled, axis=0)))
        log("logs.txt", "rates_brotli: " + str(np.mean(rates_brotli_bundled, axis=0)))
        log("logs.txt", "times_gzip: " + str(np.mean(times_gzip_bundled, axis=0)))
        log("logs.txt", "times_brotli: " + str(np.mean(times_brotli_bundled, axis=0)))
        log("logs.txt", "speed_gzip: " + str(np.mean(speed_gzip_bundled, axis=0)))
        log("logs.txt", "speed_brotli: " + str(np.mean(speed_brotli_bundled, axis=0)))

In [408]:
frame = pd.DataFrame()
frame["name"] = ["gzip 4", "gzip 5", "gzip 6", "gzip 7", "gzip 8", "gzip 9",
                 "brotli 4", "brotli 5", "brotli 6", "brotli 7", "brotli 8", "brotli 9", "brotli 10", "brotli 11"]

frame["rates"] = np.hstack((np.mean(rates_gzip_bundled, axis=0), np.mean(rates_brotli_bundled, axis=0)))
frame["savings"] = 1 - 1 / np.hstack((np.mean(rates_gzip, axis=0), np.mean(rates_brotli_bundled, axis=0)))
frame["speed(MB/s)"] = np.hstack((np.mean(speed_gzip_bundled, axis=0), np.mean(speed_brotli_bundled, axis=0))) / 1000000

frame

Unnamed: 0,name,rates,savings,speed(MB/s)
0,gzip 4,3.260182,0.726951,20.650285
1,gzip 5,3.357855,0.736465,17.32077
2,gzip 6,3.392349,0.739789,14.828584
3,gzip 7,3.402581,0.740678,13.650937
4,gzip 8,3.409667,0.741257,11.943546
5,gzip 9,3.41015,0.74129,11.749338
6,brotli 4,3.460357,0.711012,16.654836
7,brotli 5,3.714279,0.730769,11.635901
8,brotli 6,3.745654,0.733024,10.27009
9,brotli 7,3.771763,0.734872,7.60185


## Unbundled approach

In [214]:
rates_gzip_unbundled = []
rates_brotli_unbundled = []
times_gzip_unbundled = []
times_brotli_unbundled = []
speed_gzip_unbundled = []
speed_brotli_unbundled = []
init_sizes_unbundled = []

for i in range(600):
    # write the text of a bundle to file to use it for getting chunks from bundle later
    with open("bundle_analyzer/text_bundle.txt", "w") as file:
        file.write(splitted_data[i])
    try:
        # save chunks from bundle to parsed_bundle.json file
        result = subprocess.run(["node", "--experimental-modules", "bundle_analyzer/get_chunks.js"])
    except:
        continue
    # get chunks
    with open("parsed_bundle.json") as file:
        codes = [line['code'] for line in json.loads(file.read())]

    sizes_gzip_compressed = np.zeros(6)
    sizes_brotli_compressed = np.zeros(8)
    times_gzip_compressed = np.zeros(6)
    times_brotli_compressed = np.zeros(8)
    overall_init_size = 0

    for code in codes:
        if not code:
            continue
        # write the text of a bundle to file to use it for compression later
        with open("example.txt", "w") as file:
            file.write(code)
        overall_init_size += os.stat("example.txt").st_size

        # do the gzip compression with different levels
        for level in range(4, 10):
            result = subprocess.run(["bash", "gzip_compress.sh", str(level), "time.txt", 
                                     "example_gzip.txt.gz", "example.txt"])
            with open("time.txt") as file:
                user_sys = file.read().strip().split('\n')[1:]
            time = get_seconds(user_sys[0].split('\t')[1]) + get_seconds(user_sys[1].split('\t')[1])
            sizes_gzip_compressed[level - 4] += os.stat("example_gzip.txt.gz").st_size
            times_gzip_compressed[level - 4] += time

        # do the brotli compression with different levels
        for level in range(4, 12):
            result = subprocess.run(["bash", "brotli_compress.sh", str(level), "time.txt", 
                                     "example_brotli.txt.br", "example.txt"])
            with open("time.txt") as file:
                user_sys = file.read().strip().split('\n')[1:]
            time = get_seconds(user_sys[0].split('\t')[1]) + get_seconds(user_sys[1].split('\t')[1])
            sizes_brotli_compressed[level - 4] += os.stat("example_brotli.txt.br").st_size
            times_brotli_compressed[level - 4] += time

    rates_gzip_unbundled.append(overall_init_size / sizes_gzip_compressed)
    rates_brotli_unbundled.append(overall_init_size / sizes_brotli_compressed)
    times_gzip_unbundled.append(times_gzip_compressed)
    times_brotli_unbundled.append(times_brotli_compressed)
    speed_gzip_unbundled.append(overall_init_size / times_gzip_compressed)
    speed_brotli_unbundled.append(overall_init_size / times_brotli_compressed)
    init_sizes_unbundled.append(overall_init_size)
    
    if i != 0 and i % 100 == 0:
        log("logs2.txt", "rates_gzip: " + str(np.mean(rates_gzip_unbundled, axis=0)))
        log("logs2.txt", "rates_brotli: " + str(np.mean(rates_brotli_unbundled, axis=0)))
        log("logs2.txt", "times_gzip: " + str(np.mean(times_gzip_unbundled, axis=0)))
        log("logs2.txt", "times_brotli: " + str(np.mean(times_brotli_unbundled, axis=0)))
        log("logs2.txt", "speed_gzip: " + str(np.mean(speed_gzip_unbundled, axis=0)))
        log("logs2.txt", "speed_brotli: " + str(np.mean(speed_brotli_unbundled, axis=0)))

In [419]:
frame = pd.DataFrame()
frame["name"] = ["gzip 4", "gzip 5", "gzip 6", "gzip 7", "gzip 8", "gzip 9",
                 "brotli 4", "brotli 5", "brotli 6", "brotli 7", "brotli 8", "brotli 9", "brotli 10", "brotli 11"]

frame["rates_bundled"] = np.hstack((np.mean(rates_gzip_bundled[:600], axis=0), 
                                    np.mean(rates_brotli_bundled[:600], axis=0)))
frame["savings_bundled"] = 1 - 1 / np.hstack((np.mean(rates_gzip_bundled[:600], axis=0), 
                                              np.mean(rates_brotli_bundled[:600], axis=0)))
frame["speed_bundled(MB/s)"] = np.hstack((np.mean(speed_gzip_bundled[:600], axis=0), 
                                          np.mean(speed_brotli_bundled[:600], axis=0))) / 1000000

frame["rates_unbundled"] = np.hstack((np.mean(rates_gzip_unbundled, axis=0), 
                                      np.mean(rates_brotli_unbundled, axis=0)))
frame["savings_unbundled"] = 1 - 1 / np.hstack((np.mean(rates_gzip_unbundled, axis=0), 
                                                np.mean(rates_brotli_unbundled, axis=0)))
frame["speed_unbundled(MB/s)"] = np.hstack((np.mean(speed_gzip_unbundled, axis=0), 
                                            np.mean(speed_brotli_unbundled, axis=0))) / 1000000
frame

Unnamed: 0,name,rates_bundled,savings_bundled,speed_bundled(MB/s),rates_unbundled,savings_unbundled,speed_unbundled(MB/s)
0,gzip 4,3.352383,0.701705,20.010897,2.714575,0.631618,2.922456
1,gzip 5,3.4549,0.710556,17.20105,2.770296,0.639028,2.792084
2,gzip 6,3.491963,0.713628,14.601151,2.792162,0.641855,2.574496
3,gzip 7,3.502496,0.714489,13.574471,2.798391,0.642652,2.45116
4,gzip 8,3.510398,0.715132,11.716914,2.804235,0.643397,2.200946
5,gzip 9,3.510933,0.715175,11.625863,2.804699,0.643455,2.13814
6,brotli 4,3.565088,0.719502,16.478388,2.903769,0.65562,2.062052
7,brotli 5,3.829246,0.738852,11.519678,3.112397,0.678704,1.802857
8,brotli 6,3.863344,0.741157,10.184582,3.128816,0.68039,1.697186
9,brotli 7,3.892121,0.743071,7.572282,3.145081,0.682043,1.49576


### Group results by non compressed size ranges

In [404]:
# ranges are (20000, 100000), (100000, 1000000), (1000000, 3000000) in bytes
init_sizes_unbundled = np.array(init_sizes_unbundled)
group1 = np.where((init_sizes_unbundled > 2000)*(init_sizes_unbundled <= 100000))[0]
group2 = np.where((init_sizes_unbundled > 100000)*(init_sizes_unbundled <= 1000000))[0]
group3 = np.where((init_sizes_unbundled > 1000000)*(init_sizes_unbundled <= 3000000))[0]

print(20000, "-", 100000, "bytes")
frame = pd.DataFrame()
frame["name"] = ["gzip 4", "gzip 5", "gzip 6", "gzip 7", "gzip 8", "gzip 9",
                 "brotli 4", "brotli 5", "brotli 6", "brotli 7", "brotli 8", "brotli 9", "brotli 10", "brotli 11"]


frame["rates_bundled"] = np.hstack((np.mean(np.array(rates_gzip_bundled)[group1], axis=0), 
                                    np.mean(np.array(rates_brotli_bundled)[group1], axis=0)))
frame["savings_bundled"] = 1 - 1 / np.hstack((np.mean(np.array(rates_gzip_bundled)[group1], axis=0), 
                                              np.mean(np.array(rates_brotli_bundled)[group1], axis=0)))
frame["speed_bundled(MB/s)"] = np.hstack((np.mean(np.array(speed_gzip_bundled)[group1], axis=0), 
                                           np.mean(np.array(speed_brotli_bundled)[group1], axis=0))) / 1000000

frame["rates_unbundled"] = np.hstack((np.mean(np.array(rates_gzip_unbundled)[group1], axis=0), 
                                    np.mean(np.array(rates_brotli_unbundled)[group1], axis=0)))
frame["savings_unbundled"] = 1 - 1 / np.hstack((np.mean(np.array(rates_gzip_unbundled)[group1], axis=0), 
                                              np.mean(np.array(rates_brotli_unbundled)[group1], axis=0)))
frame["speed_unbundled(MB/s)"] = np.hstack((np.mean(np.array(speed_gzip_unbundled)[group1], axis=0), 
                                           np.mean(np.array(speed_brotli_unbundled)[group1], axis=0))) / 1000000

frame

20000 - 100000 bytes


Unnamed: 0,name,rates_bundled,savings_bundled,speed_bundled(MB/s),rates_unbundled,savings_unbundled,speed_unbundled(MB/s)
0,gzip 4,3.219473,0.68939,19.906293,2.639814,0.621185,2.741001
1,gzip 5,3.316049,0.698436,16.87547,2.688858,0.628095,2.636272
2,gzip 6,3.350974,0.701579,14.277811,2.703293,0.630081,2.391124
3,gzip 7,3.360854,0.702457,13.101073,2.708538,0.630797,2.357168
4,gzip 8,3.367366,0.703032,11.325969,2.713746,0.631506,2.101626
5,gzip 9,3.367827,0.703073,11.248899,2.714211,0.631569,2.081291
6,brotli 4,3.414766,0.707154,16.201073,2.794675,0.642177,1.889179
7,brotli 5,3.663572,0.727042,11.228977,2.98919,0.665461,1.65831
8,brotli 6,3.697175,0.729523,9.944989,3.0011,0.666789,1.575014
9,brotli 7,3.725454,0.731576,7.32478,3.012006,0.667995,1.380246


In [406]:
print(100000, "-", 1000000, "bytes")
frame = pd.DataFrame()
frame["name"] = ["gzip 4", "gzip 5", "gzip 6", "gzip 7", "gzip 8", "gzip 9",
                 "brotli 4", "brotli 5", "brotli 6", "brotli 7", "brotli 8", "brotli 9", "brotli 10", "brotli 11"]
frame["rates_bundled"] = np.hstack((np.mean(np.array(rates_gzip_bundled)[group2], axis=0), 
                                    np.mean(np.array(rates_brotli_bundled)[group2], axis=0)))
frame["savings_bundled"] = 1 - 1 / np.hstack((np.mean(np.array(rates_gzip_bundled)[group2], axis=0), 
                                              np.mean(np.array(rates_brotli_bundled)[group2], axis=0)))
frame["speed_bundled(MB/s)"] = np.hstack((np.mean(np.array(speed_gzip_bundled)[group2], axis=0), 
                                           np.mean(np.array(speed_brotli_bundled)[group2], axis=0))) / 1000000

frame["rates_unbundled"] = np.hstack((np.mean(np.array(rates_gzip_unbundled)[group2], axis=0), 
                                    np.mean(np.array(rates_brotli_unbundled)[group2], axis=0)))
frame["savings_unbundled"] = 1 - 1 / np.hstack((np.mean(np.array(rates_gzip_unbundled)[group2], axis=0), 
                                              np.mean(np.array(rates_brotli_unbundled)[group2], axis=0)))
frame["speed_unbundled(MB/s)"] = np.hstack((np.mean(np.array(speed_gzip_unbundled)[group2], axis=0), 
                                           np.mean(np.array(speed_brotli_unbundled)[group2], axis=0))) / 1000000

frame

100000 - 1000000 bytes


Unnamed: 0,name,rates_bundled,savings_bundled,speed_bundled(MB/s),rates_unbundled,savings_unbundled,speed_unbundled(MB/s)
0,gzip 4,3.255438,0.692822,21.709783,2.834153,0.647161,3.442193
1,gzip 5,3.352408,0.701707,18.111059,2.901749,0.65538,3.270738
2,gzip 6,3.385023,0.704581,15.608381,2.937161,0.659535,3.016377
3,gzip 7,3.395387,0.705483,14.415784,2.945246,0.66047,2.766044
4,gzip 8,3.402633,0.70611,12.668371,2.952941,0.661355,2.463421
5,gzip 9,3.403091,0.70615,12.492436,2.953505,0.661419,2.303074
6,brotli 4,3.461648,0.71112,17.202166,3.086969,0.676058,2.547228
7,brotli 5,3.710503,0.730495,11.96993,3.321406,0.698923,2.191857
8,brotli 6,3.740765,0.732675,10.554053,3.346479,0.701179,2.032891
9,brotli 7,3.764571,0.734365,7.812185,3.373016,0.703529,1.782097


In [407]:
print(1000000, "-", 3000000, "bytes")
frame = pd.DataFrame()
frame["name"] = ["gzip 4", "gzip 5", "gzip 6", "gzip 7", "gzip 8", "gzip 9",
                 "brotli 4", "brotli 5", "brotli 6", "brotli 7", "brotli 8", "brotli 9", "brotli 10", "brotli 11"]
frame["rates_bundled"] = np.hstack((np.mean(np.array(rates_gzip_bundled)[group3], axis=0), 
                                    np.mean(np.array(rates_brotli_bundled)[group3], axis=0)))
frame["savings_bundled"] = 1 - 1 / np.hstack((np.mean(np.array(rates_gzip_bundled)[group3], axis=0), 
                                              np.mean(np.array(rates_brotli_bundled)[group3], axis=0)))
frame["speed_bundled(MB/s)"] = np.hstack((np.mean(np.array(speed_gzip_bundled)[group3], axis=0), 
                                           np.mean(np.array(speed_brotli_bundled)[group3], axis=0))) / 1000000

frame["rates_unbundled"] = np.hstack((np.mean(np.array(rates_gzip_unbundled)[group3], axis=0), 
                                    np.mean(np.array(rates_brotli_unbundled)[group3], axis=0)))
frame["savings_unbundled"] = 1 - 1 / np.hstack((np.mean(np.array(rates_gzip_unbundled)[group3], axis=0), 
                                              np.mean(np.array(rates_brotli_unbundled)[group3], axis=0)))
frame["speed_unbundled(MB/s)"] = np.hstack((np.mean(np.array(speed_gzip_unbundled)[group3], axis=0), 
                                           np.mean(np.array(speed_brotli_unbundled)[group3], axis=0))) / 1000000

frame

1000000 - 3000000 bytes


Unnamed: 0,name,rates_bundled,savings_bundled,speed_bundled(MB/s),rates_unbundled,savings_unbundled,speed_unbundled(MB/s)
0,gzip 4,3.207758,0.688256,22.311194,2.806185,0.643644,1.718255
1,gzip 5,3.298249,0.696809,19.365311,2.861184,0.650494,1.701731
2,gzip 6,3.327129,0.699441,15.542296,2.875304,0.652211,1.687635
3,gzip 7,3.334811,0.700133,13.912655,2.88137,0.652943,1.516255
4,gzip 8,3.338078,0.700426,12.395455,2.884158,0.653278,1.667812
5,gzip 9,3.338087,0.700427,12.474473,2.884215,0.653285,1.642354
6,brotli 4,3.389592,0.704979,18.884871,2.948077,0.660796,1.000014
7,brotli 5,3.631806,0.724655,12.310412,3.142986,0.681831,1.002287
8,brotli 6,3.662058,0.726929,10.67777,3.153725,0.682915,0.976053
9,brotli 7,3.687079,0.728783,7.82411,3.164575,0.684002,0.983152


### Compare the results for each example

In [389]:
ratio_of_rates = []
ratio_of_times = []
for i in range(len(rates_gzip_unbundled)):
    ratio_of_rates.append(np.hstack((np.array(rates_gzip_bundled[i]) / np.array(rates_gzip_unbundled[i]),
                                     np.array(rates_brotli_bundled[i]) / np.array(rates_brotli_unbundled[i]))))
    ratio_of_times.append(np.hstack((np.array(times_gzip_bundled[i]) / np.array(times_gzip_unbundled[i]),
                                     np.array(times_brotli_bundled[i]) / np.array(times_brotli_unbundled[i]))))
    
frame = pd.DataFrame()
frame["name"] = ["gzip 4", "gzip 5", "gzip 6", "gzip 7", "gzip 8", "gzip 9",
                 "brotli 4", "brotli 5", "brotli 6", "brotli 7", "brotli 8", "brotli 9", "brotli 10", "brotli 11"]
frame["ratio of rates"] = np.mean(ratio_of_rates, axis=0)
frame["ratio of times"] = np.mean(ratio_of_times, axis=0)
frame

Unnamed: 0,name,ratio of rates,ratio of times
0,gzip 4,1.427097,0.632208
1,gzip 5,1.453109,0.749165
2,gzip 6,1.464445,0.894621
3,gzip 7,1.467382,0.906732
4,gzip 8,1.469844,1.163363
5,gzip 9,1.470023,1.15278
6,brotli 4,1.393734,0.490474
7,brotli 5,1.409129,0.684028
8,brotli 6,1.418572,0.757256
9,brotli 7,1.426384,1.009216
