In [30]:
import numpy as np
import time
from tqdm.notebook import tqdm

## Part 1: Efficient mean & variance calculation

In [18]:
PATH_TO_FILE = './measurement_data_uint8.bin'

with open(PATH_TO_FILE, mode='rb') as file:
    data = file.read()

a. Naive approach

In [19]:
sumData = 0
for i in tqdm(data):
    sumData += i
naiveMean = sumData / len(data)
print(naiveMean)

  0%|          | 0/100000000 [00:00<?, ?it/s]

133.70021891


In [4]:
sumDiffSquare = 0
for i in tqdm(data):
    sumDiffSquare += (i - naiveMean)**2
naiveVariance = sumDiffSquare / len(data)
print(naiveVariance)

  0%|          | 0/100000000 [00:00<?, ?it/s]

1.8707706471619652


Side-notes: working with numpy and int array

In [5]:
intData = np.zeros(len(data))
for i in tqdm(range(0,len(data))):
    intData[i] = data[i]

  0%|          | 0/100000000 [00:00<?, ?it/s]

In [6]:
numpyMean = np.mean(intData)
print(numpyMean)

133.70021891


In [7]:
numpyVariance = np.var(intData)
print(numpyVariance)

1.87077064807842


b. Welford's algorithm 

In [20]:
def update(count, mean, M2, newValue):
    count += 1
    delta = newValue - mean
    mean += delta / count
    delta2 = newValue - mean
    M2 += delta * delta2
    return (count, mean, M2)

def finalize(count, mean, M2):
    if count < 2:
        return float("nan")
    else:
        (mean, variance, sampleVariance) = (mean, M2 / count, M2 / (count - 1))
        return (mean, variance, sampleVariance)

In [31]:
start = time.time()
count = 0
average = 0
M2 = 0

with open(PATH_TO_FILE, mode='rb') as f:
    while (byte := f.read(1)):
        intbyte = int.from_bytes(byte, "big")
        (count, average, M2) = update(count, average, M2, intbyte)

res = finalize(count,average,M2)

end = time.time()
print("time in seconds: ", end-start)

time in seconds:  -134.9626064300537
