# Error Analysis

In [72]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [73]:
# Read both header-less CSVs and keep them as plain NumPy matrices.
# `decompressed_file` is presumably the output of a compress/decompress round
# trip of `original_file` (inferred from the names; confirm against the pipeline).
original_file = pd.read_csv('../storage/demo/berkeley_original.csv', header=None).to_numpy()
decompressed_file = pd.read_csv('../storage/demo/berkeley_demo.csv', header=None).to_numpy()

# Error threshold used below to size the quantization bins
# (each bin is 2 * error_thr wide, relative to the value range).
error_thr = 0.005

**Error checking without scaling**

In [74]:
# Per-column value range of the original data; each column's bin width is
# expressed relative to this range.
col_minimums = np.min(original_file, axis=0)
col_maximums = np.max(original_file, axis=0)

# One array of bin edges per column, every bin spanning 2 * error_thr of the
# column's range. A constant column (maximum == minimum) has a zero range, which
# would hand np.arange a step of 0 and make it fail. Such a column instead gets
# two adjacent edges so that only values exactly equal to the constant land in
# the same bin (index 1); anything below maps to 0 and anything above to 2.
bins = [np.array([minim, np.nextafter(minim, np.inf)]) if maxim == minim
        else np.arange(minim, maxim, 2 * error_thr * (maxim - minim))
        for minim, maxim in zip(col_minimums, col_maximums)]

def quantize(arr, bins):
    """Replace every value of a 2-D array by the index of the bin it falls in.

    Each column of ``arr`` is digitized with its own set of bin edges.

    Parameters
    ----------
    arr : numpy.ndarray
        2-D array; columns are visited through ``arr.T``.
    bins : sequence of 1-D arrays
        ``bins[j]`` holds the bin edges passed to ``np.digitize`` for column
        ``j``. Columns and edge arrays are paired with ``zip``, so pairing
        stops at the shorter of the two.

    Returns
    -------
    numpy.ndarray
        Bin indices produced by ``np.digitize``, one output column per
        (edges, column) pair, laid out with rows and columns as in ``arr``.
    """
    per_column_indices = [
        np.digitize(column, edges) for edges, column in zip(bins, arr.T)
    ]
    return np.array(per_column_indices).T

In [75]:
# Quantize both matrices with the bin edges derived from the original data.
orig_quantized = quantize(original_file, bins)
decomp_quantized = quantize(decompressed_file, bins)

# The two matrices should exactly match: count the entries whose bin index differs.
non_zero_values = np.sum((orig_quantized - decomp_quantized) != 0)
# Scale the mismatch fraction by 100 so the printed number really is a
# percentage, as the label states (previously a 0-1 fraction was printed).
print(f"Percentage of non-zeros: {100 * non_zero_values / (orig_quantized.shape[0] * orig_quantized.shape[1])}")

Percentage of non-zeros: 0.0


**Error checking with scaling**

In [76]:
# Scaling: fit the min-max scaler on the original data only, then apply the
# same transformation to the decompressed data so both share one scale.
scaler = MinMaxScaler()
scaled_input = scaler.fit_transform(original_file)
scaled_decomp = scaler.transform(decompressed_file)

# Quantization: a single set of bin edges over [0, 1) with width 2 * error_thr
# is applied to every column of the scaled matrices.
bins_scaled = np.arange(0, 1, 2 * error_thr)

orig_quantized_scaled = np.digitize(scaled_input, bins_scaled)
decomp_quantized_scaled = np.digitize(scaled_decomp, bins_scaled)

# Count the entries whose bin index differs between the two matrices.
non_zero_values = np.sum((orig_quantized_scaled - decomp_quantized_scaled) != 0)
# Scale the mismatch fraction by 100 so the printed number really is a
# percentage, as the label states (previously a 0-1 fraction was printed).
print(f"Percentage of non-zeros: {100 * non_zero_values / (orig_quantized_scaled.shape[0] * orig_quantized_scaled.shape[1])}")

Percentage of non-zeros: 0.0


In [77]:
# Report the dimensions of the scaled, quantized original matrix.
quantized_shape = orig_quantized_scaled.shape
print(quantized_shape)

(2219803, 8)
