# Preprocessing 

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

## Dataset Specific Preprocessing

### Monitor

[Monitor benchmark dataset](https://github.com/crottyan/mgbench) is mostly used for benchmarking database engines (e.g. MySQL index creation). As a result, the dataset is huge (~7GB), exceeding my hardware limits.

* Total remaining rows: 2,202,375
* Columns: 18

In [4]:
# Load the raw Monitor benchmark CSV into a DataFrame
monitor = pd.read_csv(filepath_or_buffer='../storage/datasets/monitor_bench1.csv')

In [None]:
# Reduce the dataset in a single chain:
#   1. keep only the leading 10% of the rows,
#   2. drop the categorical columns,
#   3. drop every row that still contains a NaN.
monitor = (
    monitor
    .head(int(len(monitor) * 0.1))
    .drop(columns=['log_time', 'machine_name', 'machine_group'])
    .dropna()
)

# Print the dtypes so we can check by eye that only numerical (float) columns remain
print(monitor.dtypes)

# Write the result as a bare CSV: no pandas index column and no header row
monitor.to_csv(
    path_or_buf='../storage/datasets/monitor_preprocessed_0_1_fraction.csv',
    header=False,
    index=False,
)

### Berkeley

[Berkeley](https://www.kaggle.com/divyansh22/intel-berkeley-research-lab-sensor-data) contains data from 51 environmental sensors in the Berkeley Research Lab between February 28th and April 5th, 2004.

* Total rows: 2,219,803
* Columns: 8

In [31]:
# Read the space-delimited Berkeley file; header=None treats the first line as data
berkeley = pd.read_csv(
    filepath_or_buffer='../storage/datasets/berkeley.csv',
    header=None,
    sep=' ',
)

In [32]:
# Discard every row that has at least one missing value (~4% of the dataset)
berkeley = berkeley.dropna(axis=0, how='any')

In [33]:
# Transform the date column from yyyy-mm-dd to whole days elapsed since the
# start of the measurements (2004-02-28).
# The format uses '-' separators so that it matches the yyyy-mm-dd layout;
# recent pandas releases match `format` strictly, so a '/'-separated format
# would be rejected for '-'-separated dates.
# NOTE(review): assumes column 0 really is '-'-separated — confirm against the data file.
year_dates = pd.to_datetime(berkeley.iloc[:, 0], format='%Y-%m-%d')
basedate = pd.Timestamp('2004-02-28')
# Vectorized day difference instead of a per-row Python lambda
days_elapsed = (year_dates - basedate).dt.days
berkeley[0] = days_elapsed

In [46]:
# Transform the time-of-day column into whole minutes elapsed since midnight:
# parse it as a timestamp, subtract that timestamp's midnight, express the
# difference in minutes and truncate to an integer.
hour = pd.to_datetime(berkeley.iloc[:, 1])
minutes_elapsed = hour - hour.dt.normalize()
minutes_elapsed = (minutes_elapsed / pd.Timedelta(minutes=1)).astype(int)
berkeley[1] = minutes_elapsed

In [51]:
# Persist the processed Berkeley data as a bare CSV: no index column, no header row
berkeley.to_csv(
    path_or_buf='../storage/datasets/berkeley_processed.csv',
    header=False,
    index=False,
)

### Corel

[Corel histogram dataset](https://kdd.ics.uci.edu/databases/CorelFeatures/CorelFeatures.data.html) consists of histogram color values of 68,040 photo images from various categories.


* Total rows: 68,040
* Columns: 32

In [60]:
# Load the space-delimited Corel file, using its first column as the index
corel = pd.read_csv(
    filepath_or_buffer='../storage/datasets/corel.csv',
    sep=' ',
    header=None,
    index_col=0,
)

# Corel needs no dataset-specific preprocessing; writing with index=False
# simply leaves the index column out of the stored file.
corel.to_csv(
    path_or_buf='../storage/datasets/corel_preprocessed.csv',
    header=False,
    index=False,
)

## DeepSqueeze Preprocessing Pipeline

DeepSqueeze preprocessing consists of two steps:
1. Scaling in range [0, 1]
2. Quantization based on a user-defined threshold

In [90]:
def ds_preprocessing(x, error_threshold, min_val=0, max_val=1):
    """Scale ``x`` into ``[min_val, max_val]`` and quantize it to bin centres.

    The scaled range is cut into bins of width ``2 * error_threshold`` that
    start at ``min_val``; every scaled value is replaced by the centre of the
    bin it falls into, so it moves by at most ``error_threshold``.

    Args:
        x: Data passed to ``MinMaxScaler.fit_transform`` (a 2-D array-like;
            scikit-learn scales each column independently).
        error_threshold: Maximum allowed deviation of a quantized value from
            its scaled value. Must be strictly positive.
        min_val: Lower bound of the scaled range.
        max_val: Upper bound of the scaled range.

    Returns:
        Array with the same shape as the scaled data, holding bin centres.

    Raises:
        ValueError: If ``error_threshold`` is not strictly positive.
    """
    if error_threshold <= 0:
        # A zero step makes np.arange fail and a negative one yields no bins
        raise ValueError('error_threshold must be strictly positive')

    bin_width = 2 * error_threshold

    # Scale in range [min_val, max_val]
    scaler = MinMaxScaler((min_val, max_val))
    processed = scaler.fit_transform(x)

    # Quantization: left edges of the bins, starting at min_val
    bins = np.arange(min_val, max_val, bin_width)
    digitized = np.digitize(processed, bins)

    # Floating-point jitter can push a scaled value marginally below min_val,
    # which np.digitize reports as index 0; keep every index on a real bin.
    digitized = np.clip(digitized, 1, bins.size)

    # Bin centre = left edge + half a bin. The left edge is offset by min_val
    # so the result stays inside the scaled range when min_val is not 0.
    quantized = min_val + (digitized - 1) * bin_width + error_threshold

    return quantized

# Run the DeepSqueeze pipeline on the Monitor subset. The path is the file
# written by the Monitor preprocessing cell of this notebook
# (monitor_preprocessed_0_1_fraction.csv), which is stored without a header row.
data_path = '../storage/datasets/monitor_preprocessed_0_1_fraction.csv'
arr = np.array(pd.read_csv(data_path, header=None))
proc = ds_preprocessing(arr, error_threshold=0.1, min_val=0, max_val=1)