In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import logging
from src.logging import logger
logger.setLevel(logging.INFO)

# Tutorial 2: Reproducible Data
*"Raw Data is Read Only. Sing it with me"*


* RawDataset
  * Fetching + Unpack
      * Example 1: lvq-pak
      * exercise: fmnist
  * Attaching metadata
      * Example 2: lvq-pak
      * exercise: fmnist
  * Processing data
    * Process into data, (optionally, target)
    * create a process_my_dataset() function
        * Example 3: lvq-pak
        * Exercise: fmnist
  * Save the raw dataset to the raw dataset catalog
      * the workflow module
      * example: adding lvq-pak
      * exercise: fmnist

* Datasets and Data Transformers
    * Create a transformer to produce a `Dataset` from the RawDataset
    * Add this dataset to the catalog
    * Load the dataset. 
    * Verify we get the same dataset when doing raw.process(), Dataset.from_raw, and Dataset.load() (look at hash)
        * example: lvq-pak
        * exercise: fmnist_test, fmnist_train
    
    * More Complicated Transformers
        * Example: 80/20 Train/Test Split on lvq-pak
        * Exercise: merge labels on lvq-pak (hierarchical categories)
        * Exercise: merge labels on fmnist
        
* Punchline: 
  * delete all the files  (raw, interim, processed).
  * make clean_raw, clean_cache, clean_processed, (clean_data?) `make data`
  * look: same hashes as before

# TODO: Split this into 2 notebooks: lvq-pak and fmnist

## Introducing the `RawDataset`
The `RawDataset` object handles downloading, unpacking, and processing raw data files, and serves as a container for some basic metadata, including **documentation** and **license** information.



Raw data files are downloaded to  `paths.raw_data_path`.
 Cache files and unpacked raw files are saved to `paths.interim_data_path`.
    

### Fetching and Unpacking Raw Data

#### LVQ-Pak: A Finnish Phonetic dataset

The Learning Vector Quantization (lvq-pak) project includes a simple Finnish phonetic dataset
consisting of 20-dimensional Mel Frequency Cepstrum Coefficients (MFCCs) labelled with target phoneme information. Our goal is to explore this dataset, process it into a useful form, and make it a part of a reproducible data science workflow. The project can be found at: http://www.cis.hut.fi/research/lvq_pak/



### Example: 
For this example, we are going to create a `RawDataset` by:
1. Downloading and unpacking the raw data files.
2. Generating (and recording) hash values for these files.
3. Adding relevant LICENSE and DESCR (description) metadata to this RawDataset.

In [None]:
from src.data import RawDataset
from src.utils import list_dir

In [None]:
raw_ds = RawDataset('lvq-pak')

In [None]:
raw_ds.add_url("http://www.cis.hut.fi/research/lvq_pak/lvq_pak-3.1.tar")

In [None]:
# Register the upstream README as this dataset's DESCR (description) entry.
# NOTE(review): `file_name` presumably sets the local name of the downloaded file — confirm in RawDataset.add_url.
raw_ds.add_url("http://www.cis.hut.fi/research/lvq_pak/README",
               file_name='lvq-pak.readme', name='DESCR')

In [None]:
license_txt = '''
************************************************************************
*                                                                      *
*                              LVQ_PAK                                 *
*                                                                      *
*                                The                                   *
*                                                                      *
*                   Learning  Vector  Quantization                     *
*                                                                      *
*                          Program  Package                            *
*                                                                      *
*                   Version 3.1 (April 7, 1995)                        *
*                                                                      *
*                          Prepared by the                             *
*                    LVQ Programming Team of the                       *
*                 Helsinki University of Technology                    *
*           Laboratory of Computer and Information Science             *
*                Rakentajanaukio 2 C, SF-02150 Espoo                   *
*                              FINLAND                                 *
*                                                                      *
*                      Copyright (c) 1991-1995                         *
*                                                                      *
************************************************************************
*                                                                      *
*  NOTE: This program package is copyrighted in the sense that it      *
*  may be used for scientific purposes. The package as a whole, or     *
*  parts thereof, cannot be included or used in any commercial         *
*  application without written permission granted by its producents.   *
*  No programs contained in this package may be copied for commercial  *
*  distribution.                                                       *
*                                                                      *
*  All comments concerning this program package may be sent to the     *
*  e-mail address 'lvq@nucleus.hut.fi'.                                *
*                                                                      *
************************************************************************
'''
raw_ds.add_metadata(contents=license_txt, kind='LICENSE')

In [None]:
# Raise log verbosity to DEBUG before fetching (the logger was set to INFO at the top of the notebook).
logger.setLevel(logging.DEBUG)
# Fetch the files registered on raw_ds via add_url() above.
raw_ds.fetch()

In [None]:
unpack_dir = raw_ds.unpack()

In [None]:
print(f'{unpack_dir}')
list_dir(unpack_dir)

In [None]:
# `workflow` is not imported anywhere earlier in this notebook, so import it
# here to keep the cell runnable from a fresh kernel.
# NOTE(review): assumes the module lives at `src.workflow`, alongside
# `src.data` / `src.utils` / `src.paths` — confirm against the project layout.
from src import workflow

workflow.add_raw_dataset(raw_ds)   # Add this raw dataset to the catalog
workflow.available_raw_datasets()  # Bare last expression: shown as the cell output

#### Exercise: Mark and F-MNIST
For this exercise, you are going to help Mark build a `RawDataset` out of the Fashion-MNIST files.

[Fashion-MNIST](https://github.com/zalandoresearch/fashion-mnist) is available from GitHub. Looking at the documentation there, we see that the raw data is distributed as a set of 4 files. The git repo specifies the checksums of these files:

| Name  | Content | Examples | Size | Link | MD5 Checksum|
| --- | --- |--- | --- |--- |--- |
| `train-images-idx3-ubyte.gz`  | training set images  | 60,000|26 MBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz)|`8d4fb7e6c68d591d4c3dfef9ec88bf0d`|
| `train-labels-idx1-ubyte.gz`  | training set labels  |60,000|29 KBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz)|`25c81989df183df01b3e8a0aad5dffbe`|
| `t10k-images-idx3-ubyte.gz`  | test set images  | 10,000|4.3 MBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz)|`bef4ecab320f06d8554ea6380940ec79`|
| `t10k-labels-idx1-ubyte.gz`  | test set labels  | 10,000| 5.1 KBytes | [Download](http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz)|`bb300cfdad3c16e7a12a480ee83cd310`|

Your mission is to build a `RawDataset` that downloads these raw files and verifies that the hash values are as expected. You should make sure to include metadata in this `RawDataset`, including **description** (DESCR) and **license** (LICENSE) information.

### Processing Raw Data
How do we turn raw data into something useful? There are 2 steps:
1. Write a function to extract meaningful `data` (and optionally, `target`) objects from your raw files, and
2. Wrap this function in the form of a **processing function**

#### Processing Function Template
A processing function is a function that 
* Takes at least 2 keyword arguments as input: `dataset_name` (a string) and `metadata` (a dict).
* Returns a dictionary with the following keys: `dataset_name`, `data`, `target` (optional), and `metadata`.

Here's a template:


In [None]:
def process_raw_data(dataset_name='raw_data', metadata=None):
    """Template for a raw-dataset processing function.

    Copy this function and replace the placeholder extraction step with code
    that builds `data` (and, optionally, `target`) from the raw files. As
    written, both are returned as None.

    Parameters
    ----------
    dataset_name: (string)
        Name of this raw dataset. This will be used as a key for accessing
        this raw dataset in the Raw Dataset catalog
    metadata: dict or None
        If None, an empty metadata dictionary will be used.

    Returns
    -------
    Dictionary containing the following keys:
        dataset_name: (string)
            `dataset_name` that was passed to the function
        metadata: (dict)
            the `metadata` dict that was passed in (or a new empty dict)
        data: array-style object
            Often a `numpy.ndarray` or `pandas.DataFrame`; None in this template
        target: (optional) vector-style object
            for supervised learning problems, the target vector associated
            with `data`; None in this template
    """
    # Build a fresh dict per call rather than sharing one mutable default.
    meta = {} if metadata is None else metadata

    # Placeholder extraction step. Replace with real logic, e.g.:
    #    data, target = extract_func()
    data = target = None

    return {
        'dataset_name': dataset_name,
        'metadata': meta,
        'data': data,
        'target': target,
    }

#### Example: Processing lvq-pak data
Bjørn has successfully fetched and extracted the lvq-pak data. Now he is ready to process it into `data` and `target`.

In [None]:
raw_ds = RawDataset.from_name('lvq-pak')    # load it from the catalog
unpack_dir = raw_ds.unpack()                # Find the location of the unpacked files

In [None]:
list_dir(unpack_dir) # what's the extracted data look like?

In [None]:
list_dir(unpack_dir / 'lvq_pak-3.1')  # Files are extracted to a subdirectory:

In [None]:
# The two example data files inside the extracted 'lvq_pak-3.1' subdirectory.
datafile_train = unpack_dir / 'lvq_pak-3.1' / 'ex1.dat'
datafile_test = unpack_dir / 'lvq_pak-3.1' / 'ex2.dat'
# Bare last expression: the cell displays True only if both files exist.
datafile_train.exists() and datafile_test.exists()

In [None]:
from src.utils import head_file
print(head_file(datafile_train)) # number of data columns, followed by comment, then space-delimited data

In [None]:
print(head_file(datafile_test)) # similar, but no comment header

In [None]:
import pandas as pd

In [None]:
def read_space_delimited(filename, skiprows=None, class_labels=True):
    """Read a space-delimited file into `data` and `target` arrays.

    Data is space-delimited. Last column is the (string) label for the data.
    Every field is read as a string (``dtype=str``); no numeric conversion is
    done here.

    Note: we can't use automatic comment detection, as `#` characters are also used as data labels.

    Parameters
    ----------
    filename: path-like
        File to read.
    skiprows: None or list
        list of rows to skip when reading the file.
    class_labels: boolean
        if true, the last column is treated as the class (target) label

    Returns
    -------
    (data, target): tuple
        data: 2-d array of strings, one row per line read. Holds every
            column except the last when `class_labels` is true, otherwise
            every column.
        target: 1-d array with one entry per row. The last column when
            `class_labels` is true, otherwise an array of zeros.
    """
    # Local import: in notebook order numpy is only imported in a later cell,
    # so the class_labels=False branch must not rely on a global `np`.
    import numpy as np

    with open(filename, 'r') as fd:
        df = pd.read_table(fd, skiprows=skiprows, skip_blank_lines=True,
                           comment=None, header=None, sep=' ', dtype=str)

    # The file is fully parsed at this point, so the handle is already closed.
    if class_labels:
        # targets are last column. Data is everything else
        target = df.iloc[:, -1].values
        data = df.iloc[:, :-1].values
    else:
        data = df.values
        target = np.zeros(data.shape[0])
    return data, target

In [None]:
# Skip rows 0 and 1: the column-count line and the comment line seen in the head_file() output above.
data, target = read_space_delimited(datafile_train, skiprows=[0,1])
data.shape, target.shape  # bare last expression: shown as the cell output

In [None]:
from src.paths import interim_data_path
import numpy as np

In [None]:
def process_lvq_pak(dataset_name='lvq-pak', metadata=None, kind='all'):
    """Process the unpacked lvq-pak files into `data` and `target`.

    Reads `ex1.dat` (training set) and/or `ex2.dat` (test set) from
    `interim_data_path / dataset_name / 'lvq_pak-3.1'` using
    `read_space_delimited`.

    Parameters
    ----------
    dataset_name: (string)
        Name of this raw dataset. This will be used as a key for accessing this raw dataset in the
        Raw Dataset catalog. It is also the name of the directory under
        `interim_data_path` that is searched for the unpacked files.
    metadata: dict or None
        If None, an empty metadata dictionary will be used.
    kind: {'train', 'test', 'all'}
        Whether to return training set, test set, or everything.
        For 'all', the training rows come first, followed by the test rows.

    Returns
    -------
    Dictionary containing the following keys:
        dataset_name: (string)
            `dataset_name` that was passed to the function
        metadata: (dict)
            the `metadata` dict that was passed in (or a new empty dict)
        data: array-style object
            2-d array as returned by `read_space_delimited`
        target: vector-style object
            the target (label) vector associated with `data`

    Raises
    ------
    ValueError
        If `kind` is not one of 'train', 'test', 'all'.
    """
    # Validate first so that a bad `kind` fails before any file access.
    if kind not in ('train', 'test', 'all'):
        raise ValueError(f'Unknown kind: {kind}')

    if metadata is None:
        metadata = {}

    unpack_dir = interim_data_path / dataset_name / 'lvq_pak-3.1'

    # (file name, leading rows to skip). ex1.dat has two leading rows to
    # skip, ex2.dat only one.
    sources = {
        'train': ('ex1.dat', [0, 1]),
        'test': ('ex2.dat', [0]),
    }
    wanted = ('train', 'test') if kind == 'all' else (kind,)
    parts = [
        read_space_delimited(unpack_dir / fname, skiprows=skip)
        for fname, skip in (sources[k] for k in wanted)
    ]

    if len(parts) == 1:
        data, target = parts[0]
    else:
        # Stack the rows and concatenate the labels in the same order.
        data = np.vstack([d for d, _ in parts])
        target = np.concatenate([t for _, t in parts])

    dset_opts = {
        'dataset_name': dataset_name,
        'metadata': metadata,
        'data': data,
        'target': target,
    }
    return dset_opts

In [None]:
# Smoke-test with the defaults (dataset_name='lvq-pak', kind='all'); the returned dict is shown as the cell output.
process_lvq_pak()

In [None]:
raw_ds.load_function = process_lvq_pak

In [None]:
ds = raw_ds.process() # Use the load_function to convert this RawDataset to a real Dataset

In [None]:
print(f"Built Dataset: {ds}")

In [None]:
ds = raw_ds.process(kind="test")  # Should be half the size
print(ds)

#### EXERCISE: Process Mark's F-MNIST Data
In the last exercise, you fetched and unpacked F-MNIST data.
Now it's time to process it into a usable dataset.

## The `Dataset` and Data Transformations

### Tour of the Dataset Object

### Creating a Simple Transformer

### More Complicated Transformers

## Reproducible Data: The Punchline