In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging
from src.logging import logger
# Emit INFO-level (and more severe) messages from the project logger.
logger.setLevel(logging.INFO)

## Turning a `DataSource` into a `Dataset`
How do we turn raw data into something useful? There are 2 steps:
1. Write a function to extract meaningful `data` (and optionally, `target`) objects from your raw files, and
2. Wrap this function in the form of a **processing function**


First, let's grab the `DataSource` we created in the last notebook.


### Loading a `DataSource` from the Catalog

In [None]:
from src import workflow
from src.data import DataSource

In [None]:
# Display what the workflow reports as available; 'lvq-pak' is loaded by name in the next cell.
workflow.available_datasources()

In [None]:
dsrc = DataSource.from_name('lvq-pak')    # load the DataSource from the catalog by name
unpack_dir = dsrc.unpack()                # location of the unpacked files (used as a path below)

In [None]:
# IPython substitutes the Python variable `unpack_dir` into the shell command.
!ls -la $unpack_dir

### Processing Function Template
A processing function is a function that:
* takes at least 2 keyword arguments as input: `dataset_name` (a string) and `metadata` (a dict), and
* returns a dictionary with the following keys: `dataset_name`, `data`, `target` (optional), and `metadata`.

Here's a template:

In [None]:
def process_raw_data(dataset_name='raw_data', metadata=None, extract_func=None):
    """Process a raw dataset object (template for a processing function).

    Parameters
    ----------
    dataset_name: (string)
        Name of this raw dataset. This will be used as a key for accessing this raw dataset in the
        Raw Dataset catalog
    metadata: dict or None
        If None, an empty metadata dictionary will be used.
    extract_func: callable or None
        Zero-argument function returning a tuple: (data, target).
        If None, both `data` and `target` are returned as None.

    Returns
    -------
    Dictionary containing the following keys:
        dataset_name: (string)
            `dataset_name` that was passed to the function
        metadata: (dict)
            dict containing the input `metadata` key/value pairs, and (optionally)
            additional information about this raw dataset
        data: array-style object
            Often a `numpy.ndarray` or `pandas.DataFrame`
        target: (optional) vector-style object
            for supervised learning problems, the target vector associated with `data`
    """
    if metadata is None:
        metadata = {}

    # Generate `data` and `target` info. Without an extractor the template
    # returns empty placeholders, exactly as it did before `extract_func` existed.
    if extract_func is None:
        data, target = None, None
    else:
        data, target = extract_func()

    return {
        'dataset_name': dataset_name,
        'metadata': metadata,
        'data': data,
        'target': target,
    }

#### Example: Processing lvq-pak data
Bjørn has successfully fetched and extracted the lvq-pak data. Now he is ready to process it into `data` and `target`.

In [None]:
# Same lookup as in the catalog section above, repeated at the start of this example.
dsrc = DataSource.from_name('lvq-pak')    # load the DataSource from the catalog by name
unpack_dir = dsrc.unpack()                # location of the unpacked files

In [None]:
# `list_dir` was never imported in this notebook, so a fresh kernel raised NameError here.
# NOTE(review): assumes the helper lives in `src.utils` next to `head_file` -- confirm.
from src.utils import list_dir

list_dir(unpack_dir)  # what does the extracted data look like?

In [None]:
list_dir(unpack_dir / 'lvq_pak-3.1')  # the files are extracted into the `lvq_pak-3.1` subdirectory

In [None]:
# ex1.dat is used as the training file and ex2.dat as the test file.
datafile_train = unpack_dir / 'lvq_pak-3.1' / 'ex1.dat'
datafile_test = unpack_dir / 'lvq_pak-3.1' / 'ex2.dat'
# Displayed as the cell output: True only if both files are present.
datafile_train.exists() and datafile_test.exists()

In [None]:
from src.utils import head_file
# Layout: number of data columns, then a comment line, then space-delimited data.
print(head_file(datafile_train))

In [None]:
print(head_file(datafile_test)) # similar layout to the training file, but without the comment header line

In [None]:
import pandas as pd

In [None]:
def read_space_delimited(filename, skiprows=None, class_labels=True):
    """Read a space-delimited file

    Data is space-delimited. Last column is the (string) label for the data.
    Every field is read as a string (`dtype=str`); nothing is converted to numbers.

    Note: we can't use automatic comment detection, as `#` characters are also used as data labels.

    Parameters
    ----------
    filename: path-like
        File to read.
    skiprows: None or list
        list of rows to skip when reading the file.
    class_labels: boolean
        if true, the last column is treated as the class (target) label.
        Otherwise every column is data and `target` is a vector of zeros, one per row.

    Returns
    -------
    tuple: (data, target)
        data: 2-d array of strings, one row per line read
        target: 1-d array with one entry per row of `data`
    """
    # Local import: the notebook only imports numpy in a later cell, so relying
    # on a global `np` made the `class_labels=False` path depend on cell order.
    import numpy as np

    with open(filename, 'r') as fd:
        df = pd.read_table(fd, skiprows=skiprows, skip_blank_lines=True,
                           comment=None, header=None, sep=' ', dtype=str)

    # Truthiness rather than `is True`, so that 1 or numpy booleans are not
    # silently treated as "no labels".
    if class_labels:
        # targets are last column. Data is everything else
        target = df.iloc[:, -1].values
        data = df.iloc[:, :-1].values
    else:
        data = df.values
        target = np.zeros(data.shape[0])
    return data, target

In [None]:
# Skip the first two lines of the training file (the column count and the comment line).
data, target = read_space_delimited(datafile_train, skiprows=[0,1])
# Displayed as the cell output: the shapes of the data matrix and the label vector.
data.shape, target.shape

In [None]:
from src.paths import interim_data_path
import numpy as np

In [None]:
def process_lvq_pak(dataset_name='lvq-pak', metadata=None, kind='all'):
    """Process LVQ-data object

    Reads the files from `interim_data_path / dataset_name / 'lvq_pak-3.1'`.

    Parameters
    ----------
    dataset_name: (string)
        Name of this raw dataset. This will be used as a key for accessing this raw dataset in the
        Raw Dataset catalog
    metadata: dict or None
        If None, an empty metadata dictionary will be used.
    kind: {'train', 'test', 'all'}
        Whether to return training set (`ex1.dat`), test set (`ex2.dat`), or everything
        (training rows followed by test rows).

    Returns
    -------
    Dictionary containing the following keys:
        dataset_name: (string)
            `dataset_name` that was passed to the function
        metadata: (dict)
            dict containing the input `metadata` key/value pairs
        data: array-style object
            data columns as returned by `read_space_delimited`
        target: vector-style object
            the label column associated with `data`

    Raises
    ------
    ValueError
        If `kind` is not one of 'train', 'test', 'all'.
    """
    # (filename, rows to skip) for each file; ex1.dat has one more header line than ex2.dat.
    train_files = [('ex1.dat', [0, 1])]
    test_files = [('ex2.dat', [0])]
    files_by_kind = {
        'train': train_files,
        'test': test_files,
        'all': train_files + test_files,
    }
    # Validate before touching the filesystem so a typo fails fast and clearly.
    if kind not in files_by_kind:
        raise ValueError(f'Unknown kind: {kind}')

    if metadata is None:
        metadata = {}

    unpack_dir = interim_data_path / dataset_name / 'lvq_pak-3.1'

    parts = [read_space_delimited(unpack_dir / fname, skiprows=skip)
             for fname, skip in files_by_kind[kind]]
    if len(parts) == 1:
        data, target = parts[0]
    else:
        # Stack rows in file order: training rows first, then test rows.
        data = np.vstack([part_data for part_data, _ in parts])
        target = np.concatenate([part_target for _, part_target in parts])

    return {
        'dataset_name': dataset_name,
        'metadata': metadata,
        'data': data,
        'target': target,
    }

In [None]:
# Sanity check: call the processing function directly with its defaults (kind='all')
# and display the returned dictionary.
process_lvq_pak()

In [None]:
# Attach our processing function to the DataSource as its load_function.
dsrc.load_function = process_lvq_pak

In [None]:
ds = dsrc.process() # use the load_function set above to convert this DataSource into a real Dataset

In [None]:
# Print the Dataset's string representation to confirm it was built.
print(f"Built Dataset: {ds}")

In [None]:
# Presumably `kind` is forwarded to the load_function (process_lvq_pak) -- confirm against DataSource.process.
ds = dsrc.process(kind="test")  # test file only, so it should be half the size
print(ds)

#### EXERCISE: Process Mark's F-MNIST Data
In the last exercise, you fetched and unpacked F-MNIST data.
Now it's time to process it into a usable dataset.

## The `Dataset` and Data Transformations

### Tour of the Dataset Object

### Creating a Simple Transformer

### More Complicated Transformers

## Reproducible Data: The Punchline