In [None]:
# hide
# default_exp core.block_types
import os
from nbdev.showdoc import *
# If `settings.ini` is not found in the current working directory, move one level up
# (presumably to the project root, when the notebook is run from a sub-folder).
if not os.path.exists('settings.ini'):
    os.chdir('..')

# Block types

> Types of blocks

In [None]:
#export
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import joblib
import pickle
from pathlib import Path
from IPython.display import display
from typing import Optional

# graphviz is an optional dependency: `imported_graphviz` records whether it is available.
try:
    from graphviz import *
    imported_graphviz = True
except ImportError:
    # only a missing / broken installation is tolerated; any other exception
    # (e.g. KeyboardInterrupt) is propagated instead of being silently swallowed
    imported_graphviz = False

# block_types
from block_types.core.data_conversion import DataConverter, PandasConverter
from block_types.core.utils import save_csv, save_parquet, save_multi_index_parquet, save_keras_model, save_csv_gz, read_csv, read_csv_gz
from block_types.core.utils import DataIO, SklearnIO, NoSaverIO, ModelPlotter
from block_types.core.utils import camel_to_snake
from block_types.config import defaults as dflt
from block_types.utils.utils import set_logger

## Component

In [None]:
#export

class Component (ClassifierMixin, TransformerMixin, BaseEstimator):
    """
    Base component class used in our Pipeline.

    A component optionally wraps an `estimator` and relies on three helper objects:
    `data_converter` (converts data before / after fitting and transforming),
    `data_io` (loads / saves the fitted estimator and the results) and
    `model_plotter` (provides information for pipeline diagrams). Subclasses
    customize the behavior by overriding `_fit`, `_transform` and `_predict`.
    """
    def __init__ (self,
                  estimator=None,
                  name: Optional[str] = None,
                  data_converter: Optional[DataConverter] = None,
                  data_io: Optional[DataIO] = None,
                  model_plotter: Optional[ModelPlotter] = None,
                  logger=None,
                  verbose: int = 0,
                  **kwargs):

        """
        Initialize attributes and fields.

        Parameters
        ----------
        estimator : estimator (classifier or transformer) or None, optional
            Estimator being wrapped.
        name : str or None, optional
            Name of component. If not provided, it is inferred from the name of the
            estimator's class, or the name of the custom class defining the component.
        data_converter : DataConverter or None, optional
            Converts incoming data to format expected by component, and convert
            outgoing result to format expected by caller.
        data_io : DataIO or None, optional
            Manages data serialization and deserialization.
        model_plotter : ModelPlotter or None, optional
            Helper object that allows to retrieve information to be shown about this
            component, as part of a Pipeline diagram.
        logger : logging.logger or None, optional
            Logger used to write messages
        verbose : int, optional
            Verbosity, 0: warning or critical, 1: info, 2: debug.
        **kwargs : dict
            Additional parameters forwarded to the `DataIO`, `PandasConverter` and
            `ModelPlotter` objects that are created here when not given explicitly.
        """

        # keep the constructor parameter as an attribute: sklearn's `get_params`
        # (used by `repr` and `clone`) reads every named __init__ parameter from `self`
        self.verbose = verbose

        # logger used to display messages
        if logger is None:
            self.logger = set_logger ('block_types', verbose=verbose)
        else:
            self.logger = logger

        # name of current component, for logging and plotting purposes
        self._determine_component_name (name, estimator)

        # object that manages loading / saving
        if data_io is None:
            self.data_io = DataIO (component=self, **kwargs)
        else:
            self.data_io = data_io
            self.data_io.setup (self)

        # estimator (ML model)
        self.estimator = estimator

        # data converter
        if data_converter is None:
            self.data_converter = PandasConverter (**kwargs)
        else:
            self.data_converter = data_converter

        # plotting model component
        if model_plotter is None:
            self.model_plotter = ModelPlotter (component=self, **kwargs)
        else:
            self.model_plotter = model_plotter
            self.model_plotter.set_component (self)

    def _determine_component_name (self, name: Optional[str], estimator) -> None:
        """
        Determines an appropriate name for the component if not provided by input.

        If not provided, it is inferred from the name of the estimator's class, or
        the name of the custom class defining the component.

        Sets the attributes `class_name` and `name`.
        """
        self.class_name = self.__class__.__name__

        # `__all__` is only defined in the exported module; fall back to an empty
        # collection elsewhere (e.g. when this cell runs inside the notebook)
        exported_names = globals().get ('__all__', ())
        if (self.class_name in exported_names) and (estimator is not None):
            # generic wrapper class from this module: the estimator's class is more informative
            self.class_name = estimator.__class__.__name__

        if name is not None:
            self.name = name
        else:
            self.name = camel_to_snake (self.class_name)

    def fit (self, X, y=None, load=True, save=True):
        """
        Estimates the parameters of the component based on given data X and labels y.

        Uses the previously fitted parameters if they're found in disk and overwrite
        is False.

        Parameters
        ----------
        X : data used for fitting, converted by `data_converter` before use.
        y : labels or None, optional
        load : bool, optional
            If True (and `data_io.overwrite` is False), try to load a previously
            fitted estimator instead of fitting.
        save : bool, optional
            If True, the estimator is saved through `data_io` after fitting.

        Returns
        -------
        self
        """
        self.logger.info (f'fitting {self.name}')

        previous_estimator = None
        if load and not self.data_io.overwrite:
            previous_estimator = self.data_io.load_estimator()

        if previous_estimator is None:
            X, y = self.data_converter.convert_before_fitting (X, y)
            self._fit (X, y)
            self.data_converter.convert_after_fitting (X)
            if save:
                self.data_io.save_estimator ()
        else:
            self.estimator = previous_estimator
            self.logger.info (f'loaded pre-trained {self.name}')
        return self

    def transform (self, X, load=True, save=True):
        """
        Transforms the data X and returns the transformed data.

        Uses the previously transformed data if it's found in disk and overwrite
        is False.
        """
        self.logger.info (f'applying {self.name} transform')
        result = self._compute_result (X, self._transform, load=load, save=save)
        return result

    def predict (self, X, load=True, save=True):
        """
        Predicts binary labels and returns result.

        Uses previously stored predictions if found in disk and overwrite is False.
        """
        self.logger.info (f'applying {self.name} inference')
        return self._compute_result (X, self._predict, new_columns=['prediction'], load=load, save=save)

    def _compute_result (self, X, result_func, load=True, save=True, **kwargs):
        """
        Load a previously stored result or compute it with `result_func`.

        When computed, `X` is converted with `data_converter` before calling
        `result_func`, the output is converted back, and it is saved through
        `data_io` if `save` is True. `kwargs` are forwarded to the converter.
        """
        previous_result = None
        if load and not self.data_io.overwrite:
            previous_result = self.data_io.load_result()
        if previous_result is None:
            X = self.data_converter.convert_before_transforming (X, **kwargs)
            result = result_func (X)
            result = self.data_converter.convert_after_transforming (result, **kwargs)
            if save:
                self.data_io.save_result (result)
        else:
            result = previous_result
            self.logger.info ('loaded pre-computed result')
        return result


    def _fit (self, X, y=None):
        """Fit the wrapped estimator, if any. Subclasses may override."""
        if self.estimator is not None:
            self.estimator.fit (X, y)

    def _transform (self, X):
        """Transform `X` with the wrapped estimator. Subclasses without estimator must override."""
        if self.estimator is not None:
            return self.estimator.transform (X)
        else:
            raise NotImplementedError ('estimator is None _transform method probably needs to be implemented in subclass')

    def _predict (self, X):
        """
        Predict from `X` with the wrapped estimator. Subclasses without estimator must override.

        This method is required by `predict`; without it, calling `predict` on a
        component that does not override `_predict` fails with AttributeError.
        """
        if self.estimator is not None:
            return self.estimator.predict (X)
        else:
            raise NotImplementedError ('estimator is None _predict method probably needs to be implemented in subclass')

    def show_result_statistics (self, result=None, training_data_flag=False) -> None:
        """
        Show statistics of transformed data.

        Parameters
        ----------
        result: DataFrame or other data structure or None, optional
            Transformed data whose statistics we show. If not provided, it is loaded
            from disk.
        training_data_flag: bool, optional
            If True, transformed training data is loaded, otherwise transformed test
            data is loaded.
        """
        if result is None:
            self.set_training_data_flag (training_data_flag)
            df = self.data_io.load_result()
        else:
            df = result

        if df is not None:
            display (self.name)
            # only objects exposing a `describe` method (e.g. DataFrame) are summarized
            if callable(getattr(df, 'describe', None)):
                display (df.describe())

    def assert_equal (self, path_reference_results: str, assert_equal_func=pd.testing.assert_frame_equal, **kwargs):
        """
        Check whether the transformed data is the same as the reference data stored in given path.

        Parameters
        ----------
        path_reference_results: str
            Path where reference results are stored. The path does not include the
            file name, since this is stored as a field of data_io.
        assert_equal_func: function, optional
            Function used to check whether the values are the same. By default,
            `pd.testing.assert_frame_equal` is used, which assumes the data type is
            DataFrame.
        **kwargs : dict
            Additional arguments passed to `assert_equal_func`.
        """
        type_result = 'training' if self.data_io.training_data_flag else 'test'
        self.logger.info (f'comparing {type_result} results for {self.class_name}')

        self.logger.info (f'loading...')
        current_results = self.data_io.load_result ()
        # the reference file has the same name as the one used for the current results
        if self.data_io.training_data_flag:
            path_to_reference_file = Path(path_reference_results) / self.data_io.result_file_name_training
        else:
            path_to_reference_file = Path(path_reference_results) / self.data_io.result_file_name_test
        reference_results = self.data_io._load (path_to_reference_file, self.data_io.result_load_func)
        self.logger.info (f'comparing...')
        assert_equal_func (current_results, reference_results, **kwargs)
        self.logger.info (f'equal results\n')

    # ********************************
    # setters (all of them delegate to `data_io`)
    # ********************************
    def set_training_data_flag (self, training_data_flag):
        self.data_io.set_training_data_flag (training_data_flag)

    def set_save_result_flag_test (self, save_result_flag_test):
        self.data_io.set_save_result_flag_test (save_result_flag_test)

    def set_save_result_flag_training (self, save_result_flag_training):
        self.data_io.set_save_result_flag_training (save_result_flag_training)

    def set_save_result_flag (self, save_result_flag):
        self.data_io.set_save_result_flag (save_result_flag)

    def set_overwrite (self, overwrite):
        self.data_io.set_overwrite (overwrite)

    def set_save_fitting (self, save_fitting):
        self.data_io.set_save_fitting (save_fitting)

# ******************************************
# Subclasses of Component.
# Most of these are basically the same as GenericComponent, the only difference being that some parameters
# are overridden when constructing the object, to force a specific behavior
# ******************************************

In [None]:
# Display the documentation of `Component` and of its main public methods.
show_doc (Component, name='Component', title_level=3)
show_doc (Component.__init__, name='__init__', title_level=4)
show_doc (Component.fit, name='fit', title_level=4)
show_doc (Component.transform, name='transform', title_level=4)
show_doc (Component.predict, name='predict', title_level=4)
show_doc (Component.show_result_statistics, name='show_result_statistics', title_level=4)

## Sub-classes

In [None]:
#export
class SamplingComponent (Component):
    """
    Component whose transform method makes use of the labels.

    The data received by the transform method is assumed to carry the
    ground-truth labels in one of its columns. Thanks to this, the transform
    method can change the number of observations, i.e., the number of rows
    of both the data and the labels. See `PandasConverter` class in
    `block_types.core.data_conversion`.
    """
    def __init__ (self,
                  estimator=None,
                  transform_uses_labels=True,
                  **kwargs):

        # `transform_uses_labels` is always forced to True for this class,
        # regardless of the value received in the constructor
        forced_kwargs = dict (transform_uses_labels=True, **kwargs)
        super().__init__ (estimator=estimator, **forced_kwargs)

In [None]:
# Display the documentation of `SamplingComponent`.
show_doc (SamplingComponent, title_level=3)

In [None]:
#export
class SklearnComponent (Component):
    """
    Component whose estimator parameters are saved in pickle format.

    Convenience subclass for estimators whose fitted parameters can be stored
    in pickle format. The data transformed by this component is still saved
    in parquet format by default, since DataFrame is the default format.
    These defaults, however, can change. See `SklearnIO` class in
    `core.utils`.
    """
    def __init__ (self,
                  estimator=None,
                  transform_uses_labels=False,
                  **kwargs):

        # loading / saving is always handled by a SklearnIO object
        sklearn_io = SklearnIO (**kwargs)

        # parameters forced by this class: `data_io` and `transform_uses_labels`
        forced_kwargs = dict (data_io=sklearn_io, transform_uses_labels=False, **kwargs)
        super().__init__ (estimator=estimator, **forced_kwargs)

In [None]:
# Display the documentation of `SklearnComponent`.
show_doc (SklearnComponent, name = 'SklearnComponent', title_level=3)

In [None]:
#export
class NoSaverComponent (Component):
    """Component that does not save any data: its `data_io` is a `NoSaverIO` object."""
    def __init__ (self,
                  estimator=None,
                  **kwargs):

        # parameter forced by this class: `data_io`
        no_saver_io = NoSaverIO (**kwargs)
        forced_kwargs = dict (data_io=no_saver_io, **kwargs)
        super().__init__ (estimator=estimator, **forced_kwargs)

In [None]:
# Display the documentation of `NoSaverComponent` (the title previously read 'SklearnComponent').
show_doc (NoSaverComponent, name = 'NoSaverComponent', title_level=3)

In [None]:
#export
class OneClassSklearnComponent (SklearnComponent):
    """Component that uses only normal data (labelled with 0) for fitting parameters."""
    def __init__ (self,
                  estimator=None,
                  **kwargs):
        super().__init__ (estimator=estimator,
                          **kwargs)

    def _fit (self, X, y=None):
        """
        Fit the estimator using only the observations labelled as normal (y == 0).

        Parameters
        ----------
        X : data supporting boolean-mask indexing (e.g. DataFrame or array).
        y : labels; required, observations with label 0 are kept.

        Raises
        ------
        AssertionError
            If `y` or the estimator is not provided.
        """
        assert y is not None, 'y must be provided in OneClassSklearnComponent class'
        assert self.estimator is not None, 'estimator must be provided in OneClassSklearnComponent class'

        # filter the labels together with the data, so that the estimator receives
        # X and y with the same number of observations
        is_normal = (y == 0)
        self.estimator.fit (X[is_normal], y[is_normal])

In [None]:
# Display the documentation of `OneClassSklearnComponent`.
show_doc (OneClassSklearnComponent, name = 'OneClassSklearnComponent', title_level=3)

In [None]:
#export
# TODO: consider removing the following PandasComponent class, since we already have this as default in Component
class PandasComponent (Component):
    """Component that preserves the DataFrame format for incoming data and results.

    This class is redundant at the moment, since this is done by default by the
    `Component` class. We might remove this class in future versions.
    See `PandasConverter` in `core.data_conversion` for details on the data
    conversion performed.

    Parameters
    ----------
    estimator : estimator or None, optional
        Estimator being wrapped.
    data_converter : DataConverter or None, optional
        Converter to be used. If None, a `PandasConverter` is created from
        `transform_uses_labels`, `transformed_index`, `transformed_columns`
        and `kwargs`.
    transform_uses_labels, transformed_index, transformed_columns :
        Passed to `PandasConverter` when the converter is created here.
    **kwargs : dict
        Passed both to `PandasConverter` (if created) and to `Component`.
    """
    def __init__ (self,
                  estimator=None,
                  data_converter=None,
                  transform_uses_labels=False,
                  transformed_index=None,
                  transformed_columns=None,
                  **kwargs):

        # honor a converter supplied by the caller; previously it was silently
        # discarded and replaced by a new PandasConverter
        if data_converter is None:
            data_converter = PandasConverter (transform_uses_labels=transform_uses_labels,
                                              transformed_index=transformed_index,
                                              transformed_columns=transformed_columns,
                                              **kwargs)

        super().__init__ (estimator=estimator,
                          data_converter=data_converter,
                          **kwargs)

In [None]:
# Display the documentation of `PandasComponent`.
show_doc (PandasComponent, name='PandasComponent', title_level=3)

## Pipeline

> Base `Pipeline` class. For specific pipelines see `core.pipeline` module

In [None]:
#export
class Pipeline (SamplingComponent):
    """
    Pipeline composed of a list of components that run sequentially.

    During training, the components of the list are trained one after the other,
    where one component is fed the result of transforming the data with the list
    of components located before in the pipeline.

    The `Pipeline` class is a subclass of `SamplingComponent`, which itself is a
    subclass of `Component`. This provides the functionality of `Component`
    to any implemented pipeline, such as logging the messages, loading / saving the
    results, and convert the data format so that it can work as part of other
    pipelines with potentially other data formats.

    The `SamplingComponent` class allows the `transform` method to receive both data and
    labels. Furthermore, the Pipeline constructor sets `separate_labels=False` by default,
    which means that the `fit` method receives the labels combined with the data in the same
    input `X`. The rationale for this is to have some components in the pipeline that are
    `SamplingComponent` themselves. Such components need the labels to be passed to the
    `transform` method, and this method is called during `fit` for all the components except
    the last one.
    """
    def __init__ (self, separate_labels = False, **kwargs):
        """Assigns attributes and calls parent constructor.

        Parameters
        ----------
        separate_labels: bool, optional
            whether or not the fit method receives the labels in a separate `y` vector
            or in the same input `X`, as an additional variable. See description of
            Pipeline class for more details.
        **kwargs : dict
            Passed to the parent constructor.
        """

        # the list of components needs to exist before calling super().__init__(), since
        # the constructor of Component calls a method that is overridden in Pipeline,
        # and this method makes use of the components field
        self.components = []

        super().__init__ (separate_labels = separate_labels,
                          **kwargs)

        self.set_training_data_flag(False)

    @classmethod
    def create_pipeline(cls, components, **kwargs):
        """Create `Pipeline` object of class `cls`, given `components` list."""
        pipeline = cls(**kwargs)
        pipeline.components = components
        # propagate the flag again, now that the pipeline has its components
        pipeline.set_training_data_flag(False)
        return pipeline

    def _fit (self, X, y=None):
        """
        Fit components of the pipeline, given data X and labels y.

        By default, y will be None, and the labels are part of `X`, as a variable.
        All the components but the last one are fitted and then used to transform
        the data given to the next component; the last component is only fitted.
        """
        self.set_training_data_flag (True)
        for component in self.components[:-1]:
            X = component.fit_transform (X, y)
        self.components[-1].fit (X, y)
        # NOTE: the training data flag is left set to True after fitting

    def _predict (self, X):
        """Transform data with components of pipeline, and predict labels with last component.

        In the current implementation, we consider prediction a form of mapping,
        and therefore a special type of transformation."""
        self.set_training_data_flag (False)
        for component in self.components:
            X = component.transform (X)

        return X

    def construct_diagram (self, training_data_flag=None, include_url=False, port=4000, project='block_types'):
        """
        Construct diagram of the pipeline components, data flow and dimensionality.

        By default, we use test data to show the number of observations
        in the output of each component. This can be changed passing
        `training_data_flag=True`

        Parameters
        ----------
        training_data_flag : bool or None, optional
            See `get_training_data_flag` for the value used when None.
        include_url : bool, optional
            If True, each node links to `http://localhost:{port}/{project}/...`.
        port : int, optional
            Port used for building the URLs.
        project : str, optional
            Project name used for building the URLs.

        Returns
        -------
        graphviz Digraph object with the diagram.

        Raises
        ------
        ImportError
            If graphviz could not be imported.
        """
        # fail with an explicit message instead of an obscure NameError on `Digraph`
        if not imported_graphviz:
            raise ImportError ('graphviz needs to be installed for constructing the pipeline diagram')

        training_data_flag = self.get_training_data_flag (training_data_flag)

        # nodes have an empty URL unless include_url is True
        URL = ''
        if include_url:
            base_url = f'http://localhost:{port}/{project}'

        node_name = 'data'
        output = 'train / test'

        f = Digraph('G', filename='fsm2.svg')
        f.attr('node', shape='circle')

        f.node(node_name)

        f.attr('node', shape='box')
        for component in self.components:
            last_node_name = node_name
            last_output = output
            node_name = component.model_plotter.get_node_name()
            if include_url:
                URL = f'{base_url}/{component.model_plotter.get_module_path()}.html#{node_name}'
            f.node(node_name, URL=URL)
            # the edge is labelled with the output of the previous node
            f.edge(last_node_name, node_name, label=last_output)
            output = component.model_plotter.get_edge_name(training_data_flag=training_data_flag)

        last_node_name = node_name
        node_name = 'output'
        f.attr('node', shape='circle')
        f.edge(last_node_name, node_name, label=output)

        return f

    def show_result_statistics (self, training_data_flag=None):
        """
        Show statistics about results obtained by each component.

        By default, this is shown on test data, although this can change setting
        `training_data_flag=True`
        """
        training_data_flag = self.get_training_data_flag (training_data_flag)

        for component in self.components:
            component.show_result_statistics(training_data_flag=training_data_flag)

    def show_summary (self, training_data_flag=None):
        """
        Show list of pipeline components, data flow and dimensionality.

        By default, we use test data to show the number of observations
        in the output of each component. This can be changed passing
        `training_data_flag=True`
        """
        training_data_flag = self.get_training_data_flag (training_data_flag)

        for i, component in enumerate(self.components):
            node_name = component.model_plotter.get_node_name()
            output = component.model_plotter.get_edge_name(training_data_flag=training_data_flag)
            print (f'{"-"*100}')
            print (f'{i}: {node_name} => {output}')


    def get_training_data_flag (self, training_data_flag=None):
        """
        Return the training data flag to be used.

        If `training_data_flag` is None, the flag stored in `data_io` is used, or
        False if that one is None as well.
        """
        if training_data_flag is None:
            if self.data_io.training_data_flag is not None:
                training_data_flag = self.data_io.training_data_flag
            else:
                training_data_flag = False

        return training_data_flag

    def assert_equal (self, path_reference_results, assert_equal_func=pd.testing.assert_frame_equal, **kwargs):
        """Compare results stored in current run against reference results stored in given path."""

        for component in self.components:
            component.assert_equal (path_reference_results, assert_equal_func=assert_equal_func, **kwargs)
        self.logger.info ('both pipelines give the same results')
        print ('both pipelines give the same results')

    # *************************
    # setters (applied to the pipeline itself and to each of its components)
    # *************************
    def set_training_data_flag (self, training_data_flag):
        super().set_training_data_flag (training_data_flag)
        for component in self.components:
            component.set_training_data_flag (training_data_flag)

    def set_save_result_flag_test (self, save_result_flag_test):
        super().set_save_result_flag_test (save_result_flag_test)
        for component in self.components:
            component.set_save_result_flag_test (save_result_flag_test)

    def set_save_result_flag_training (self, save_result_flag_training):
        super().set_save_result_flag_training (save_result_flag_training)
        for component in self.components:
            component.set_save_result_flag_training (save_result_flag_training)

    def set_save_result_flag (self, save_result_flag):
        super().set_save_result_flag (save_result_flag)
        for component in self.components:
            component.set_save_result_flag (save_result_flag)

    def set_overwrite (self, overwrite):
        super().set_overwrite (overwrite)
        for component in self.components:
            component.set_overwrite (overwrite)

    def set_save_fitting (self, save_fitting):
        super().set_save_fitting (save_fitting)
        for component in self.components:
            component.set_save_fitting (save_fitting)

In [None]:
# Display the documentation of `Pipeline` and of its main public methods.
show_doc (Pipeline, title_level=3)
show_doc (Pipeline.__init__, name='__init__', title_level=4)
show_doc (Pipeline.construct_diagram, name='construct_diagram', title_level=4)
show_doc (Pipeline.show_summary, name='show_summary', title_level=4)
show_doc (Pipeline.show_result_statistics, name='show_result_statistics', title_level=4)
show_doc (Pipeline.assert_equal, name='assert_equal', title_level=4)

In [None]:
# export
def pipeline_factory (pipeline_class, **kwargs):
    """Creates a pipeline object given its class `pipeline_class`

    Parameters
    ----------
    pipeline_class : class or str
        Name of the pipeline class used for creating the object.
        This can be either of type string or class.
    **kwargs : dict
        Arguments passed to the constructor of the pipeline class.

    Returns
    -------
    Object of the indicated class.

    Raises
    ------
    ValueError
        If `pipeline_class` is neither a string nor a class.
    """
    if isinstance (pipeline_class, str):
        # NOTE(review): eval executes arbitrary expressions; `pipeline_class` must
        # come from a trusted source (e.g. our own configuration), never from user input
        class_to_build = eval (pipeline_class)
    elif isinstance (pipeline_class, type):
        # isinstance also accepts classes created with a metaclass (e.g. abc.ABC subclasses)
        class_to_build = pipeline_class
    else:
        raise ValueError (f'pipeline_class needs to be either string or class, we got {pipeline_class}')

    return class_to_build (**kwargs)

In [None]:
# Display the documentation of `pipeline_factory`.
show_doc (pipeline_factory, title_level=2)