In [1]:
# hide
# default_exp core.block_types
from nbdev.showdoc import *
from block_types.utils.nbdev_utils import nbdev_setup, TestRunner

# project helper imported from block_types.utils.nbdev_utils;
# presumably prepares the notebook environment -- confirm in nbdev_utils
nbdev_setup ()
# runner used further down via `tst.run (..., tag='dummy')`; 'dummy' is the only target configured
tst = TestRunner (targets=['dummy'])
    
from block_types.core.block_types import __all__

# Block types

> Types of blocks

In [2]:
#export
from functools import partialmethod
from typing import Optional, Union
import copy
import pickle
from pathlib import Path
import re

from sklearn.utils import Bunch
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import joblib
from IPython.display import display

# block_types
from block_types.core.data_conversion import (DataConverter, NoConverter, PandasConverter, 
                                              StandardConverter, GenericConverter, 
                                              data_converter_factory)
from block_types.core.utils import (save_csv,  save_parquet,  save_multi_index_parquet, 
                                    save_keras_model,  save_csv_gz, read_csv, read_csv_gz)
from block_types.core.utils import DataIO, SklearnIO, PandasIO, NoSaverIO
from block_types.core.utils import data_io_factory
from block_types.core.utils import ModelPlotter, Profiler, Comparator
from block_types.core.utils import camel_to_snake, snake_to_camel
from block_types.utils.utils import (set_logger, delete_logger, replace_attr_and_store,  
                                     get_specific_dict_param, get_hierarchy_level)
import block_types.config.bt_defaults as dflt

In [3]:
#for tests
import pytest
import numpy as np
import os
import joblib
from sklearn.utils import Bunch
from pathlib import Path

import block_types.config.bt_defaults as dflt
from block_types.utils.utils import remove_previous_results, check_last_part
from block_types.core.data_conversion import DataConverter

## Component

In [4]:
#export
class Component ():
    """Base component class used in our Pipeline."""
    def __init__ (self,
                  estimator=None,
                  name: Optional[str] = None,
                  class_name: Optional[str] = None,
                  suffix: Optional[str] = None,
                  group: str = dflt.group,
                  root=None,
                  overwrite_field: bool = dflt.overwrite_field,
                  error_if_present: bool = dflt.error_if_present,
                  ignore:set = set(),
                  but: Union[str, list] = '',
                  data_converter: Optional[DataConverter] = None,
                  data_io: Optional[DataIO] = None,
                  model_plotter: Optional[ModelPlotter] = None,
                  profiler: Optional[Profiler] = None,
                  comparator: Optional[Comparator] = None,
                  apply = None,
                  direct_apply: bool = False,
                  direct_fit: bool = False,
                  direct_fit_apply: bool = False,
                  error_if_apply: bool = False,
                  error_if_fit: bool = False,
                  error_if_fit_apply: bool = False,
                  logger=None,
                  verbose: int = dflt.verbose,
                  name_logger:str = dflt.name_logger,
                  mode_logger:str = dflt.mode_logger,
                  **kwargs):
        
        """
        Initialize attributes and fields.
        
        Parameters
        ----------
        estimator : estimator (classifier or transformer) or None, optional
            Estimator being wrapped. It cannot be an instance of `Component`.
        name : str or None, optional
            Name of component. If not provided, it is inferred from the name of the 
            estimator's class, or the name of the custom class defining the component.
        class_name : str or None, optional
            Class name to use for this component. If not provided, it is inferred 
            by `_determine_component_name`.
        suffix : str or None, optional
            If provided, `_{suffix}` is appended to the name of the component.
        ignore : set or str, optional
            Names forwarded to `replace_attr_and_store` as `ignore`. A string is 
            split on commas. The object passed by the caller is never modified.
        but : str or list, optional
            Names forwarded to `replace_attr_and_store` as `but`, in addition to 
            the ones always excluded by this constructor.
        data_converter : DataConverter or None, optional
            Converts incoming data to format expected by component, and convert 
            outgoing result to format expected by caller.
        data_io : DataIO or None, optional
            Manages data serialization and deserialization.
        model_plotter : ModelPlotter or None, optional
            Helper object that allows to retrieve information to be shown about this 
            component, as part of a Pipeline diagram.
        profiler : Profiler or None, optional
            Object used for timing fit / apply calls. A `Profiler` is created if 
            not provided.
        comparator : Comparator, Comparator class or None, optional
            Object used by `assert_equal`. If a class is given, it is instantiated 
            with this component; if None, a `Comparator` is created.
        apply : callable or None, optional
            If provided, it is installed as the `_apply` function of the component.
        direct_apply : bool, optional
            If True, calling the component invokes the result function directly, 
            without data conversion, loading or saving.
        direct_fit : bool, optional
            If True and the component is a model, `fit` is bound directly to the 
            underlying fit function.
        direct_fit_apply : bool, optional
            If True and the component is a model, `fit_apply` and its aliases are 
            bound directly to the underlying fit_apply function.
        error_if_apply, error_if_fit, error_if_fit_apply : bool, optional
            If True, a RuntimeError is raised when the corresponding operation 
            is called on this component.
        logger : logging.logger or None, optional
            Logger used to write messages. If None, one is obtained with 
            `set_logger` using `name_logger`, `verbose` and `mode_logger`.
        verbose : int, optional
            Verbosity, 0: warning or critical, 1: info, 2: debug.
        **kwargs
            Additional configuration, filtered through `obtain_config_params` and 
            forwarded to the helper objects created here.
        """

        assert not isinstance(estimator, Component), 'estimator cannot be an instance of Component'
        
        # name of current component, for logging and plotting purposes
        self._determine_component_name (name, estimator, class_name=class_name, suffix=suffix, apply=apply)
        
        # obtain hierarchy_level
        self.hierarchy_level = get_hierarchy_level (base_class=Component)
        
        # store __init__ attrs into `self`
        but = ', '.join (but) if isinstance(but, list) else but
        but = (but + ', ') if len(but)>0 else but
        but = but + 'ignore, but, overwrite_field, error_if_present, path_results, path_models, apply'
        # Always work on a private copy of `ignore`: the signature default `set()` is 
        # a single object shared by every call, and a set supplied by the caller must 
        # not be modified behind their back by the `update` below.
        if isinstance (ignore, str):
            ignore = set(re.split(', *', ignore))
        elif ignore is None:
            ignore = set()
        else:
            ignore = set(ignore)
        ignore.update ({'name', 'class_name', 'suffix', 'apply'})
        replace_attr_and_store (base_class=Component, but=but, 
                                error_if_present=error_if_present, overwrite=overwrite_field, 
                                ignore=ignore)
        
        if self.logger is None:
            self.logger = set_logger (self.name_logger, verbose=self.verbose, mode=self.mode_logger)
            
        # obtain class-specific kwargs
        kwargs = self.obtain_config_params (**kwargs)

        # object that manages loading / saving
        if self.data_io is None:
            self.data_io = DataIO (component=self, **kwargs)
        else:
            # avoid passing `data_io` twice to the factory
            if 'data_io' in kwargs:
                del kwargs['data_io']
            self.data_io = data_io_factory (self.data_io, component=self, **kwargs)
            
        self.path_results = self.data_io.path_results
        self.path_models = self.data_io.path_models

        # data converter
        if self.data_converter is None:
            # TODO: have DataConverter store a reference to component, and use the logger from that reference.
            self.data_converter = GenericConverter (**kwargs)
        else:
            # avoid passing `data_converter` twice to the factory
            if 'data_converter' in kwargs:
                del kwargs['data_converter']
            self.data_converter = data_converter_factory (self.data_converter, 
                                                          **kwargs)
        # plotting model component
        if self.model_plotter is None:
            self.model_plotter = ModelPlotter (component=self, **kwargs)
        else:
            self.model_plotter.set_component (self)
            
        # profiling computational cost
        if self.profiler is None:
            self.profiler = Profiler (self, **kwargs)
        
        # comparing results against other implementations of this component
        if self.comparator is None:
            self.comparator = Comparator (self, **kwargs)
        elif type(self.comparator) is type:
            # a class was given instead of an instance: instantiate it
            self.comparator = self.comparator (self, **kwargs)
            
        # determine and assign apply and fit functions
        self.assign_apply_and_fit_functions (apply=apply)
    
    def __repr__ (self):
        """Return a short description built from `class_name` and `name`."""
        return 'Component {} (name={})'.format (self.class_name, self.name)
            
    def reset_logger (self):
        """Call `delete_logger` with this component's `name_logger`."""
        logger_name = self.name_logger
        delete_logger (logger_name)
        
    def get_specific_data_io_parameters (self, tag, **kwargs):
        """
        Select the keyword arguments that end in `_{tag}`.
        
        Only keys whose name without that ending is listed in 
        `DataIO.specific_params` are kept. The returned dictionary maps the 
        name without the ending to the original value.
        """
        ending = f'_{tag}'
        cut = len(ending)
        selected = {}
        for key, value in kwargs.items():
            if not key.endswith (ending):
                continue
            base_key = key[:-cut]
            if base_key in DataIO.specific_params:
                selected[base_key] = value
        return selected
    
    def obtain_config_params (self, tag=None, **kwargs):
        """Build the configuration of this component from global and specific parameters.
        
        The parameters in kwargs act as *global* parameters that can be shared by 
        multiple components. If `get_specific_dict_param` finds in kwargs a key that 
        designates a dictionary specific of this component, the entries of that 
        dictionary overwrite the global ones in the returned configuration; in that 
        case the result is built on a copy of kwargs.
        
        If `tag` is given, it is stored in `self.tag` (the special value '__name__' 
        stands for the name of the component), and the parameters returned by 
        `get_specific_data_io_parameters` for that tag are added as well.
        
        The entries `verbose` and `logger` are always set from this component.
        """
        specific_key = get_specific_dict_param (self, **kwargs)
        
        if specific_key is None:
            config = kwargs
        else:
            config = dict (kwargs)
            config.update (kwargs[specific_key])
            
        if tag is not None:
            self.tag = self.name if tag == '__name__' else tag
            tagged_params = self.get_specific_data_io_parameters (self.tag, **kwargs)
            config.update (tagged_params)
            
        config['verbose'] = self.verbose
        config['logger'] = self.logger
        
        return config

    def _determine_component_name (self, name: str, estimator, class_name:Optional[str]=None,
                                   suffix:Optional[str]=None, apply=None) -> None:
        """
        Set `self.class_name`, `self.name` and `self.suffix`.
        
        When `class_name` is not given, the name of the class of this object is 
        used. If that name is listed in `__all__`, it is replaced by the name of 
        the estimator's class (when an estimator is given) and then by the 
        camel-case version of `apply.__name__` (when `apply` has a `__name__`).
        
        When `name` is not given, it is the snake-case version of the class name. 
        A non-None `suffix` is appended to the name after an underscore.
        """
        if class_name is None:
            inferred = self.__class__.__name__
            if inferred in __all__:
                if estimator is not None:
                    inferred = estimator.__class__.__name__
                # the apply function, when named, has the last word
                if apply is not None and hasattr (apply, '__name__'):
                    inferred = snake_to_camel (apply.__name__)
            self.class_name = inferred
        else:
            self.class_name = class_name

        base_name = camel_to_snake (self.class_name) if name is None else name
        
        self.suffix = suffix
        self.name = base_name if suffix is None else f'{base_name}_{suffix}'
            
    def create_estimator (self, **kwargs):
        """Store in `self.estimator` a `Bunch` built from the keyword arguments."""
        new_estimator = Bunch(**kwargs)
        self.estimator = new_estimator
            
    def fit_like (self, *X, y=None, load=None, save=None, split=None,
                  func='_fit', validation_data=None, test_data=None, 
                  sequential_fit_apply=False, converter_args={}, **kwargs):
        """
        Estimates the parameters of the component based on given data X and labels y.
        
        Uses the previously fitted parameters if they're found in disk and load 
        is True.
        
        Parameters
        ----------
        *X : data
            Input data. If `y` is not None, it is appended as last positional item.
        y : labels or None, optional
        load, save : bool or None, optional
            Passed to the `can_load_*` / `can_save_*` methods of `data_io`.
        split : str or None, optional
            If given, the split of `data_io` is set to this value for the duration 
            of the call and restored afterwards, also when the call fails.
        func : {'_fit', '_fit_apply'}, optional
            Operation being performed.
        validation_data, test_data : data, tuple or None, optional
            Additional data passed by keyword to the fitting function.
        sequential_fit_apply : bool, optional
            Passed to the data converter when `func` is '_fit_apply'.
        converter_args : dict, optional
            Extra keyword arguments for the data converter when `func` is 
            '_fit_apply'. It is only read, never modified.
        **kwargs
            Passed to the fit_apply function. Not allowed when `func` is '_fit'.
            
        Returns
        -------
        The component itself when `func` is '_fit', the result otherwise.
        
        Raises
        ------
        RuntimeError
            If the operation is disabled through `error_if_fit` / `error_if_fit_apply`.
        ValueError
            If `func` is not one of the accepted values.
        AttributeError
            If kwargs are given and `func` is '_fit'.
        """
        self.profiler.start_timer ()
        if self.error_if_fit and func=='_fit': raise RuntimeError (f'{self.name} should not call fit')
        if self.error_if_fit_apply and func=='_fit_apply': 
            raise RuntimeError (f'{self.name} should not call fit_apply')
        X = self.data_converter.convert_single_tuple_for_fitting (X)
        X = X + (y, ) if y is not None else X
        
        # The split to restore is kept in a local variable, so that it cannot be 
        # overwritten by another call that also stores `self.original_split`.
        original_split = None
        if split is not None:
            original_split = self.data_io.split
            self.original_split = original_split # attribute kept for backward compatibility
            self.set_split (split)

        try:
            self.logger.info (f'fitting {self.name} (using {self.data_io.split} data)')
                
            previous_estimator = None
            if self.data_io.can_load_model (load):
                previous_estimator = self.data_io.load_estimator()
                
            # when fitting and applying, both the estimator and the result must be 
            # available for the computation to be skipped
            already_computed = False
            if previous_estimator is not None:
                if func=='_fit':
                    already_computed = True
                elif func=='_fit_apply':
                    previous_result = None
                    if self.data_io.can_load_result (load):
                        previous_result = self.data_io.load_result (split=split)
                    already_computed = previous_result is not None
                else:
                    raise ValueError (f'function {func} not valid')
                
            if not already_computed:
                X = copy.deepcopy (X) if self.data_converter.inplace else X
                if func=='_fit_apply': 
                    X = self.data_converter.convert_before_fit_apply (
                        *X, sequential_fit_apply=sequential_fit_apply, **converter_args)
                    X = self.data_converter.convert_no_tuple (X)
                elif func=='_fit':
                    X = self.data_converter.convert_before_fitting (*X)
                else:
                    raise ValueError (f'function {func} not valid')
                additional_data= self._add_validation_and_test (validation_data, test_data)
                if func=='_fit':
                    if len(kwargs) > 0: raise AttributeError (f'kwargs: {kwargs} not valid')
                    self.profiler.start_no_overhead_timer ()
                    self._fit (*X, **additional_data)
                elif func=='_fit_apply':
                    assert self.fit_apply_func is not None, ('object must have _fit_apply method or one of '
                                                        'its aliases implemented when func="_fit_apply"')
                    self.profiler.start_no_overhead_timer ()
                    result = self.fit_apply_func (*X, **additional_data, **kwargs)
                else:
                    raise ValueError (f'function {func} not valid')
                self.profiler.finish_no_overhead_timer (method=func, split=self.data_io.split)
                if func=='_fit':
                    _ = self.data_converter.convert_after_fitting (*X)
                elif func=='_fit_apply':
                    result = self.data_converter.convert_after_fit_apply (
                        result, sequential_fit_apply=sequential_fit_apply, **converter_args)
                    if self.data_io.can_save_result (save, split):
                        self.data_io.save_result (result, split=split)
                else:
                    raise ValueError (f'function {func} not valid')
                if self.data_io.can_save_model (save):
                    self.data_io.save_estimator ()
            else:
                self.set_estimator (previous_estimator)
                self.logger.info (f'loaded pre-trained {self.name}')
                if func=='_fit_apply':
                    result = previous_result
                    self.logger.info (f'loaded pre-computed result')
                
            self.profiler.finish_timer (method=func, split=self.data_io.split)
        finally:
            # restore the split also when something above raised, so that a failed 
            # call does not leave the component pointing to the temporary split
            if split is not None:
                self.set_split (original_split)
            
        if func=='_fit':
            return self
        else:
            return result
     
    # `fit` is `fit_like` with the `func` argument pre-bound to '_fit'
    fit = partialmethod (fit_like, func='_fit')
    
    def fit_apply (self, *X, y=None, load_model=None, save_model=None, 
                   load_result=None, save_result=None, func='_fit', 
                   validation_data=None, test_data=None, sequential_fit_apply=False, 
                   **kwargs):
        """
        Fit the component and apply it to the same data, returning the result.
        
        If a fit_apply function is available, the call is delegated to `fit_like` 
        with func='_fit_apply', using `load_model` and `save_model` as its `load` 
        and `save` arguments. Otherwise, `fit` is called followed by `apply` on 
        the object it returns; `load_result` and `save_result` are used for the 
        latter unless `direct_apply` is set.
        
        Raises RuntimeError if `error_if_fit_apply` is set.
        """
        if self.error_if_fit_apply:
            raise RuntimeError (f'{self.name} should not call fit_apply')
        
        X = self.data_converter.convert_single_tuple_for_fitting (X)
        if y is not None:
            X = X + (y, )
        
        # a dedicated fit_apply implementation takes precedence
        if self.fit_apply_func is not None:
            return self.fit_like (*X, 
                                  load=load_model, save=save_model, 
                                  func='_fit_apply', validation_data=validation_data,
                                  test_data=test_data, sequential_fit_apply=sequential_fit_apply,
                                  **kwargs)
        
        # in direct mode the underlying functions receive no extra arguments
        if self.direct_fit:
            kwargs_fit = dict()
        else:
            kwargs_fit = dict(load=load_model, save=save_model, 
                              validation_data=validation_data, 
                              test_data=test_data, 
                              sequential_fit_apply=sequential_fit_apply)
        if self.direct_apply:
            kwargs_apply = kwargs
        else:
            kwargs_apply = dict (load=load_result, save=save_result, fit_apply=True, 
                                 sequential_fit_apply=sequential_fit_apply, **kwargs)
        fitted = self.fit (*X, **kwargs_fit)
        return fitted.apply (*X, **kwargs_apply)
    
    def _add_validation_and_test (self, validation_data, test_data):
        """
        Build the dictionary of additional data passed to the fitting function.
        
        Parameters
        ----------
        validation_data, test_data : data, tuple or None
            Each of them can be None (it is then left out of the result), a 
            tuple `(X,)` or `(X, y)`, or any other object, taken as `X`.
            An empty tuple produces a warning and is handled as `X=None`.
            
        Returns
        -------
        dict
            It may have the keys 'validation_data' and 'test_data'. The value is 
            the tuple `(X, y)` when `y` is not None after conversion, or `X` 
            alone otherwise. Both are converted with 
            `data_converter.convert_before_fitting`.
            
        Raises
        ------
        ValueError
            If a tuple with more than 2 elements is given.
        """
        additional_data = {}
        def add_data (data, split_name):
            if data is None:
                return
            # default for any non-tuple object: it is the data, without labels
            newX, newy = data, None
            if isinstance(data, tuple):
                n_items = len(data)
                if n_items > 2:
                    raise ValueError (f'{split_name} must have at most 2 elements')
                if n_items == 0:
                    self.logger.warning (f'empty {split_name}')
                # `newy` is always bound, also for the empty tuple: this case used 
                # to fail with UnboundLocalError right after the warning
                newX = data[0] if n_items > 0 else None
                newy = data[1] if n_items == 2 else None
            newX, newy = self.data_converter.convert_before_fitting (newX, newy)
            if newy is not None:
                additional_data[split_name] = (newX, newy)
            else:
                additional_data[split_name] = newX
        
        add_data (validation_data, 'validation_data')
        add_data (test_data, 'test_data')
        
        return additional_data
    
    # aliases: `fit_transform` and `fit_predict` are bound to the same function as `fit_apply`
    fit_transform = fit_apply
    fit_predict = fit_apply

    def __call__ (self, *X, load=None, save=None, fit_apply=False, sequential_fit_apply=False, **kwargs):
        """
        Apply the component to the data X and return the result.
        
        If `direct_apply` is set, the result function is called right away with 
        X and kwargs. Otherwise the computation is done by `_compute_result`, 
        which uses the result previously stored in disk if it is found and 
        loading is enabled.
        
        Raises RuntimeError if `error_if_apply` is set and `direct_apply` is not.
        """
        self.profiler.start_timer ()
        
        # direct mode bypasses the checks and all the machinery below
        if self.direct_apply:
            return self.result_func (*X, **kwargs)
        
        if self.error_if_apply:
            raise RuntimeError (f'{self.name} should not call apply')
        assert self.result_func is not None, 'apply function not implemented'
        
        return self._compute_result (X, self.result_func, load=load, save=save, fit_apply=fit_apply, 
                                     sequential_fit_apply=sequential_fit_apply, **kwargs)

    def _assign_fit_func (self):
        """
        Set `fit_func` to the `_fit` method of this object or, if it has none, 
        to the `fit` method of the estimator. 
        
        `estimator_fit_func` is set to 'fit' only in the second case. Both 
        attributes are None if no fit function is found.
        """
        self.fit_func = None
        self.estimator_fit_func = None
        
        own_fit = getattr(self, '_fit', None)
        if callable(own_fit):
            self.fit_func = own_fit
            return
        
        if self.estimator is None:
            return
        estimator_fit = getattr(self.estimator, 'fit', None)
        if callable(estimator_fit):
            self.fit_func = estimator_fit
            self.estimator_fit_func = 'fit'
    
    def _assign_result_func (self):
        """
        Set `result_func` to the function that computes the result.
        
        The methods `_apply`, `_transform` and `_predict` of this object are 
        looked up first. Only if none of them is found, the methods `transform` 
        and `predict` of the estimator are considered, and the name of the 
        method used is stored in `estimator_result_func`.
        
        Raises AttributeError if more than one candidate is found.
        """
        found = []
        self.result_func = None
        self.estimator_result_func = None
        
        for method_name in ('_apply', '_transform', '_predict'):
            candidate = getattr(self, method_name, None)
            if callable(candidate):
                self.result_func = candidate
                found.append (candidate)
                
        # fall back to the estimator only when the component defines nothing
        if len(found)==0 and self.estimator is not None:
            for method_name in ('transform', 'predict'):
                candidate = getattr(self.estimator, method_name, None)
                if callable(candidate):
                    self.result_func = candidate
                    self.estimator_result_func = method_name
                    found.append (candidate)
                    
        if len(found) > 1:
            raise AttributeError (f'{self.class_name} must have only one of _transform, _apply, '
                                  f'or _predict methods implemented => found: {found}')
        
    def _assign_fit_apply_func (self):
        """
        Set `fit_apply_func` to the function that fits and computes the result.
        
        The methods `_fit_apply`, `_fit_transform` and `_fit_predict` of this 
        object are looked up first. Only if none of them is found, the methods 
        `fit_transform` and `fit_predict` of the estimator are considered, and 
        the name of the method used is stored in `estimator_fit_apply_func`.
        
        Raises AttributeError if more than one candidate is found.
        """
        found = []
        self.fit_apply_func = None
        self.estimator_fit_apply_func = None
        
        for method_name in ('_fit_apply', '_fit_transform', '_fit_predict'):
            candidate = getattr(self, method_name, None)
            if callable(candidate):
                self.fit_apply_func = candidate
                found.append (candidate)
                
        # fall back to the estimator only when the component defines nothing
        if len(found)==0 and self.estimator is not None:
            for method_name in ('fit_transform', 'fit_predict'):
                candidate = getattr(self.estimator, method_name, None)
                if callable(candidate):
                    self.fit_apply_func = candidate
                    self.estimator_fit_apply_func = method_name
                    found.append (candidate)
                    
        if len(found) > 1:
            raise AttributeError (f'{self.class_name} must have only one of fit_transform, fit_apply, '
                                  f'or fit_predict methods implemented => found: {found}')
        
    def assign_apply_and_fit_functions (self, apply=None):
        """
        Determine and assign apply and fit functions.
        
        If `apply` is given, it is installed as `_apply` before the lookup. 
        The component is considered a model (`is_model`) when it has a fit or 
        a fit_apply function. A component without fit function gets the no-op 
        `_fit_` as `_fit`, and one that is not a model also gets it as `fit`. 
        The `direct_*` flags bind the public methods to the underlying functions.
        """
        if apply is not None:
            self._apply = apply
            
        self._assign_result_func ()
        self._assign_fit_apply_func ()
        self._assign_fit_func ()
        
        has_fit = self.fit_func is not None
        self.is_model = has_fit or self.fit_apply_func is not None
        self._fit = self.fit_func if has_fit else self._fit_
        
        if self.direct_apply:
            self.set_apply (self.result_func)
            
        if not self.is_model:
            # nothing to estimate: fitting does nothing
            self.fit = self._fit_
            return
        
        if self.direct_fit:
            self.fit = self.fit_func
        if self.direct_fit_apply:
            self.set_fit_apply (self.fit_apply_func)
            
    # aliases for transform method: `apply` and `transform` are bound to the same function as `__call__`
    apply = __call__
    transform = __call__
    # `predict` is `__call__` with `converter_args` pre-bound to dict(new_columns=['prediction'])
    predict = partialmethod (__call__, converter_args=dict(new_columns=['prediction']))

    def _compute_result (self, X, result_func, load=None, save=None, split=None,
                         converter_args={}, fit_apply=False, 
                         sequential_fit_apply=False, **kwargs):
        """
        Compute the result of applying `result_func` to X, or load it from disk.
        
        Parameters
        ----------
        X : tuple
            Positional data received by the caller.
        result_func : callable
            Function that computes the result from the converted data.
        load, save : bool or None, optional
            Passed to `data_io.can_load_result` / `data_io.can_save_result`.
        split : str or None, optional
            If given, the split of `data_io` is set to this value for the duration 
            of the call and restored afterwards, also when the call fails.
        converter_args : dict, optional
            Extra keyword arguments for the data converter. It is only read, 
            never modified.
        fit_apply, sequential_fit_apply : bool, optional
            Passed to the data converter.
        **kwargs
            Passed to `result_func`.
            
        Returns
        -------
        The previously stored result if it could be loaded, or the newly 
        computed one after conversion by the data converter.
        """
        # The split to restore is kept in a local variable, so that it cannot be 
        # overwritten by another call that also stores `self.original_split`.
        original_split = None
        if split is not None:
            original_split = self.data_io.split
            self.original_split = original_split # attribute kept for backward compatibility
            self.set_split (split)
            
        try:
            self.logger.debug (f'applying {self.name} (on {self.data_io.split} data)')
                
            previous_result = None
            if self.data_io.can_load_result (load):
                previous_result = self.data_io.load_result (split=split)
            if previous_result is None:
                X = self.data_converter.convert_single_tuple_for_transforming (X)
                X = self.data_converter.convert_before_transforming (
                    *X, fit_apply=fit_apply, sequential_fit_apply=sequential_fit_apply, **converter_args)
                X = self.data_converter.convert_no_tuple (X)
                self.profiler.start_no_overhead_timer ()
                X = self.data_converter.convert_single_tuple_for_result_func (X)
                result = result_func (*X, **kwargs)
                self.profiler.finish_no_overhead_timer ('apply', self.data_io.split)
                result = self.data_converter.convert_after_transforming (
                    result, fit_apply=fit_apply, sequential_fit_apply=sequential_fit_apply, **converter_args)
                if self.data_io.can_save_result (save, split):
                    self.data_io.save_result (result, split=split)
            else:
                result = previous_result
                self.logger.info (f'loaded pre-computed result')
                
            self.profiler.finish_timer ('apply', self.data_io.split)
        finally:
            # restore the split also when something above raised, so that a failed 
            # call does not leave the component pointing to the temporary split
            if split is not None:
                self.set_split (original_split)
            
        return result

    def _fit_ (self, *X, **kwargs):
        """No-op fit: ignore all the arguments and return the component itself."""
        return self
                        
    def show_result_statistics (self, result=None, split=None) -> None:
        """
        Show statistics of transformed data.
        
        The name of the component is displayed, followed by the output of 
        `describe()`. Arrays and lists are converted to DataFrame for that. 
        Nothing is shown if there is no result, or for other types of result.
        
        Parameters
        ----------
        result: DataFrame or other data structure or None, optional
            Transformed data whose statistics we show. If not provided, it is loaded 
            with `load_result`.
        split: str or None, optional
            Passed to `load_result` when `result` is not provided.
        """
        df = self.load_result(split=split) if result is None else result
        if df is None:
            return
        
        display (self.name)
        if callable(getattr(df, 'describe', None)):
            display (df.describe())
        elif isinstance(df, (np.ndarray, list)):
            display (pd.DataFrame (df).describe())
                
    def remove_non_pickable_fields (self):
        """Do nothing in this class; returns None."""
        return None

    # ********************************
    # exposing some data_io and data_converters methods
    # ********************************
    def load_estimator (self):
        """Load the estimator through `data_io` and, if one is obtained, install it with `set_estimator`."""
        loaded = self.data_io.load_estimator ()
        if loaded is None:
            return
        self.set_estimator (loaded)
            
    def load_result (self, split=None, path_results=None, result_file_name=None):
        """Return what `data_io.load_result` returns for the given arguments."""
        io_kwargs = dict (split=split, path_results=path_results, 
                          result_file_name=result_file_name)
        return self.data_io.load_result (**io_kwargs)
    
    def assert_equal (self, item1, item2=None, split=None, raise_error=True, **kwargs):
        """Delegate the comparison to `comparator.assert_equal` and return its output."""
        compare = self.comparator.assert_equal
        return compare (item1, item2=item2, split=split, raise_error=raise_error, **kwargs)
        
    # ********************************
    # setters
    # ********************************
    def set_split (self, split):
        """Forward `split` to `data_io.set_split`."""
        io = self.data_io
        io.set_split (split)
    
    def set_save_splits (self, save_splits):
        """Forward `save_splits` to `data_io.set_save_splits`."""
        io = self.data_io
        io.set_save_splits (save_splits)

    def set_save_model (self, save_model):
        """Forward `save_model` to `data_io.set_save_model`."""
        io = self.data_io
        io.set_save_model (save_model)
        
    def set_load_model (self, load_model):
        """Forward `load_model` to `data_io.set_load_model`."""
        io = self.data_io
        io.set_load_model (load_model)
        
    def set_save_result (self, save_result):
        """Forward `save_result` to `data_io.set_save_result`."""
        io = self.data_io
        io.set_save_result (save_result)
        
    def set_load_result (self, load_result):
        """Forward `load_result` to `data_io.set_load_result`."""
        io = self.data_io
        io.set_load_result (load_result)
        
    def set_data_io (self, data_io, copy=False):
        """
        Replace the `data_io` object of this component.
        
        Parameters
        ----------
        data_io : DataIO
            New object managing loading / saving. Its `setup` method is called 
            with this component.
        copy : bool, optional
            If True, a shallow copy of `data_io` is stored instead of the object 
            itself, so that the original one is not set up with this component.
        """
        # The boolean parameter `copy` hides the module of the same name inside 
        # this method, so the module is imported here under another name. 
        # Before, `copy=True` failed with AttributeError on `True.copy`.
        import copy as copy_module
        self.data_io = copy_module.copy (data_io) if copy else data_io
        self.data_io.setup (self)

    def set_name (self, name):
        """Store the new name and pass it to `data_io.set_file_names`."""
        self.name = name
        io = self.data_io
        io.set_file_names (name)
        
    def set_estimator (self, estimator):
        """
        Store the estimator and re-bind the functions taken from it.
        
        Each of `result_func`, `fit_apply_func` and `fit_func` that was 
        originally obtained from the estimator (its method name is recorded in 
        the corresponding `estimator_*` attribute) is looked up again in the new 
        estimator, and must be callable. In the case of the fit function, 
        `_fit` is updated as well and the component must be a model.
        """
        self.estimator = estimator
        
        rebinding = (('estimator_result_func', 'result_func'),
                     ('estimator_fit_apply_func', 'fit_apply_func'),
                     ('estimator_fit_func', 'fit_func'))
        for name_attr, func_attr in rebinding:
            method_name = getattr (self, name_attr)
            if method_name is None:
                continue
            setattr (self, func_attr, getattr (self.estimator, method_name, None))
            assert callable (getattr (self, func_attr))
            
        if self.estimator_fit_func is not None:
            self._fit = self.fit_func
            assert self.is_model
            
    def set_apply (self, result_func):
        """Store `result_func` in the instance attributes `apply`, `__call__`, `transform` and `predict`."""
        for alias in ('apply', '__call__', 'transform', 'predict'):
            setattr (self, alias, result_func)
    
    def set_fit_apply (self, fit_apply_func):
        """Store `fit_apply_func` in the instance attributes `fit_apply`, `fit_transform` and `fit_predict`."""
        for alias in ('fit_apply', 'fit_transform', 'fit_predict'):
            setattr (self, alias, fit_apply_func)

### Configuring component with global and specific parameters

In [5]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_component_config ():
    """Check `obtain_config_params`, and saving with global and component-specific parameters."""
    import os
    import pytest
    from block_types.utils.utils import remove_previous_results
    
    # **********************************************************************
    # test obtain_config_params method
    # **********************************************************************
    component = Component(name='sky')
    global_config = dict(first=1,
                         second=2,
                         third=3,
                         sky=dict (second=4)
                        )
    resolved_config = component.obtain_config_params (**global_config)
    logger = set_logger (dflt.name_logger, verbose=dflt.verbose)
    expected_config = {'first': 1, 'second': 4, 'third': 3, 'sky': {'second': 4}, 
                       'verbose': dflt.verbose, 'logger': logger}
    assert resolved_config==expected_config
    # the dictionary given as input is left untouched
    assert global_config == {'first': 1, 'second': 2, 'third': 3, 'sky': {'second': 4}}

    # **********************************************************************
    # test that component saves results when using global 
    # parameter save=True
    # **********************************************************************
    class MyTransform (Component):
        def __init__ (self,**kwargs):
            super().__init__ (**kwargs)
            self.create_estimator ()

        def _fit (self, X, y=None):
            self.estimator.mu = X.mean()
        def _transform (self, X):
            return X-self.estimator.mu

    path_results = 'testing_configuration'
    X = np.array([[1,2,3],[4,5,6]])
    
    saving_transform = MyTransform (path_results=path_results,
                                    save = True)
    saving_transform.fit_transform(X)

    l = sorted(os.listdir(path_results))
    assert l==['models','whole'], f'found: {l}'

    # **********************************************************************
    # test that component does not save results when we 
    # use component-specific parameter MyTransform = dict(save=False)
    # **********************************************************************
    remove_previous_results (path_results)

    non_saving_io = SklearnIO(
                        path_results='testing_configuration',
                        save = True,
                        MyTransform = dict(save=False)
                    )
    non_saving_transform = MyTransform (data_io = non_saving_io)
    non_saving_transform.fit_transform(X)
    with pytest.raises(FileNotFoundError):
        os.listdir(path_results)

In [6]:
# run the test defined in the previous cell through the notebook's TestRunner, under the 'dummy' tag
tst.run (test_component_config, tag='dummy')

running test_component_config


### Recursively storing attrs across class hierarchy

In [7]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_component_store_attrs ():
    """Check that `__init__` arguments end up as attributes across a class hierarchy.

    The assertions below pin down: plain keyword arguments and defaults,
    class-specific overrides (`Final={...}`), group-specific overrides
    (`group='group_1', group_1={...}`), arguments that a subclass modifies
    before forwarding them to `super().__init__`, and the `ignore` argument.
    """
    # recursively storing __init__ attrs across hierarchy of classes
    class Intermediate (Component):
        def __init__ (self, x=3, y=4, **kwargs):
            super().__init__ (**kwargs)

    class Final (Intermediate):
        def __init__ (self, z=6, h=[2,3,5], **kwargs):
            super().__init__ (**kwargs)

    # explicit keyword arguments win, the remaining ones keep their defaults
    o = Final (x=9, h=[1,2,4])
    assert o.x==9 and o.y==4 and o.z==6 and o.h==[1,2,4]

    # a dictionary keyed by the class name `Final` overrides the plain `h` argument
    o = Final (y=7, z=10, h=[1,2,4], Final={'h': [9,11,10]})
    assert o.x==3 and o.y==7 and o.z==10 and o.h==[9,11,10]

    # only a dictionary keyed by the name of the instantiated class (Final)
    # is applied; a dictionary keyed by the parent class (Intermediate)
    # leaves the attributes untouched
    o = Final (y=7, z=10, h=[1,2,4], Intermediate={'y': 12})
    assert o.x==3 and o.y==7 and o.z==10 and o.h==[1,2,4]

    # NOTE(review): these two classes repeat the definitions at the top of
    # this test verbatim
    class Intermediate (Component):
        def __init__ (self, x=3, y=4, **kwargs):
            super().__init__ (**kwargs)

    class Final (Intermediate):
        def __init__ (self, z=6, h=[2,3,5], **kwargs):
            super().__init__ (**kwargs)

    # the dictionary named after the `group` value overrides `y` and `z`,
    # while `x` and `h` keep the values passed as keyword arguments
    o = Final (x=9, h=[1,2,4], group='group_1', group_1={'y': 10, 'z':60})
    assert o.x==9 and o.y==10 and o.z==60 and o.h==[1,2,4]


    # *******************
    # test using same field in B and in A, but
    # B passes that value to A in super(),
    # after modifying it
    # *****************
    class A (Component):
        def __init__ (self, x=3, path_results='test_recursive', **kwargs):
            path_results = f'{path_results}/another'
            super ().__init__ (path_results=path_results, error_if_present=True, 
                               **kwargs)

    class B (A):
        def __init__ (self, x=30, y=10, **kwargs):
            x = x*2
            super().__init__ (x=x, **kwargs)
            # separate A instance built from the remaining kwargs only, so it
            # does not receive the doubled `x`
            self.ab = A (**kwargs)

    # b.x is B's default doubled (30*2); b.ab.x is A's own default
    b = B ()
    assert b.x==60 and b.ab.x==3 and b.y==10 and b.path_results==Path('test_recursive/another').resolve()

    b = B (x=6, path_results='new_path')
    assert b.x==12 and b.ab.x==3 and b.y==10 and b.path_results==Path('new_path/another').resolve()

    # *******************
    # test using same field in C and in A, but
    # the field is modified in a parent B
    # *****************
    class C(B):
        def __init__ (self, x=40, z=100, **kwargs):
            super().__init__ (x=x, **kwargs)
            self.b = B(**kwargs)

    # without `ignore`, building C raises RuntimeError
    with pytest.raises (RuntimeError):
        c = C()

    # listing `x` in `ignore` lets the construction succeed
    c = C(ignore={'x'})
    assert c.x==80 and c.y==10 and c.z==100 and c.b.x==60 and c.b.y==10

    c = C (x=9, ignore={'x'})
    assert c.x==18 and c.y==10 and c.z==100 and c.b.x==60 and c.b.y==10

    # `ignore` itself is not stored as an attribute
    assert not hasattr(c, 'ignore')

In [8]:
tst.run (test_component_store_attrs, tag='dummy')

running test_component_store_attrs


### Transform method called with different aliases

In [9]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_component_aliases ():
    """Check that `transform`, `predict`, `apply` and `__call__` are interchangeable.

    A component implementing exactly one of `_transform`, `_apply` or
    `_predict` must answer through every public alias. Implementing none of
    them makes `transform` raise AssertionError, and implementing more than
    one makes the constructor raise AttributeError.
    """
    import pytest

    def check_every_alias (component):
        # each public alias must route to the single implemented hook
        for alias in ('transform', 'predict', 'apply'):
            assert getattr (component, alias) (3) == 6
        assert component (3) == 6

    # only _transform implemented
    class MyTransform (Component):
        def _transform (self, x):
            return x*2

    check_every_alias (MyTransform())

    # only _apply implemented
    class MyTransform2 (Component):
        def _apply (self, x):
            return x*2

    check_every_alias (MyTransform2())

    # only _predict implemented
    class MyTransform3 (Component):
        def _predict (self, x):
            return x*2

    check_every_alias (MyTransform3())

    # none of _transform, _apply or _predict is defined: calling fails
    class MyTransform4 (Component):
        def _wrong_method (self, x):
            return x*2

    my_transform4 = MyTransform4 ()
    with pytest.raises (AssertionError):
        my_transform4.transform(3)

    # more than one alias implemented: the constructor itself fails
    class MyTransform5 (Component):
        def _predict (self, x):
            return x*2
        def _apply (self, x):
            return x*2

    with pytest.raises(AttributeError):
        MyTransform5 ()

In [10]:
tst.run (test_component_aliases, tag='dummy')

running test_component_aliases


### Calling `predict` is handy when the result is a single array of predictions

In [11]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_component_predict ():
    """Check `transform` on a component whose only hook is `_predict`.

    The component is built with a `PandasConverter` and adds two DataFrame
    columns; the resulting Series is compared after conversion to a frame.
    """
    # TODO: remove this cell

    class MyTransform (Component):
        def __init__ (self, **kwargs):
            super().__init__ (
                data_converter=PandasConverter(**kwargs),
                **kwargs)

        def _predict (self, x):
            return x['a'] + x['b']

    my_transform = MyTransform()
    df = pd.DataFrame ({'a': [10,20,30],'b':[4,5,6]})

    summed = my_transform.transform (df)
    pd.testing.assert_frame_equal (summed.to_frame(),
                                   pd.DataFrame ({0: [14,25,36]}))

    # disabled in the original notebook: same comparison through `predict`
    if False:
        pd.testing.assert_frame_equal (my_transform.predict (df),
                                       pd.DataFrame ({0: [14,25,36]}))

In [12]:
tst.run (test_component_predict, tag='dummy')

running test_component_predict


### The `transform` method and its aliases can be called with multiple inputs

In [13]:
# exports tests.core.test_block_types
def test_component_multiple_inputs ():
    """Check that `transform` accepts either several data items or a single one."""
    from block_types.utils.dummies import SumXY

    # several data items: SumXY receives two inputs
    adder = SumXY ()
    total = adder.transform (3, 4)
    print (total)
    assert total==7

    # single data item
    class MyTransform2 (Component):
        def _apply (self, x):
            return x*2

    doubler = MyTransform2 ()
    doubled = doubler.transform (3)
    print (doubled)
    assert doubled==6

In [14]:
tst.run (test_component_multiple_inputs, tag='dummy')

running test_component_multiple_inputs
7
6


### `fit_apply()` and its aliases `fit_transform(), fit_predict()`

In [15]:
# exports tests.core.test_block_types
# example with _fit_apply implemented
class TransformWithFitApply (Component):
    """Component providing `_fit`, `_apply` and its own `_fit_apply`.

    `_fit_apply` stores ten times the axis-0 sum, whereas `_fit` stores the
    plain sum, so the two code paths give distinguishable results.
    """
    def __init__ (self, **kwargs):
        super().__init__ (**kwargs)

    def _fit (self, X, y=None):
        # offset learnt by a plain fit
        self.sum = X.sum(axis=0)

    def _apply (self, X):
        shifted = X + self.sum
        return shifted

    def _fit_apply (self, X, y=None):
        # offset is scaled by 10 on this path only
        self.sum = X.sum(axis=0)*10
        shifted = X + self.sum
        return shifted

# example without _fit_apply implemented
class TransformWithoutFitApply (Component):
    """Component providing only `_fit` and `_apply` (no `_fit_apply`)."""
    def __init__ (self, **kwargs):
        super().__init__ (**kwargs)

    def _fit (self, X, y=None):
        # offset learnt from the data: sum along axis 0
        self.sum = X.sum(axis=0)

    def _apply (self, X):
        shifted = X + self.sum
        return shifted
        
#@pytest.mark.reference_fails
def test_component_fit_apply ():
    """Check `fit_apply` and its alias `fit_transform`.

    With `_fit_apply` implemented the dedicated method is used (offset 2000);
    calling `fit` and `transform` separately, or using a component without
    `_fit_apply`, gives the plain offset 200.
    """
    with_hook = TransformWithFitApply ()
    X = np.array ([100, 90, 10])

    # dedicated _fit_apply is used
    out = with_hook.fit_apply (X)
    assert (out==(X+2000)).all()

    # same result obtained through the alias
    out = with_hook.fit_transform (X)
    assert (out==(X+2000)).all()

    # fitting and applying separately bypasses _fit_apply
    out = with_hook.fit (X).transform (X)
    assert (out==(X+200)).all()

    # component without _fit_apply
    without_hook = TransformWithoutFitApply ()
    out = without_hook.fit_apply (X)
    assert (out==(X+200)).all()

    # same result obtained through the alias
    out = without_hook.fit_transform (X)
    assert (out==(X+200)).all()

In [16]:
tst.run (test_component_fit_apply, tag='dummy')

running test_component_fit_apply


### `fit_apply()` with DataConverters that transform inplace

In [17]:
# exports tests.core.test_block_types
# example with _fit_apply implemented
class MyDataConverter (DataConverter):
    """Toy converter that modifies its input in place and restores it afterwards.

    Before fitting it remembers element 0 of the data and zeroes it; after
    fitting it writes the remembered value back. Before/after transforming it
    does the same with element 1.
    """
    def __init__ (self, **kwargs):
        super ().__init__ (**kwargs)

    def convert_before_fitting (self, *X):
        """Zero element 0 of the data in place; return `(X, y)` (`y` is None if absent)."""
        X, y = X if len(X)==2 else (X[0], None)
        self.orig = X[0]
        X[0] = 0
        return X, y

    def convert_after_fitting (self, *X):
        """Restore element 0 of the data in place and return the data."""
        # `X` arrives as a tuple of positional arguments. Unpack the data item
        # the same way as in `convert_before_fitting`: keeping the tuple itself
        # would make the item assignment below raise TypeError.
        X, y = X if len(X)==2 else (X[0], None)
        X[0] = self.orig
        return X

    def convert_before_transforming (self, X, **kwargs):
        """Zero element 1 of the data in place and return the data."""
        self.orig2 = X[1]
        X[1] = 0
        return X

    def convert_after_transforming (self, X, **kwargs):
        """Restore element 1 of the data in place and return the data."""
        X[1] = self.orig2
        return X

    def convert_before_fit_apply (self, *X, **kwargs):
        """Apply the before-fitting and before-transforming conversions in turn."""
        _ = self.convert_before_fitting (*X)
        # NOTE(review): with two positional items (X, y) this call passes both
        # to `convert_before_transforming`, which accepts one -- confirm the
        # intended call signature.
        return self.convert_before_transforming (*X)
        
class TransformWithFitApplyDC (Component):
    """Component with `_fit`, `_apply` and `_fit_apply` that uses `MyDataConverter`."""
    def __init__ (self, **kwargs):
        super().__init__ (data_converter=MyDataConverter,**kwargs)

    def _fit (self, X, y=None):
        # offset learnt from the data: sum along axis 0
        self.sum = X.sum(axis=0)

    def _apply (self, X):
        shifted = X + self.sum
        return shifted

    def _fit_apply (self, X, y=None):
        # same offset as `_fit`, applied immediately
        self.sum = X.sum(axis=0)
        shifted = X + self.sum
        return shifted

#@pytest.mark.reference_fails
def test_fit_apply_inplace ():
    """Check `fit_apply` with a data converter that modifies its input in place.

    Two configurations are compared: the default one, where the caller's
    array is expected to be left as it was, and `inplace=False`, where the
    array is expected to keep the converter's modifications.
    """
    # default configuration
    component = TransformWithFitApplyDC ()
    data = np.array ([100, 90, 10])
    out = component.fit_apply (data)
    assert (out==[100,  90, 110]).all()
    assert (data==[100,  90,  10]).all()

    # inplace=False
    component = TransformWithFitApplyDC (inplace=False)
    data = np.array ([100, 90, 10])
    out = component.fit_apply (data)
    assert (out==[10, 90, 20]).all()
    assert (data==[ 0,  0, 10]).all()

In [18]:
# NOTE(review): `do=False` presumably tells the runner to skip this test -- no
# "running test_fit_apply_inplace" output is recorded for this cell. TODO confirm,
# and re-enable once the test passes.
tst.run (test_fit_apply_inplace, tag='dummy', do=False)

`_fit_apply()` is called when implemented, otherwise `fit().apply()` is called

### Getting validation_data and test_data

In [19]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_component_validation_test ():
    """Check that `validation_data` and `test_data` reach the component's `_fit`.

    Covers plain arrays, a 2-element tuple for `validation_data` with
    `test_data` omitted, and a tuple with more than two elements, which must
    raise ValueError.
    """
    import pytest

    class Transform1 (Component):
        def __init__ (self, **kwargs):
            super().__init__ (**kwargs)

        def _fit (self, X, y=None, validation_data=None, test_data=None):
            self.sum = X.sum(axis=0)

            print (f'validation_data: {validation_data}')
            print (f'test_data: {test_data}')

            # keep what was received so the test can inspect it
            self.validation_data = validation_data
            self.test_data = test_data

        def _apply (self, X):
            return X + self.sum

    component = Transform1 ()
    X = np.array ([100, 90, 10])

    # case 1: validation_data and test_data are not tuples
    val_array = np.array ([100, 90, 10])*10
    test_array = np.array ([100, 90, 10])*100
    component.fit_apply (X, validation_data=val_array, test_data=test_array)
    assert (component.validation_data==val_array).all()
    assert (component.test_data==test_array).all()

    # case 2: validation_data is a tuple, and test_data is not given
    component.fit_apply (X, validation_data=(val_array,1))
    received = component.validation_data
    assert (received[0]==val_array).all()
    assert received[1]==1
    assert component.test_data is None

    # case 3: validation_data is a tuple with more than 2 elements, exception is raised
    with pytest.raises(ValueError):
        component.fit_apply (X, validation_data=(val_array,1,2))

In [20]:
tst.run (test_component_validation_test, tag='dummy')

running test_component_validation_test
validation_data: [1000  900  100]
test_data: [10000  9000  1000]
validation_data: (array([1000,  900,  100]), 1)
test_data: None


### saving / loading

In [21]:
# exports tests.core.test_block_types

# example without _fit_apply implemented
class TransformWithoutFitApply2 (Component):
    """Sum-offset component using 'SklearnIO', without `_fit_apply`.

    The `error_if_*_func` flags make the corresponding hook raise
    RuntimeError, so tests can detect that a hook ran when it should not have.
    """
    def __init__ (self, error_if_fit_func=False, error_if_apply_func=False,  **kwargs):
        super().__init__ (data_io='SklearnIO', **kwargs)
        self.estimator = Bunch(sum=None)

    def _fit (self, X, y=None):
        if self.error_if_fit_func:
            raise RuntimeError ('fit should not run')
        print ('running _fit')
        self.estimator.sum = X.sum(axis=0)

    def _apply (self, X):
        if self.error_if_apply_func:
            raise RuntimeError ('apply should not run')
        # a missing sum means the estimator was neither fitted nor loaded
        if self.estimator.sum is None:
            raise RuntimeError ('fit should be called before apply')
        print ('running _apply')
        shifted = X + self.estimator.sum
        return shifted

# shorter name used by the save/load and logger tests
Transform1 = TransformWithoutFitApply2

class TransformWithFitApply2 (Component):
    """Sum-offset component using 'SklearnIO', with its own `_fit_apply`.

    The `error_if_*_func` flags make the corresponding hook raise
    RuntimeError, so tests can detect that a hook ran when it should not have.
    """
    def __init__ (self, error_if_fit_func=False, error_if_apply_func=False, error_if_fit_apply_func=False, 
                  **kwargs):
        super().__init__ (data_io='SklearnIO', **kwargs)
        self.estimator = Bunch(sum=None)

    def _fit (self, X, y=None):
        if self.error_if_fit_func:
            raise RuntimeError ('fit should not run')
        print ('running _fit')
        self.estimator.sum = X.sum(axis=0)

    def _apply (self, X):
        if self.error_if_apply_func:
            raise RuntimeError ('apply should not run')
        # a missing sum means the estimator was neither fitted nor loaded
        if self.estimator.sum is None:
            raise RuntimeError ('fit should be called before apply')
        print ('running _apply')
        shifted = X + self.estimator.sum
        return shifted

    def _fit_apply (self, X, y=None):
        if self.error_if_fit_apply_func:
            raise RuntimeError ('fit_apply should not run')
        print ('running _fit_apply')
        self.estimator.sum = X.sum(axis=0)
        shifted = X + self.estimator.sum
        return shifted

def component_save_data ():
    """Return the small integer array passed to the save/load and logger tests."""
    return np.array ([100, 90, 10])
        
#@pytest.mark.reference_fails
def test_component_save_load (component_save_data):
    """Check that a component saves its estimator and results and that they can be reloaded.

    Also checks that results are stored per split (`whole` vs `test`) and
    that nothing is written when `save=False`.
    """
    import os

    X = component_save_data
    path_results = 'component_loading_saving'
    result_files = ['transform_without_fit_apply2_result.pk']
    estimator_files = ['transform_without_fit_apply2_estimator.pk']

    remove_previous_results (path_results=path_results)

    # first component fits and applies, which writes to disk
    writer = Transform1 (path_results=path_results)
    writer.fit (X)
    writer.apply (X)

    # second component only loads what the first one wrote
    reader = Transform1 (path_results=path_results)
    reader.load_estimator()
    assert reader.estimator.sum == writer.estimator.sum

    loaded_whole = reader.data_io.load_result ()
    assert (loaded_whole == sum(X)+X).all()

    assert os.listdir (f'{path_results}/whole')==result_files
    assert os.listdir (f'{path_results}/models')==estimator_files

    # results of another split go to their own folder
    test_result = writer.apply (X*2, split='test')
    loaded_test = reader.data_io.load_result (split='test')
    assert (test_result==loaded_test).all()
    assert os.listdir (f'{path_results}/test')==result_files

    # the default split still holds the first result
    loaded_whole = reader.data_io.load_result ()
    assert (test_result!=loaded_whole).all()

    remove_previous_results (path_results=path_results)


    # Test that no saving is done if save=False
    silent = Transform1 (path_results=path_results, save=False)
    silent.fit (X)
    silent.apply (X)
    assert not os.path.exists(path_results)


In [22]:
tst.run (test_component_save_load, component_save_data, tag='dummy')

running test_component_save_load
running _fit
running _apply
running _apply
running _fit
running _apply


### running fit / apply depending on whether estimator / result exists

In [23]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_component_run_depend_on_existence ():
    """Check that fit/apply are skipped or run depending on what is already saved.

    The `error_if_*_func` flags of the test components raise RuntimeError when
    the corresponding hook runs, so each step below asserts either that a hook
    was skipped (previous estimator/result loaded instead) or that it ran.
    """
    path_results = 'component_run_existence'
    remove_previous_results (path_results=path_results)

    # nothing saved yet: fit_apply runs _fit_apply (and neither _fit nor _apply)
    tr1 = TransformWithFitApply2 (path_results=path_results, error_if_fit_func=True, error_if_apply_func=True)
    X = np.array ([100, 90, 10])
    result = tr1.fit_apply (X)
    assert (result==(X+200)).all()

    assert os.listdir(f'{path_results}/models')==['transform_with_fit_apply2_estimator.pk']

    assert os.listdir(f'{path_results}/whole')==['transform_with_fit_apply2_result.pk']

    # estimator and result exist: no hook may run, everything is loaded.
    # The third flag used to be misspelled (`error_if_fit_func_apply`), which
    # left the _fit_apply guard disarmed for this instance.
    tr1 = TransformWithFitApply2 (path_results=path_results, error_if_fit_func=True, error_if_apply_func=True,
                                  error_if_fit_apply_func=True)
    result2 = tr1.fit_apply (X)
    assert (result2==(X+200)).all()

    assert tr1.estimator=={'sum': 200}

    # apply alone returns the saved result without touching the estimator
    tr2 = TransformWithFitApply2 (path_results=path_results, error_if_fit_func=True, error_if_apply_func=True,
                                  error_if_fit_apply_func=True)
    result3 = tr2.apply (X)

    assert (result3==(X+200)).all()
    assert tr2.estimator=={'sum': None}

    # estimator file removed: fit_apply has to run _fit_apply again
    os.remove (f'{path_results}/models/transform_with_fit_apply2_estimator.pk')

    with pytest.raises (RuntimeError):
        result3 = tr2.fit_apply (X)

    tr2.error_if_fit_apply_func = False
    result4 = tr2.fit_apply (X)
    assert tr2.estimator=={'sum': 200}
    assert (result4==(X+200)).all()

    # result file removed: both apply and fit_apply have to recompute
    os.remove (f'{path_results}/whole/transform_with_fit_apply2_result.pk')

    tr3 = TransformWithFitApply2 (path_results=path_results, error_if_fit_func=True, error_if_apply_func=True,
                                  error_if_fit_apply_func=True)
    with pytest.raises (RuntimeError):
        _ = tr3.apply (X)
    with pytest.raises (RuntimeError):
        _ = tr3.fit_apply (X)
    tr3.error_if_fit_apply_func = False
    result5 = tr3.fit_apply (X)
    assert tr3.estimator=={'sum': 200}
    assert (result5==(X+200)).all()

    assert os.listdir (f'{path_results}/whole')==['transform_with_fit_apply2_result.pk']
    assert os.listdir (f'{path_results}/models')==['transform_with_fit_apply2_estimator.pk']

    remove_previous_results (path_results)

    # separate fit and apply calls never go through _fit_apply
    tr4 = TransformWithFitApply2 (path_results=path_results, error_if_fit_func=False, error_if_apply_func=False,
                                  error_if_fit_apply_func=True)
    result6 = tr4.fit(X).apply (X)
    assert tr4.estimator=={'sum': 200}
    assert (result6==(X+200)).all()
    assert os.listdir (f'{path_results}/whole')==['transform_with_fit_apply2_result.pk']
    assert os.listdir (f'{path_results}/models')==['transform_with_fit_apply2_estimator.pk']

    remove_previous_results (path_results)

    # same check with the component that has no _fit_apply at all
    tr5 = TransformWithoutFitApply2 (path_results=path_results, error_if_fit_func=False, error_if_apply_func=False)
    result7 = tr5.fit(X).apply (X)
    assert tr5.estimator=={'sum': 200}
    assert (result7==(X+200)).all()
    assert os.listdir (f'{path_results}/whole')==['transform_without_fit_apply2_result.pk']
    assert os.listdir (f'{path_results}/models')==['transform_without_fit_apply2_estimator.pk']

    remove_previous_results (path_results)

In [24]:
tst.run (test_component_run_depend_on_existence, tag='dummy')

running test_component_run_depend_on_existence
running _fit_apply
running _fit_apply
running _fit_apply
running _fit
running _apply
running _fit
running _apply


### Logger

In [25]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_component_logger (component_save_data):
    """Run fit and apply under verbosity levels 0, 1 and 2.

    There are no assertions: the test only checks that each level runs
    without raising.
    """
    X = component_save_data

    for verbose in (0, 1, 2):
        tr1 = Transform1 (verbose=verbose)
        tr1.fit (X)
        result = tr1.apply (X)

In [26]:
tst.run (test_component_logger, component_save_data, tag='dummy')

running test_component_logger
running _fit
running _apply


fitting transform_without_fit_apply2 (using whole data)
fitting transform_without_fit_apply2 (using whole data)
applying transform_without_fit_apply2 (on whole data)


running _fit
running _apply
running _fit
running _apply


### Passing data_converter and data_io

In [27]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_component_data_converter ():
    """Check that a data converter named by string is built with the component's parameters.

    Also checks that a class-specific dictionary (`MyTransform={...}`)
    overrides the shared value for the converter, without modifying the
    configuration dictionary supplied by the caller.
    """
    class MyTransform (Component):
        def __init__ (self, **kwargs):
            super().__init__ (data_converter='PandasConverter',
                              **kwargs)
        def _apply (self, x):
            return x*2

    # shared parameter reaches the converter
    plain = MyTransform (separate_labels=False)
    converter = plain.data_converter
    assert converter.separate_labels is False
    assert type(converter) is PandasConverter

    # example where data-converter uses class-specific parameters
    config = {'separate_labels': False,
              'MyTransform': {'separate_labels': True}}
    specific = MyTransform (**config)
    assert specific.data_converter.separate_labels is True
    # the caller's dictionary keeps its original value
    assert config['separate_labels'] is False

In [28]:
tst.run (test_component_data_converter, tag='dummy')

running test_component_data_converter


In [29]:
# example using data_io
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_component_data_io ():
    """Check saving/loading with `result_io='pandas'` and a class-specific `path_results`.

    The estimator is expected as a `.pk` file and the result as a parquet
    file under the class-specific path; the shared path 'do_not_use' must
    never be created.
    """
    import pandas as pd
    from block_types.utils.utils import remove_previous_results

    path_results = 'test_data_io'
    models_dir = f'{path_results}/models'
    whole_dir = f'{path_results}/whole'
    remove_previous_results (path_results=path_results)

    class MyTransform (Component):
        def __init__ (self, **kwargs):
            super().__init__ (result_io='pandas',
                              **kwargs)
        def _fit (self, X, y=None):
            self.estimator = Bunch(sum=100)

        def _apply (self, x):
            return pd.DataFrame ({'a': [1,3], 'b': [2,4]})

    # the class-specific path wins over the shared one
    my_transform = MyTransform (path_results='do_not_use', MyTransform=dict(path_results=path_results))
    my_transform.fit (1)
    assert os.listdir (models_dir)==['my_transform_estimator.pk']

    saved_frame = my_transform.apply (1)
    assert os.listdir (whole_dir)==['my_transform_result.parquet']

    assert not os.path.exists ('do_not_use')

    # a fresh instance can load both the estimator and the result
    del my_transform
    my_transform = MyTransform (path_results='do_not_use', MyTransform=dict(path_results=path_results))
    #assert my_transform.estimator is None
    my_transform.load_estimator()
    assert my_transform.estimator == Bunch(sum=100)

    loaded_frame = my_transform.load_result ()
    pd.testing.assert_frame_equal (saved_frame, loaded_frame)

    remove_previous_results (path_results=path_results)

In [30]:
tst.run (test_component_data_io, tag='dummy')

running test_component_data_io


### assert_equal

In [31]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_component_equal ():
    """Check `assert_equal` on nested results (dict / list / ndarray / DataFrame).

    Two components produce the same nested structure with a small random
    perturbation. The comparison must pass for tiny noise, raise
    AssertionError for larger noise or shifted data, and return False instead
    of raising when `raise_error=False`.
    """
    # NOTE(review): `path_results` is only used by the cleanup calls; it is
    # never passed to MyTransform.
    path_results = 'assert_equal'
    remove_previous_results (path_results=path_results)

    class MyTransform (Component):
        def __init__ (self, noise=1e-10, different = False, **kwargs):
            super().__init__ (result_io='pandas',
                              **kwargs)
        def _fit (self, X, y=None):
            self.estimator = Bunch(sum=100)

        def _generate_noise (self):
            """Draw a random value in the open interval (self.noise/10, self.noise)."""
            # redraw until the value is above a tenth of `self.noise`
            # NOTE(review): the generator is not seeded in this test
            while True:
                noise = np.random.rand() * self.noise
                if noise > self.noise/10:
                    break
            return noise

        def _apply (self, x):
            """Build a nested result; the input `x` is ignored and overwritten."""
            df = pd.DataFrame ([[1.0,2.0],[3.0,4.0]], columns=['a','b']) + self._generate_noise ()
            if self.different:
                # shift the frame so that it clearly differs from the reference
                df = df+10
            x = np.array([[10.0,20.0],[30.0,40.0]]) + self._generate_noise ()
            result = dict(sequence=[[1.0,2.0], x+1, dict(vector=x, data=df)],
                          array=x+10)
            return result

    # default noise (1e-10) is below the requested precision: comparison passes
    tr = MyTransform ()
    tr2= MyTransform ()
    tr.assert_equal (tr(1), tr2(1), significant_digits=7)

    import pytest
    # noise of order 1e-3 is expected to be detected with 7 significant digits
    with pytest.raises (AssertionError):
        tr = MyTransform (noise=1e-3, verbose=1)
        tr2= MyTransform (noise=1e-3, verbose=1)
        tr.assert_equal (tr(1), tr2(1), significant_digits=7)

    # one component shifts its DataFrame by 10: results must differ
    with pytest.raises (AssertionError):
        tr = MyTransform (verbose=1, different=True)
        tr2= MyTransform (verbose=1)
        tr.assert_equal (tr(1), tr2(1))

    # `tr` and `tr2` are the instances created inside the previous `with`
    # block; with raise_error=False the mismatch is reported as a return value
    result = tr.assert_equal (tr(1), tr2(1), raise_error=False)
    assert result is False
    remove_previous_results (path_results=path_results)

In [32]:
tst.run (test_component_equal, tag='dummy')

running test_component_equal


comparing results for my_transform
comparing results for my_transform
comparing results for my_transform
Component my_transform => results are different: [sequence] [2] [data] DataFrame.iloc[:, 0] (column name="a") are different

DataFrame.iloc[:, 0] (column name="a") values are different (100.0 %)
[index]: [0, 1]
[left]:  [11.000000000055376, 13.000000000055376]
[right]: [1.0000000000894484, 3.0000000000894484]


### set_paths

In [33]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_set_paths ():
    """Check `path_results` / `path_models` on a component and on its `data_io`.

    When only `path_results` is given, `path_models` takes the same value.
    Changing either path through `data_io.set_path_results` /
    `data_io.set_path_models` must be visible on both objects and must leave
    the other path untouched.
    """
    def assert_paths (x, path_results, path_models):
        # both paths are expected as absolute paths under the working directory
        base = os.path.abspath('.')
        expected_results = Path(f'{base}/{path_results}')
        expected_models = Path(f'{base}/{path_models}')
        assert x.path_results==expected_results
        assert x.data_io.path_results==expected_results
        assert x.path_models==expected_models
        assert x.data_io.path_models==expected_models

    # only path_results given: path_models follows it
    path_results = 'test_set_paths_1'
    path_models = 'test_set_paths_1'
    component = Component (path_results=path_results)
    assert_paths (component, path_results, path_models)

    # changing path_results afterwards leaves path_models as it was
    path_results = 'test_set_paths_2'
    component.data_io.set_path_results (path_results)
    assert_paths (component, path_results, path_models)

    path_models='test_set_paths_models_1'
    component.data_io.set_path_models (path_models)
    assert_paths (component, path_results, path_models)

    # both paths given at construction time
    path_results = 'test_set_paths_a'
    path_models = 'test_set_paths_models_a'
    component = Component (path_results=path_results, path_models=path_models)
    assert_paths (component, path_results, path_models)

    path_results = 'test_set_paths_b'
    component.data_io.set_path_results (path_results)
    assert_paths (component, path_results, path_models)

    path_models = 'test_set_paths_models_b'
    component.data_io.set_path_models (path_models)
    assert_paths (component, path_results, path_models)

In [34]:
tst.run (test_set_paths, tag='dummy')

running test_set_paths


### determine fit function

In [35]:
# exports tests.core.test_block_types
from block_types.utils.dummies import DummyEstimator

class TransformWithoutFit (Component):
    """Component with only `_apply`: multiplies the input by `factor`."""
    def __init__ (self, factor=2, **kwargs):
        super().__init__ (**kwargs)

    def _apply (self, X):
        scaled = X * self.factor
        return scaled
    
class TransformWithFitApplyOnly (Component):
    """Component with `_apply` and `_fit_apply` but no `_fit`."""
    def __init__ (self, **kwargs):
        super().__init__ (**kwargs)

    def _apply (self, X):
        shifted = X + self.sum
        return shifted

    def _fit_apply (self, X, y=None):
        # offset is ten times the axis-0 sum
        self.sum = X.sum(axis=0)*10
        shifted = X + self.sum
        return shifted
    
def test_determine_fit_function ():
    """Check which fit function a component uses and the resulting `is_model` flag.

    Four cases: `_fit` implemented by the subclass, an estimator object
    passed to `Component`, neither of them (fit falls back to `_fit_`), and
    a component that only implements `_fit_apply`.
    """
    # example when there is _fit implemented
    component = TransformWithoutFitApply ()
    train = np.array ([1,2,3])
    component.fit (train)
    new_data = np.array ([10,20,30])
    out = component (new_data)
    assert (out == (train.sum() + new_data)).all()
    assert component.is_model

    # example when there is estimator
    component = Component (DummyEstimator (2))
    train = np.array ([1,2,3])
    component.fit (train)
    assert component.estimator.sum == 6
    new_data = np.array ([10,20,30])
    out = component (new_data)
    assert (out == (train.sum() + new_data*2)).all()
    assert component.is_model

    # example when there is no _fit implemented, and there is no estimator
    component = TransformWithoutFit ()
    train = np.array ([1,2,3])
    component.fit (train)
    new_data = np.array ([10,20,30])
    out = component (new_data)
    assert (out == (new_data*2)).all()
    assert not component.is_model
    assert component._fit == component._fit_

    # example when there is only fit_apply implemented
    component = TransformWithFitApplyOnly ()
    new_data = np.array ([10,20,30])
    out = component.fit_apply (new_data)
    assert (out == (new_data + new_data.sum(axis=0)*10)).all()
    assert component.is_model
    assert component._fit == component._fit_
    
def test_use_fit_from_loaded_estimator ():
    """Check that a component works with an estimator loaded from disk.

    A first component fits and saves its estimator. A second one, built with
    a fresh estimator, loads the saved one and must expose its fitted state.
    The estimator, `_fit` and `result_func` are printed before and after
    loading so the objects involved can be compared.
    """
    def show_state (comp):
        print (comp.estimator)
        print (comp._fit)
        print (comp.result_func)

    path_models = 'test_use_fit_from_loaded_estimator'
    component = Component (DummyEstimator (2), path_models=path_models)
    X = np.array ([1,2,3])
    component.fit (X)
    assert (Path (path_models) / 'models').exists()
    del component

    # new component around an unfitted estimator
    estimator1 = DummyEstimator (2)
    print (estimator1)
    component = Component (estimator1, path_models=path_models)
    print ('before loading')
    show_state (component)

    component.load_estimator ()
    print ('after loading')
    show_state (component)

    # fitted state comes from the estimator saved by the first component
    assert component.estimator.sum == 6
    assert component.is_model

    new_data = np.array ([10,20,30])
    out = component (new_data)
    assert (out == (X.sum() + new_data*2)).all()

    remove_previous_results (path_models)

In [36]:
tst.run (test_determine_fit_function, tag='dummy')

running test_determine_fit_function


In [37]:
tst.run (test_use_fit_from_loaded_estimator, tag='dummy')

running test_use_fit_from_loaded_estimator
<block_types.utils.dummies.DummyEstimator object at 0x7fbfddfe9910>
before loading
<block_types.utils.dummies.DummyEstimator object at 0x7fbfddfe9910>
<bound method DummyEstimator.fit of <block_types.utils.dummies.DummyEstimator object at 0x7fbfddfe9910>>
<bound method DummyEstimator.transform of <block_types.utils.dummies.DummyEstimator object at 0x7fbfddfe9910>>
after loading
<block_types.utils.dummies.DummyEstimator object at 0x7fbfde009dd0>
<bound method DummyEstimator.fit of <block_types.utils.dummies.DummyEstimator object at 0x7fbfde009dd0>>
<bound method DummyEstimator.transform of <block_types.utils.dummies.DummyEstimator object at 0x7fbfde009dd0>>


### Use direct methods

In [38]:
# exports tests.core.test_block_types   
from block_types.utils.dummies import Multiply10direct, Max10direct
def test_direct_methods ():
    """Check the `direct_apply` / `direct_fit` options of a component.

    `error_if_apply` / `error_if_fit` make the regular (non-direct) apply or
    fit path raise RuntimeError, as the second example shows. The later
    examples combine those flags with `direct_apply` / `direct_fit` and expect
    the calls to succeed with the same numerical result.
    """
    # input
    X = np.array ([1,2,3])
    
    # example where we do not use direct methods
    component = Max10direct (verbose=2)
    component.fit (X)
    r = component (X)
    assert (r==X*10+X.max()).all()
    
    # without direct_apply, error_if_apply makes the call fail
    component = Max10direct (verbose=2, error_if_apply=True)
    component.fit (X)
    with pytest.raises (RuntimeError):
        r = component (X)
    #assert component.fitted
    #assert component.applied
    
    # example where we use direct methods
    component = Max10direct (direct_apply=True, verbose=2, error_if_apply=True)
    component.logger.info (f'{"-"*100}')
    component.logger.info (f'direct_apply={component.direct_apply}, direct_fit={component.direct_fit}, direct_fit_apply={component.direct_fit_apply}\n')
    component.fit (X)
    r = component (X)
    assert (r==X*10+X.max()).all()
    #assert component.fitted
    #assert not component.applied
    
    # direct fit only
    component = Max10direct (direct_fit=True, verbose=2, error_if_fit=True)
    component.logger.info (f'{"-"*100}')
    component.logger.info (f'direct_apply={component.direct_apply}, direct_fit={component.direct_fit}, direct_fit_apply={component.direct_fit_apply}\n')
    component.fit (X)
    r = component.apply (X)
    assert (r==X*10+X.max()).all()
    #assert not component.fitted
    #assert component.applied
    
    # direct fit and direct apply together
    component = Max10direct (direct_apply=True, direct_fit=True, verbose=2, error_if_apply=True, 
                             error_if_fit=True)
    component.logger.info (f'{"-"*100}')
    component.logger.info (f'direct_apply={component.direct_apply}, direct_fit={component.direct_fit}, direct_fit_apply={component.direct_fit_apply}\n')
    component.fit (X)
    r = component.transform (X)
    assert (r==X*10+X.max()).all()
    #assert not component.fitted
    #assert not component.applied
        
    # example when there is no _fit implemented and we call fit_apply
    component = Multiply10direct (verbose=2, error_if_fit=True)
    component.logger.info (f'{"-"*100}')
    component.logger.info (f'direct_apply={component.direct_apply}, direct_fit={component.direct_fit}, direct_fit_apply={component.direct_fit_apply}\n')
    r = component.fit_apply (X)
    assert (r==X*10).all()
    #assert not component.is_model
    assert component.fit == component._fit_
    #assert component.fit_apply == component.apply
    # fit followed by apply must give the same result as fit_apply; this
    # assertion used to re-check `r`, leaving `r2` unverified
    r2 = component.fit (X).apply (X)
    assert (r2==X*10).all()

    # example when there is no _fit implemented and we want a direct apply call
    component = Multiply10direct (verbose=2, direct_apply=True, error_if_apply=True, error_if_fit=True)
    component.logger.info (f'{"-"*100}')
    component.logger.info (f'direct_apply={component.direct_apply}, direct_fit={component.direct_fit}, direct_fit_apply={component.direct_fit_apply}\n')
    r = component.fit_apply (X)
    assert (r==X*10).all()
    assert not component.is_model
    assert component.fit == component._fit_
    #assert component.fit_apply == component._apply
    #assert component.fit_transform == component._apply
    #assert not component.applied
    # same check on the result of fit followed by apply (previously re-checked `r`)
    r2 = component.fit (X).apply (X)
    assert (r2==X*10).all()
    #assert not component.applied
    #assert not component.applied

In [39]:
tst.run (test_direct_methods, tag='dummy')

fitting max10direct (using whole data)
applying max10direct (on whole data)


running test_direct_methods


fitting max10direct (using whole data)
----------------------------------------------------------------------------------------------------
direct_apply=True, direct_fit=False, direct_fit_apply=False

fitting max10direct (using whole data)
----------------------------------------------------------------------------------------------------
direct_apply=False, direct_fit=True, direct_fit_apply=False

applying max10direct (on whole data)
----------------------------------------------------------------------------------------------------
direct_apply=True, direct_fit=True, direct_fit_apply=False

----------------------------------------------------------------------------------------------------
direct_apply=False, direct_fit=False, direct_fit_apply=False

applying multiply10direct (on whole data)
applying multiply10direct (on whole data)
----------------------------------------------------------------------------------------------------
direct_apply=True, direct_fit=False, direct_fit_appl

### Pass apply function by parameter

In [40]:
# exports tests.core.test_block_types   
def test_pass_apply ():
    """
    Check that a callable given through the `apply` argument is what runs 
    when the component itself is called on the data.
    """
    X = np.array ([1,2,3])
    expected = X*10

    # the transformation is supplied as a plain function instead of by subclassing
    component = Component (apply=lambda x: x*10, verbose=2, direct_apply=True, error_if_apply=True)

    result = component (X)
    assert (result==expected).all()

In [41]:
tst.run (test_pass_apply, tag='dummy')

running test_pass_apply


### Get DataIO specific parameters

In [42]:
# exports tests.core.test_block_types   
def test_get_specific_data_io_parameters_for_component ():
    """
    Check that parameters suffixed with the component tag (`_data` for tag 
    'data') take precedence over their generic counterparts when the 
    component configures its path and its data_io flags.
    """
    config = dict (tag='data', x=3, par=[1,2], path_results='hello', path_results_data='world', 
                   other='yes', load_result_data = False, save_model_data=True)
    component = Component (**config)

    # NOTE(review): the return value is discarded here -- presumably 
    # check_last_part asserts internally; confirm against block_types.utils.utils
    check_last_part(component.path_results, 'world')

    data_io = component.data_io
    assert data_io.load_result_flag == False
    assert data_io.save_model_flag == True

In [43]:
tst.run (test_get_specific_data_io_parameters_for_component, tag='dummy')

running test_get_specific_data_io_parameters_for_component


#### get_specific_data_io_parameters

In [44]:
# exports tests.core.test_utils
def test_get_specific_data_io_parameters ():
    """
    Check that `get_specific_data_io_parameters` returns only the data_io 
    parameters, with the tag-specific values (`*_data`) stored under their 
    generic names.
    """
    component = Component ()

    given = dict(x=3, par=[1,2], path_results='hello', path_results_data='world', other='yes', 
                 load_result_data = True)
    expected = dict (path_results='world', load_result=True)

    config = component.get_specific_data_io_parameters ('data', **given)
    assert config == expected

In [45]:
tst.run (test_get_specific_data_io_parameters, tag='dummy')

running test_get_specific_data_io_parameters


### StandardConverter

In [64]:
# exports tests.core.test_block_types   
from block_types.utils.dummies import Min10direct, SumXY

def test_standard_converter_in_component ():
    """
    Exercise `fit_apply` with `data_converter='StandardConverter'`.

    As asserted below: a plain `fit_apply` call returns only the transformed 
    X, while `sequential_fit_apply=True` with labels supplied returns the 
    pair (X, y), with y unchanged.
    """
    X, y = np.array([1,2,3]), np.array([0,1,0])

    # --- single array as input ---
    component = Min10direct (data_converter='StandardConverter')
    expected_min10 = X*10+X.min()

    Xr = component.fit_apply (X, y)
    assert (Xr==expected_min10).all()

    Xr, yr = component.fit_apply (X, y, sequential_fit_apply=True)
    assert (Xr==expected_min10).all()
    assert (yr==y).all()

    # --- tuple of arrays as input ---
    component = SumXY (data_converter='StandardConverter')
    pair = (X,X*2)
    expected_sum = X+X*2

    Xr = component.fit_apply (pair, y=None)
    assert (Xr==expected_sum).all()

    # without labels, only the transformed data comes back
    Xr = component.fit_apply (pair, y=None, sequential_fit_apply=True)
    assert (Xr==expected_sum).all()

    # with labels, they are returned alongside the transformed data
    Xr, yr = component.fit_apply (pair, y, sequential_fit_apply=True)
    assert (Xr==expected_sum).all()
    assert (yr==y).all()

In [65]:
tst.run (test_standard_converter_in_component, tag='dummy')

running test_standard_converter_in_component


## SamplingComponent

In [49]:
#export
class SamplingComponent (Component):
    """
    Component that makes use of labels in transform method.
    
    When calling the transform method, one of the columns of the received data 
    is assumed to contain the ground-truth labels. This allows the transform 
    method to modify the number of observations, changing the number of rows in 
    the data and in the labels. See `PandasConverter` class in 
    `block_types.core.data_conversion`.

    Parameters
    ----------
    estimator : object, optional
        Forwarded unchanged to `Component`.
    transform_uses_labels : bool, default True
        Forwarded to `Component`. The only difference with respect to the 
        signature of this constructor in the base class is that here the 
        flag is enabled by default.
    **kwargs
        Any other keyword argument, forwarded unchanged to `Component`.
    """
    def __init__ (self, estimator=None, transform_uses_labels=True, **kwargs):

        # the SamplingComponent only changes the default value of 
        # `transform_uses_labels`; everything else is delegated to Component
        super().__init__ (estimator=estimator, transform_uses_labels=transform_uses_labels,
                          **kwargs)

### Usage example

In [50]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_sampling_component ():
    """
    Check that `SamplingComponent` enables `transform_uses_labels`, and that 
    the flag is present on a `PandasConverter` but not on the base 
    `DataConverter`.
    """
    # base converter: flag lives on the component only
    generic = SamplingComponent (data_converter='DataConverter')
    assert generic.transform_uses_labels
    assert not hasattr(generic.data_converter,'transform_uses_labels')

    # pandas converter: flag is also set on the converter
    pandas_based = SamplingComponent (data_converter='PandasConverter')
    assert pandas_based.data_converter.transform_uses_labels

In [51]:
tst.run (test_sampling_component, tag='dummy')

running test_sampling_component


## SklearnComponent

In [52]:
#export
class SklearnComponent (Component):
    """
    Component that saves estimator parameters in pickle format.
    
    Convenience subclass used when the results can be saved in 
    pickle format. See `SklearnIO` class in `core.utils`.

    Parameters
    ----------
    estimator : object, optional
        Forwarded unchanged to `Component`.
    data_io : default 'SklearnIO'
        Forwarded to `Component`; selects the object in charge of 
        loading / saving.
    transform_uses_labels : bool, default False
        Forwarded to `Component`.
    **kwargs
        Any other keyword argument, forwarded unchanged to `Component`.
    """
    def __init__ (self, estimator=None, data_io='SklearnIO', transform_uses_labels=False, 
                  **kwargs):

        # forward the value received from the caller rather than a hard-coded
        # False, so that an explicit transform_uses_labels=True is honoured
        super().__init__ (estimator=estimator, data_io=data_io, 
                          transform_uses_labels=transform_uses_labels, **kwargs)

# alias
PickleSaverComponent = SklearnComponent

### Usage example

In [53]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_sklearn_component ():
    """
    Check that a default `SklearnComponent` uses joblib both for loading the 
    fitted parameters and for saving results.
    """
    data_io = SklearnComponent ().data_io
    assert data_io.fitting_load_func==joblib.load
    assert data_io.result_save_func==joblib.dump

In [54]:
tst.run (test_sklearn_component, tag='dummy')

running test_sklearn_component


## NoSaverComponent

In [55]:
#export
class NoSaverComponent (Component):
    """
    Component that does not save any data.

    Identical to `Component` except that `data_io` defaults to 'NoSaverIO'.
    `estimator` and any other keyword argument are forwarded unchanged to 
    the base class constructor.
    """
    def __init__ (self, estimator=None, data_io='NoSaverIO', **kwargs):
        
        # only the default of `data_io` differs from the base class
        super().__init__ (estimator=estimator, data_io=data_io, **kwargs)

### Usage example

In [56]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_no_saver_component ():
    """Check that a default `NoSaverComponent` is backed by a `NoSaverIO` object."""
    component = NoSaverComponent ()
    io_class_name = type (component.data_io).__name__
    assert io_class_name == 'NoSaverIO'

In [57]:
tst.run (test_no_saver_component, tag='dummy')

running test_no_saver_component


## OneClassSklearnComponent

In [58]:
#export
class OneClassSklearnComponent (SklearnComponent):
    """
    Component that uses only normal data (labelled with 0) for fitting parameters.

    Parameters
    ----------
    estimator : object, optional
        Object exposing `fit (X, y)`. It must be provided before `_fit` is 
        called. Forwarded to `SklearnComponent`.
    **kwargs
        Any other keyword argument, forwarded unchanged to `SklearnComponent`.
    """
    def __init__ (self, estimator=None, **kwargs):
        super().__init__ (estimator=estimator, **kwargs)

    def _fit (self, X, y=None):
        """
        Fit the estimator on the observations whose label is 0.

        Parameters
        ----------
        X : array-like
            Data; must support boolean-mask indexing along its rows.
        y : array-like
            Labels, one per row of `X`. Mandatory despite the default value.

        Raises
        ------
        AssertionError
            If `y` is None or no estimator was provided.
        """
        assert y is not None, 'y must be provided in OneClassSklearnComponent class'
        assert self.estimator is not None, 'estimator must be provided in OneClassSklearnComponent class'

        # keep X and y aligned: the labels are filtered with the same mask as the
        # data, so that estimators validating or using y receive as many labels 
        # as rows
        normal = y==0
        self.estimator.fit (X[normal], y[normal])

### Usage example

In [59]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails

def get_data_for_one_class ():
    """
    Build a toy dataset with two groups of five 2-D observations.

    Returns
    -------
    data : np.ndarray, shape (10, 2)
        First five rows filled with 1, last five rows filled with 2.
    y : np.ndarray, shape (10,)
        Labels: 1 for the first five rows, 0 for the last five.
    """
    n_per_group, n_features = 5, 2

    labelled_one = np.ones ((n_per_group, n_features))
    labelled_zero = 2*np.ones ((n_per_group, n_features))
    data = np.concatenate ([labelled_one, labelled_zero], axis=0)

    y = np.concatenate ([np.ones ((n_per_group,)), np.zeros ((n_per_group,))])
    return data, y

def test_one_class_sklearn_component ():
    """
    Check that `OneClassSklearnComponent` fits its estimator only on the 
    observations labelled 0, by comparing against a scaler fitted by hand 
    on that subset.
    """
    from sklearn.preprocessing import MinMaxScaler

    # NOTE(review): path_results is only used for the clean-up calls and is not 
    # passed to the component -- confirm this is intended
    path_results = 'one_class_sklearn_component'
    remove_previous_results (path_results=path_results)

    data, y = get_data_for_one_class ()

    component = OneClassSklearnComponent (MinMaxScaler())
    obtained = component.fit(data,y).transform (data)

    reference_scaler = MinMaxScaler().fit(data[y==0])
    expected = reference_scaler.transform (data)

    assert (obtained==expected).all().all()

    remove_previous_results (path_results=path_results)

In [60]:
tst.run (test_one_class_sklearn_component, tag='dummy')

running test_one_class_sklearn_component


## PandasComponent

In [61]:
#export
class PandasComponent (Component):
    """
    Component that preserves the DataFrame format for incoming data and results.
    
    This component also writes results in parquet format, by default.
    See `PandasConverter` in `core.data_conversion` for details on the data 
    conversion performed.

    Parameters
    ----------
    estimator : object, optional
        Forwarded unchanged to `Component`.
    data_converter : default 'PandasConverter'
        Forwarded to `Component`.
    data_io : default 'PandasIO'
        Forwarded to `Component`.
    **kwargs
        Any other keyword argument, forwarded unchanged to `Component`.
    """
    def __init__ (self, estimator=None, data_converter='PandasConverter', data_io='PandasIO',
                  **kwargs):
        # only the defaults of `data_converter` and `data_io` differ from the base class
        super().__init__ (estimator=estimator, data_converter=data_converter, data_io=data_io,
                          **kwargs)

### Usage example

In [62]:
# exports tests.core.test_block_types
#@pytest.mark.reference_fails
def test_pandas_component ():
    """
    Check that a default `PandasComponent` is backed by a `PandasConverter` 
    and a `PandasIO` object.
    """
    component = PandasComponent ()
    converter_name = type (component.data_converter).__name__
    io_name = type (component.data_io).__name__
    assert converter_name == 'PandasConverter'
    assert io_name == 'PandasIO'

In [63]:
tst.run (test_pandas_component, tag='dummy')

running test_pandas_component
