In [1]:
# hide
# default_exp core.block_types
import os
from nbdev.showdoc import *
if not os.path.exists('settings.ini'):
    os.chdir('..')
    
from block_types.core.block_types import __all__

# Block types

> Types of blocks

In [2]:
#export
from functools import partialmethod
from typing import Optional
import copy
import pickle
from pathlib import Path

from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import joblib
from IPython.display import display

# graphviz is an optional dependency: record whether it is available instead
# of failing at import time.
try:
    from graphviz import *
    imported_graphviz = True
except ImportError:
    # Only a missing/broken installation is tolerated; any other error
    # (e.g. KeyboardInterrupt) must still propagate.
    imported_graphviz = False

# block_types
from block_types.core.data_conversion import DataConverter, NoConverter, PandasConverter, data_converter_factory
from block_types.core.utils import (save_csv, 
                                    save_parquet, 
                                    save_multi_index_parquet, 
                                    save_keras_model, 
                                    save_csv_gz, 
                                    read_csv, 
                                    read_csv_gz)
from block_types.core.utils import DataIO, SklearnIO, PandasIO, NoSaverIO, ModelPlotter, Profiler
from block_types.core.utils import camel_to_snake
from block_types.utils.utils import (set_logger,
                                     replace_attr_and_store, 
                                     get_specific_dict_param, 
                                     get_hierarchy_level)
import block_types.config.bt_defaults as dflt

## Component

In [3]:
#export

class Component (ClassifierMixin, TransformerMixin, BaseEstimator):
    """Base component class used in our Pipeline."""
    def __init__ (self,
                  estimator=None,
                  name: Optional[str] = None,
                  group: str = 'group_0',
                  data_converter: Optional[DataConverter] = None,
                  data_io: Optional[DataIO] = None,
                  model_plotter: Optional[ModelPlotter] = None,
                  logger=None,
                  verbose: int = dflt.verbose,
                  name_logger:str = dflt.name_logger,
                  **kwargs):
        
        """
        Initialize attributes and fields.
        
        Parameters
        ----------
        estimator : estimator (classifier or transformer) or None, optional
            Estimator being wrapped.
        name : str or None, optional
            Name of component. If not provided, it is inferred from the name of the 
            estimator's class, or the name of the custom class defining the component.
        group : str, optional
            Name of the group this component belongs to.
            NOTE(review): presumably a dictionary passed under this name supplies 
            parameters shared by all components of the group -- confirm against 
            `get_specific_dict_param`.
        data_converter : DataConverter or None, optional
            Converts incoming data to format expected by component, and convert 
            outgoing result to format expected by caller. When None, a `NoConverter` 
            is used; otherwise the value is passed to `data_converter_factory`.
        data_io : DataIO or None, optional
            Manages data serialization and deserialization. When None, a `DataIO` 
            is created; otherwise a shallow copy of the given object is used.
        model_plotter : ModelPlotter or None, optional
            Helper object that allows to retrieve information to be shown about this 
            component, as part of a Pipeline diagram.
        logger : logging.logger or None, optional
            Logger used to write messages. When None, one is obtained with 
            `set_logger (name_logger, verbose=verbose)`.
        verbose : int, optional
            Verbosity, 0: warning or critical, 1: info, 2: debug.
        name_logger : str, optional
            Name given to `set_logger` when `logger` is None.
        **kwargs : dict
            Additional configuration. After being processed by 
            `obtain_config_params`, it is forwarded to the DataIO, data converter, 
            ModelPlotter and Profiler objects created here.
        """

        # name of current component, for logging and plotting purposes
        self._determine_component_name (name, estimator)
        
        # obtain hierarchy_level
        self.hierarchy_level = get_hierarchy_level (base_class=Component)
        
        # store __init__ attrs into `self`
        # NOTE(review): the code below reads self.logger, self.data_io, etc. without 
        # assigning them first, so this helper appears to copy the constructor 
        # arguments onto `self` -- confirm in block_types.utils.utils.
        replace_attr_and_store (base_class=Component)
        
        # obtain class-specific kwargs
        kwargs = self.obtain_config_params (**kwargs)
        
        if self.logger is None:
            self.logger = set_logger (self.name_logger, verbose=self.verbose)

        # object that manages loading / saving
        if self.data_io is None:
            self.data_io = DataIO (component=self, **kwargs)
        else:
            # work on a shallow copy so that the object given by the caller is not 
            # re-bound to this component
            self.data_io = copy.copy(self.data_io)
            self.data_io.setup (self)

        # data converter
        if self.data_converter is None:
            # TODO: have DataConverter store a reference to component, and use the logger from that reference.
            self.data_converter = NoConverter (**kwargs)
        else:
            self.data_converter = data_converter_factory (self.data_converter, 
                                                          **kwargs)
        # plotting model component
        if self.model_plotter is None:
            self.model_plotter = ModelPlotter (component=self, **kwargs)
        else:
            self.model_plotter.set_component (self)
            
        # profiling computational cost
        self.profiler = Profiler (self, **kwargs)
        
    def obtain_config_params (self, **kwargs):
        """Merge component-specific parameters into the global ones.

        `get_specific_dict_param` is asked for the key in `kwargs` (if any) that 
        holds a dictionary of parameters specific to this component. When such a 
        key exists, a copy of `kwargs` is updated with the content of that 
        dictionary, so the specific values win over the *global* ones; the 
        dictionary itself is kept in the result. The entries `verbose` and 
        `logger` are always set from the component's own attributes.

        Returns
        -------
        dict
            The resulting configuration.
        """
        specific_key = get_specific_dict_param (self, **kwargs)

        if specific_key is None:
            config = kwargs
        else:
            config = dict (kwargs)
            config.update (config[specific_key])

        config['verbose'] = self.verbose
        config['logger'] = self.logger

        return config

    def _determine_component_name (self, name: Optional[str], estimator) -> None:
        """
        Set `self.class_name` and `self.name`.

        `class_name` is the name of this object's class, unless that class is 
        listed in `__all__` and an estimator is given, in which case the name of 
        the estimator's class is used. `name` is the given one or, when None, the 
        snake-case version of `class_name`.
        """
        own_class_name = self.__class__.__name__
        wraps_estimator = (own_class_name in __all__) and (estimator is not None)
        self.class_name = estimator.__class__.__name__ if wraps_estimator else own_class_name

        self.name = camel_to_snake (self.class_name) if name is None else name
            
    def fit_like (self, X, y=None, load=None, save=None, split=None,
                  func='_fit', validation_data=None, test_data=None, **kwargs):
        """
        Estimates the parameters of the component based on given data X and labels y.
        
        Uses the previously fitted parameters if they're found in disk and load 
        is True.

        Parameters
        ----------
        X, y : data and labels, converted with the data converter before fitting.
        load, save : passed to `data_io.can_load_model` / `data_io.can_save_model`.
        split : str or None, optional
            If given, the component is switched to this split while fitting and 
            switched back afterwards, also when fitting raises.
        func : {'_fit', '_fit_apply'}
            '_fit' calls `self._fit`; '_fit_apply' calls the function found by 
            `_determine_fit_apply_func`.
        validation_data, test_data : optional
            Extra data handed to the fitting function, see `_add_validation_and_test`.
        **kwargs : only accepted when func='_fit_apply'.

        Returns
        -------
        `self` when func='_fit'; otherwise the result of fitting and applying. If a 
        stored estimator is loaded with func='_fit_apply', nothing is fitted and the 
        result is obtained with `self.apply (X, **kwargs)`.

        Raises
        ------
        ValueError
            If `func` is not one of the two valid values.
        AttributeError
            If kwargs are given with func='_fit' and the estimator has to be fitted.
        """
        if func not in ('_fit', '_fit_apply'):
            raise ValueError (f'function {func} not valid')

        self.profiler.start_timer ()
        
        # the split to go back to is kept in a local variable: a nested call to 
        # `apply` may overwrite `self.original_split`
        original_split = None
        if split is not None:
            original_split = self.original_split = self.data_io.split
            self.set_split (split)

        result = None
        try:
            self.logger.info (f'fitting {self.name} (using {self.data_io.split} data)')
                
            previous_estimator = None
            if self.data_io.can_load_model (load):
                previous_estimator = self.data_io.load_estimator()
            loaded = previous_estimator is not None
                
            if not loaded:
                X, y = self.data_converter.convert_before_fitting (X, y)
                additional_data = self._add_validation_and_test (validation_data, test_data)
                if func=='_fit':
                    if len(kwargs) > 0:
                        raise AttributeError (f'kwargs: {kwargs} not valid')
                    self._fit (X, y, **additional_data)
                else:
                    fit_apply_func = self._determine_fit_apply_func ()
                    assert fit_apply_func is not None, ('object must have _fit_apply method or one of '
                                                        'its aliases implemented when func="_fit_apply"')
                    result = fit_apply_func (X, y=y, **additional_data, **kwargs)
                self.data_converter.convert_after_fitting (X)
                if self.data_io.can_save_model (save):
                    self.data_io.save_estimator ()
            else:
                self.estimator = previous_estimator
                self.logger.info (f'loaded pre-trained {self.name}')
                
            self.profiler.finish_timer (method=func, split=self.data_io.split)

            if loaded and func=='_fit_apply':
                # nothing was fitted, so there is no fit-apply result yet: compute it 
                # with the loaded estimator (same as the fit().apply() fallback)
                result = self.apply (X, **kwargs)
        finally:
            if split is not None:
                self.set_split (original_split)
            
        return self if func=='_fit' else result
     
    fit = partialmethod (fit_like, func='_fit')
    
    def fit_apply (self, X, y=None, load_model=None, save_model=None, 
                   load_result=None, save_result=None, func='_fit', 
                   validation_data=None, test_data=None, **kwargs):
        """
        Fit the component on X and return the result of applying it to X.

        When a dedicated fit-apply function exists (see 
        `_determine_fit_apply_func`) it is run through `fit_like`; in that case 
        `load_result` and `save_result` are not used. Otherwise `fit` is called 
        and then `apply` on the same X, with `kwargs` given to `apply`.
        The `func` parameter is not used.
        """
        if self._determine_fit_apply_func () is None:
            # no dedicated implementation: fit first, then apply
            fitted = self.fit (X, y=y, 
                               load=load_model, save=save_model, 
                               validation_data=validation_data, 
                               test_data=test_data)
            return fitted.apply (X, load=load_result, save=save_result, **kwargs)

        return self.fit_like (X, y=y, 
                              load=load_model, save=save_model, 
                              func='_fit_apply', validation_data=validation_data,
                              test_data=test_data, **kwargs)
    
    def _add_validation_and_test (self, validation_data, test_data):
        additional_data = {}
        def add_data (data, split_name):
            if data is not None:
                if isinstance(data, tuple):
                    if len(data) > 0:
                        newX = data[0]
                    else:
                        self.logger.warning (f'empty {split_name}')
                        newX = None
                    if len(data) == 2:
                        newy = data[1]
                    elif len(data)==1:
                        newy = None
                    elif len(data)>2:
                        raise ValueError (f'{split_name} must have at most 2 elements')
                else:
                    newX = data
                    newy = None
                newX, newy = self.data_converter.convert_before_fitting (newX, newy)
                if newy is not None:
                    additional_data[split_name] = (newX, newy)
                else:
                    additional_data[split_name] = newX
        
        add_data (validation_data, 'validation_data')
        add_data (test_data, 'test_data')
        
        return additional_data
    
    # aliases
    fit_transform = fit_apply
    fit_predict = fit_apply

    def apply (self, *X, load=None, save=None, **kwargs):
        """
        Apply the component to the data X and return the result.

        One or more data items can be given. The work is done by 
        `_compute_result`, using the function selected by 
        `_determine_result_func`; `load`, `save` and `kwargs` are passed on to it.
        """
        self.profiler.start_timer ()
        return self._compute_result (X, self._determine_result_func (), 
                                     load=load, save=save, **kwargs)

    def _determine_result_func (self):
        implemented = []
        if callable(getattr(self, '_apply', None)):
            result_func = self._apply
            implemented += [result_func]
        if callable(getattr(self, '_transform', None)):
            result_func = self._transform
            implemented += [result_func]
        if callable(getattr(self, '_predict', None)):
            result_func = self._predict
            implemented += [result_func]
        if len(implemented)==0:
            if self.estimator is not None and callable(getattr(self.estimator, 'transform', None)):
                result_func = self.estimator.transform
                implemented += [result_func]
            if self.estimator is not None and callable(getattr(self.estimator, 'predict', None)):
                result_func = self.estimator.predict
                implemented += [result_func]
        if len (implemented) == 0:
            raise AttributeError (f'{self.class_name} must have one of _transform, _apply, or _predict methods implemented\n'
                                  f'Otherwise, self.estimator must have either predict or transform methods')
        if len(implemented) > 1:
            raise AttributeError (f'{self.class_name} must have only one of _transform, _apply, '
                                  f'or _predict methods implemented => found: {implemented}')
        return result_func
    
    def _determine_fit_apply_func (self):
        implemented = []
        result_func = None
        if callable(getattr(self, '_fit_apply', None)):
            result_func = self._fit_apply
            implemented += [result_func]
        if callable(getattr(self, '_fit_transform', None)):
            result_func = self._fit_transform
            implemented += [result_func]
        if callable(getattr(self, '_fit_predict', None)):
            result_func = self._fit_predict
            implemented += [result_func]
        if len(implemented)==0:
            if self.estimator is not None and callable(getattr(self.estimator, 'fit_transform', None)):
                result_func = self.estimator.fit_transform
                implemented += [result_func]
            if self.estimator is not None and callable(getattr(self.estimator, 'fit_predict', None)):
                result_func = self.estimator.fit_predict
                implemented += [result_func]
        if len(implemented) > 1:
            raise AttributeError (f'{self.class_name} must have only one of fit_transform, fit_apply, '
                                  f'or fit_predict methods implemented => found: {implemented}')
        return result_func
    
    # aliases for transform method
    __call__ = apply
    transform = apply
    predict = partialmethod (apply, converter_args=dict(new_columns=['prediction']))

    def _compute_result (self, X, result_func, load=None, save=None, split=None,
                         converter_args={}, **kwargs):
        
        if split is not None:
            self.original_split = self.data_io.split
            self.set_split (split)
            
        self.logger.info (f'applying {self.name} (on {self.data_io.split} data)')
            
        if len(X) == 1:
            X = X[0]
        previous_result = None
        if self.data_io.can_load_result (load):
            previous_result = self.data_io.load_result (split=split)
        if previous_result is None:
            X = self.data_converter.convert_before_transforming (X, **converter_args)
            if type(X) is tuple:
                result = result_func (*X, **kwargs)
            else:
                result = result_func (X, **kwargs)
            result = self.data_converter.convert_after_transforming (result, **converter_args)
            if self.data_io.can_save_result (save, split):
                self.data_io.save_result (result, split=split)
        else:
            result = previous_result
            self.logger.info (f'loaded pre-computed result')
            
        self.profiler.finish_timer ('apply', self.data_io.split)
        if split is not None:
            self.set_split (self.original_split)
            
        return result


    def _fit (self, X, y=None):
        if self.estimator is not None:
            self.estimator.fit (X, y)
            
    def show_result_statistics (self, result=None, training_data_flag=False) -> None:
        """
        Display the name of the component and a summary of its result.
        
        Parameters
        ----------
        result: DataFrame or other data structure or None, optional
            Result whose statistics are shown. If not provided, it is loaded 
            from disk with `data_io.load_result`.
        training_data_flag: bool, optional
            Only used when `result` is None: it is passed to 
            `set_training_data_flag` before loading.
            NOTE(review): `set_training_data_flag` is not defined in this class 
            in this file -- confirm it is provided elsewhere.
        """
        if result is not None:
            df = result
        else:
            self.set_training_data_flag (training_data_flag)
            df = self.data_io.load_result()
        
        if df is None:
            return

        display (self.name)
        # statistics are only shown for objects that know how to describe themselves
        if callable(getattr(df, 'describe', None)):
            display (df.describe())

    def assert_equal (self, path_reference_results: str, assert_equal_func=pd.testing.assert_frame_equal, **kwargs):
        """
        Compare the stored result of this component against a reference result.
        
        Parameters
        ----------
        path_reference_results: str
            Folder where the reference result is stored. The file name is not 
            part of it: it is taken from `data_io` (`result_file_name_training` 
            or `result_file_name_test`, depending on `data_io.training_data_flag`).
        assert_equal_func: function, optional
            Function called with the current result, the reference result and 
            `kwargs`. By default, `pd.testing.assert_frame_equal` is used, which 
            assumes the data type is DataFrame.
        """
        type_result = 'training' if self.data_io.training_data_flag else 'test'
        self.logger.info (f'comparing {type_result} results for {self.class_name}')
        
        self.logger.info (f'loading...')
        current_results = self.data_io.load_result ()

        reference_file_name = (self.data_io.result_file_name_training 
                               if self.data_io.training_data_flag 
                               else self.data_io.result_file_name_test)
        path_to_reference_file = Path(path_reference_results) / reference_file_name
        reference_results = self.data_io._load (path_to_reference_file, self.data_io.result_load_func)

        self.logger.info (f'comparing...')
        assert_equal_func (current_results, reference_results, **kwargs)
        self.logger.info (f'equal results\n')

    # ********************************
    # exposing some data_io and data_converters methods
    # ********************************
    def load_estimator (self):
        """Replace `self.estimator` by the one loaded by `data_io`, if any is returned."""
        loaded = self.data_io.load_estimator ()
        if loaded is None:
            # nothing was loaded: keep the current estimator
            return
        self.estimator = loaded
            
    def load_result (self, split=None):
        """Return what `data_io.load_result` returns for the given split."""
        stored_result = self.data_io.load_result (split=split)
        return stored_result
        
    # ********************************
    # setters
    # ********************************
    def set_split (self, split):
        """Forward `split` to `data_io.set_split`."""
        io_manager = self.data_io
        io_manager.set_split (split)
    
    def set_save_splits (self, save_splits):
        """Forward `save_splits` to `data_io.set_save_splits`."""
        io_manager = self.data_io
        io_manager.set_save_splits (save_splits)

    def set_save_model (self, save_model):
        """Forward `save_model` to `data_io.set_save_model`."""
        io_manager = self.data_io
        io_manager.set_save_model (save_model)
        
    def set_load_model (self, load_model):
        """Forward `load_model` to `data_io.set_load_model`."""
        io_manager = self.data_io
        io_manager.set_load_model (load_model)
        
    def set_save_result (self, save_result):
        """Forward `save_result` to `data_io.set_save_result`."""
        io_manager = self.data_io
        io_manager.set_save_result (save_result)
        
    def set_load_result (self, load_result):
        """Forward `load_result` to `data_io.set_load_result`."""
        io_manager = self.data_io
        io_manager.set_load_result (load_result)
        
    def set_data_io (self, data_io, copy=False):
        """
        Use `data_io` as the object that manages loading / saving for this component.

        Parameters
        ----------
        data_io : DataIO
            Object to be used. Its `setup` method is called with this component.
        copy : bool, optional
            If True, a shallow copy of `data_io` is stored instead of the object 
            itself, so the caller's object is not set up for this component.
        """
        if copy:
            # the `copy` parameter hides the `copy` module inside this method, 
            # so the function is imported under a different name
            from copy import copy as shallow_copy
            data_io = shallow_copy (data_io)
        self.data_io = data_io
        self.data_io.setup (self)

# ******************************************
# Subclasses of Component.
# Most of these are basically the same as GenericComponent, the only difference being that some parameters
# are overridden when constructing the object, to force a specific behavior
# ******************************************

### Tests for Component

#### Configuring component with global and specific parameters

In [4]:
import block_types.config.bt_defaults as dflt

# **********************************************************************
# test obtain_config_params method:
# the dictionary named after the component ('sky') overwrites the global 
# value of `second`, and the input dictionary is left untouched
# NOTE(review): the output recorded for this cell is a bare `AssertionError:`;
# the two asserts of this first test are the ones without a message -- 
# confirm which check fails.
# **********************************************************************
tr = Component(name='sky')
config = dict(first=1,
              second=2,
              third=3,
              sky=dict (second=4)
             )
config_r = tr.obtain_config_params (**config)
logger = set_logger (dflt.name_logger, verbose=dflt.verbose)
assert config_r=={'first': 1, 'second': 4, 'third': 3, 'sky': {'second': 4}, 'verbose': dflt.verbose, 'logger': logger}
assert config == {'first': 1, 'second': 2, 'third': 3, 'sky': {'second': 4}}

# **********************************************************************
# test that component saves results when using global 
# parameter save=True
# **********************************************************************
class MyTransform (Component):
    def __init__ (self,**kwargs):
        super().__init__ (**kwargs)
        
    def _fit (self, X, y=None):
        self.mu = X.mean()
    def _transform (self, X):
        return X-self.mu

path_results = 'testing_configuration'
tr = MyTransform (data_io = SklearnIO(
                                path_results=path_results,
                                save = True))

X = np.array([[1,2,3],[4,5,6]])
tr.fit_transform(X)

# one folder is expected for the model and one for the result of the 
# default split
import os
l = sorted(os.listdir(path_results))
assert l==['models','whole'], f'found: {l}'

# **********************************************************************
# test that component does not save results when we 
# use component-specific parameter MyTransform = dict(save=False)
# **********************************************************************
from block_types.utils.utils import remove_previous_results
remove_previous_results (path_results)

tr = MyTransform (data_io = SklearnIO(
                              path_results='testing_configuration',
                              save = True,
                              MyTransform = dict(save=False)
                            )
                 )
tr.fit_transform(X)
# nothing must have been written, so the folder must not exist
import pytest
with pytest.raises(FileNotFoundError):
    os.listdir(path_results)

AssertionError: 

In [None]:
# __init__ arguments are stored as attributes at every level of the 
# hierarchy of classes
class Intermediate (Component):
    def __init__ (self, x=3, y=4, **kwargs):
        super().__init__ (**kwargs)

class Final (Intermediate):
    def __init__ (self, z=6, h=[2,3,5], **kwargs):
        super().__init__ (**kwargs)

# values given by the caller replace the defaults
o = Final (x=9, h=[1,2,4])
assert (o.x, o.y, o.z, o.h) == (9, 4, 6, [1,2,4])

# a dictionary named after the class replaces attributes of that class
o = Final (y=7, z=10, h=[1,2,4], Final={'h': [9,11,10]})
assert (o.x, o.y, o.z, o.h) == (3, 7, 10, [9,11,10])

# only attributes specific of Final are replaced.
# trying to replace attributes specific of Intermediate 
# does not work
o = Final (y=7, z=10, h=[1,2,4], Intermediate={'y': 12})
assert (o.x, o.y, o.z, o.h) == (3, 7, 10, [1,2,4])

In [None]:
# attributes can also be replaced through a dictionary named after the 
# group of the component
class Intermediate (Component):
    def __init__ (self, x=3, y=4, **kwargs):
        super().__init__ (**kwargs)

class Final (Intermediate):
    def __init__ (self, z=6, h=[2,3,5], **kwargs):
        super().__init__ (**kwargs)

o = Final (x=9, h=[1,2,4], group='group_1', group_1={'y': 10, 'z':60})
assert (o.x, o.y, o.z, o.h) == (9, 10, 60, [1,2,4])

#### Transform method called with different aliases

In [None]:
import pytest

def check_doubling_aliases (component):
    """Check that transform, predict, apply and __call__ all return twice the input."""
    assert component.transform (3) == 6
    assert component.predict (3) == 6
    assert component.apply (3) == 6
    assert component (3) == 6

# test that we can implement _transform and use all the aliases 
# (transform, predict, apply and __call__)
class MyTransform (Component):
    def _transform (self, x):
        return x*2

my_transform = MyTransform()
check_doubling_aliases (my_transform)

# test that we can implement _apply and use all the aliases 
# (transform, predict, apply and __call__)
class MyTransform2 (Component):
    def _apply (self, x):
        return x*2

my_transform2 = MyTransform2()
check_doubling_aliases (my_transform2)

# test that we can implement _predict and use all the aliases 
# (transform, predict, apply and __call__)
class MyTransform3 (Component):
    def _predict (self, x):
        return x*2

my_transform3 = MyTransform3()
check_doubling_aliases (my_transform3)

# test that an exception is raised if none of _transform, _apply or 
# _predict is defined
class MyTransform4 (Component):
    def _wrong_method (self, x):
        return x*2
 
my_transform4 = MyTransform4 ()

with pytest.raises(AttributeError):
    my_transform4.transform(3)
    

# test that an exception is raised if more than one alias is implemented
class MyTransform5 (Component):
    def _predict (self, x):
        return x*2
    def _apply (self, x):
        return x*2
 
my_transform5 = MyTransform5 ()

with pytest.raises(AttributeError):
    my_transform5.transform(3)

#### Calling `predict` is handy when the result is a single array of predictions

In [None]:
# TODO: remove this cell
# The whole cell is disabled with `if False:`, so nothing below is executed.
if False:
    class MyTransform (Component):
        def __init__ (self, **kwargs):
            super().__init__ (
                data_converter=PandasConverter(**kwargs),
                **kwargs)

        def _predict (self, x):
            return x['a']+x['b']

    my_transform = MyTransform()

    df = pd.DataFrame ({'a': [10,20,30],'b':[4,5,6]})

    # check on the result of `transform`
    pd.testing.assert_frame_equal(my_transform.transform (df).to_frame(), 
                                  pd.DataFrame ({0: [14,25,36]})
                                 )

    # check on the result of `predict`, disabled a second time
    if False:
        pd.testing.assert_frame_equal(my_transform.predict (df), 
                                      pd.DataFrame ({0: [14,25,36]})
                                     )

#### The `transform` method and its aliases can be called with multiple inputs

In [None]:
# transform applied to multiple data items: each item is given to 
# _apply as a separate argument
class MyTransform (Component):
    def _apply (self, x, y):
        return x+y

my_transform = MyTransform ()
expected_sum = 7
result = my_transform.transform (3, 4)
print (result)
assert result == expected_sum

# transform applied to a single data item
class MyTransform2 (Component):
    def _apply (self, x):
        return x*2

my_transform2 = MyTransform2 ()
expected_double = 6
result = my_transform2.transform (3)
print (result)
assert result == expected_double

#### `fit_apply()` and its aliases `fit_transform(), fit_predict()`

`_fit_apply()` is called when implemented, otherwise `fit().apply()` is called

In [None]:
import numpy as np 

# component with _fit_apply implemented: fit_apply and its alias use it, 
# so the sum is multiplied by 10
class Transform1 (Component):
    def __init__ (self, **kwargs):
        super().__init__ (**kwargs)
    def _fit (self, X, y=None):
        self.sum = X.sum(axis=0)
    def _apply (self, X):
        return X + self.sum
    def _fit_apply (self, X, y=None):
        self.sum = X.sum(axis=0)*10
        return X + self.sum

tr1 = Transform1 ()
X = np.array ([100, 90, 10])
expected_with_fit_apply = X+2000
for fit_and_apply in (tr1.fit_apply, tr1.fit_transform):
    result = fit_and_apply (X)
    assert (result==expected_with_fit_apply).all()
    
# component without _fit_apply implemented: fit().apply() is used instead
class Transform2 (Component):
    def __init__ (self, **kwargs):
        super().__init__ (**kwargs)
    def _fit (self, X, y=None):
        self.sum = X.sum(axis=0)
    def _apply (self, X):
        return X + self.sum

tr2 = Transform2 ()
expected_with_fit_then_apply = X+200
for fit_and_apply in (tr2.fit_apply, tr2.fit_transform):
    result = fit_and_apply (X)
    assert (result==expected_with_fit_then_apply).all()

#### Getting validation_data and test_data

In [None]:
# component whose _fit receives, prints and keeps the validation and test data
class Transform1 (Component):
    def __init__ (self, **kwargs):
        super().__init__ (**kwargs)
    def _fit (self, X, y=None, validation_data=None, test_data=None):
        self.sum = X.sum(axis=0)
        
        print (f'validation_data: {validation_data}')
        print (f'test_data: {test_data}')
        
        self.validation_data = validation_data
        self.test_data = test_data
                
    def _apply (self, X):
        return X + self.sum

tr1 = Transform1 ()
X = np.array ([100, 90, 10])
validation_data = np.array ([100, 90, 10])*10
test_data = np.array ([100, 90, 10])*100

# case 1: validation_data and test_data are not tuples
result = tr1.fit_apply (X, validation_data=validation_data, test_data=test_data)
assert (tr1.validation_data==validation_data).all()
assert (tr1.test_data==test_data).all()

# case 2: validation_data is a tuple, and test_data is not given
result = tr1.fit_apply (X, validation_data=(validation_data,1))
received_X, received_y = tr1.validation_data[0], tr1.validation_data[1]
assert (received_X==validation_data).all()
assert received_y==1
assert tr1.test_data is None

# case 3: validation_data is a tuple with more than 2 elements, exception is raised
import pytest
too_many_elements = (validation_data,1,2)
with pytest.raises(ValueError):
    result = tr1.fit_apply (X, validation_data=too_many_elements)

#### saving / loading

In [None]:
from block_types.utils.utils import remove_previous_results

# start from a clean folder
path_results = 'component_loading_saving'
remove_previous_results (path_results=path_results)

import numpy as np 
from sklearn.utils import Bunch

# component that keeps its fitted parameters in `self.estimator`, 
# and uses SklearnIO for loading / saving
class Transform1 (Component):
    def __init__ (self, **kwargs):
        super().__init__ (data_io = SklearnIO (**kwargs),
                          **kwargs)
        self.estimator = Bunch(sum=0)
    def _fit (self, X, y=None):
        self.estimator.sum = X.sum(axis=0)
    def _apply (self, X):
        return X + self.estimator.sum
    
tr1 = Transform1 (path_results=path_results)
X = np.array ([100, 90, 10])
tr1.fit (X)
result = tr1.apply (X)

# a second component with the same path loads the estimator saved by the first one
tr2 = Transform1 (path_results=path_results)
tr2.load_estimator()
assert tr2.estimator.sum == tr1.estimator.sum

# ... and the result saved by the first one
result2 = tr2.data_io.load_result ()
assert (result2 == sum(X)+X).all()

import os

# one file for the result of the default split, and one for the estimator
assert os.listdir (f'{path_results}/whole')==['transform1_result.pk']
assert os.listdir (f'{path_results}/models')==['transform1_estimator.pk']

# the result for the 'test' split is stored in its own folder
result_b = tr1.apply (X*2, split='test')
result2b = tr2.data_io.load_result (split='test')
assert (result_b==result2b).all()
assert os.listdir (f'{path_results}/test')==['transform1_result.pk']

# without indicating the split, the result of the default split is loaded
result2b = tr2.data_io.load_result ()
assert (result_b!=result2b).all()

remove_previous_results (path_results=path_results)


# Test that no saving is done if save=False
tr1 = Transform1 (path_results=path_results,
                 save=False)
tr1.fit (X)
result = tr1.apply (X)
assert not os.path.exists(path_results)


#### Logger

In [None]:
# fit and apply with two verbosity levels, to compare the messages logged
for verbosity in (0, 1):
    tr1 = Transform1 (verbose=verbosity)
    tr1.fit (X)
    result = tr1.apply (X)

In [None]:
#show_doc (Component, name='Component', title_level=3);
#show_doc (Component.__init__, name='__init__', title_level=4);

#### Passing data_converter and data_io

In [None]:
class MyTransform (Component):
    def __init__ (self, **kwargs):
        # `self` must not be given explicitly to super().__init__: it would be 
        # taken as the first parameter of Component.__init__, i.e. `estimator`
        super().__init__ (data_converter='PandasConverter',
                          **kwargs)
    def _apply (self, x):
        return x*2

# the data converter is built from its class name and receives the 
# parameters given to the component
my_transform = MyTransform (separate_labels=False)
assert my_transform.data_converter.separate_labels is False
assert type(my_transform.data_converter) is PandasConverter

# example where data-converter uses class-specific parameters
config = dict(separate_labels=False, MyTransform=dict(separate_labels=True))
my_transform = MyTransform (**config)
assert my_transform.data_converter.separate_labels is True
assert config['separate_labels'] is False

In [None]:
# example using data_io
import os

import pandas as pd
from sklearn.utils import Bunch
from block_types.utils.utils import remove_previous_results

path_results = 'test_data_io'
remove_previous_results (path_results=path_results)

class MyTransform (Component):
    def __init__ (self, **kwargs):
        # `self` must not be given explicitly to super().__init__: it would be 
        # taken as the first parameter of Component.__init__, i.e. `estimator`
        super().__init__ (result_io='pandas',
                          **kwargs)
    def _fit (self, X, y=None):
        self.estimator = Bunch(sum=100)
        
    def _apply (self, x):
        return pd.DataFrame ([[1,2],[3,4]], columns=['a','b'])

# the class-specific path is used instead of the global one
my_transform = MyTransform (path_results='do_not_use', MyTransform=dict(path_results=path_results))
my_transform.fit (1)
assert os.listdir (f'{path_results}/models')==['my_transform_estimator.pk']

df1 = my_transform.apply (1)
assert os.listdir (f'{path_results}/whole')==['my_transform_result.parquet']

assert not os.path.exists ('do_not_use')

# a new object has no estimator until the stored one is loaded
del my_transform
my_transform = MyTransform (path_results='do_not_use', MyTransform=dict(path_results=path_results))
assert my_transform.estimator is None
my_transform.load_estimator()
assert my_transform.estimator == Bunch(sum=100)

df2 = my_transform.load_result ()
pd.testing.assert_frame_equal (df1, df2)

remove_previous_results (path_results=path_results)

In [None]:
#export
class SamplingComponent (Component):
    """
    Component whose transform method makes use of the labels.

    The data received by the transform method is assumed to hold the 
    ground-truth labels in one of its columns. Thanks to this, the transform 
    method can change the number of observations, i.e., the number of rows 
    both in the data and in the labels. See the `PandasConverter` class in 
    `block_types.core.data_conversion`.
    """
    def __init__ (self, estimator=None, transform_uses_labels=True, **kwargs):
        # this class sets its own default for `transform_uses_labels`; all 
        # the arguments are forwarded to `Component`
        super().__init__ (transform_uses_labels=transform_uses_labels,
                          estimator=estimator,
                          **kwargs)

In [None]:
#show_doc (SamplingComponent, title_level=3)

In [None]:
#export
class SklearnComponent (Component):
    """
    Component that saves estimator parameters in pickle format.
    
    Convenience subclass used when the results can be saved in 
    pickle format. See `SklearnIO` class in `core.utils`.

    Parameters
    ----------
    estimator : object, optional
        Forwarded as is to `Component`.
    data_io : DataIO, optional
        Object passed to `Component` as `data_io`. When None, a `SklearnIO` 
        is built from `**kwargs`.
    transform_uses_labels : bool, default False
        Forwarded as is to `Component`.
    **kwargs
        Forwarded to `Component`, and also to `SklearnIO` when `data_io` 
        is None.
    """
    def __init__ (self,
                  estimator=None,
                  data_io=None,
                  transform_uses_labels=False,
                  **kwargs):

        if data_io is None:
            data_io = SklearnIO (**kwargs)
        
        # forward the value received from the caller: hard-coding False here 
        # would silently discard an explicit `transform_uses_labels=True`
        super().__init__ (estimator=estimator,
                          data_io = data_io,
                          transform_uses_labels=transform_uses_labels,
                          **kwargs)

# alias
PickleSaverComponent = SklearnComponent

In [None]:
#show_doc (SklearnComponent, name = 'SklearnComponent', title_level=3)

In [None]:
#export
class NoSaverComponent (Component):
    """Component that does not save any data (uses `NoSaverIO` unless `data_io` is given)."""
    def __init__ (self, estimator=None, data_io=None, **kwargs):
        # when the caller does not supply `data_io`, build a `NoSaverIO` from 
        # the same keyword arguments that are forwarded to `Component`
        data_io = NoSaverIO (**kwargs) if data_io is None else data_io
        super().__init__ (data_io=data_io,
                          estimator=estimator,
                          **kwargs)

In [None]:
#show_doc (NoSaverComponent, name = 'NoSaverComponent', title_level=3)

In [None]:
#export
class OneClassSklearnComponent (SklearnComponent):
    """Component that uses only normal data (labelled with 0) for fitting parameters."""
    def __init__ (self,
                  estimator=None,
                  **kwargs):
        super().__init__ (estimator=estimator,
                          **kwargs)

    def _fit (self, X, y=None):
        """
        Fit the estimator using only the observations whose label is 0.

        Parameters
        ----------
        X : indexable with a boolean mask
            Data; only the rows where `y==0` are passed to the estimator.
        y : supports `y==0` and indexing with the resulting mask
            Labels. Required, even though the signature defaults to None.

        Raises
        ------
        AssertionError
            If `y` is None or if the component has no estimator.
        """
        assert y is not None, 'y must be provided in OneClassSklearnComponent class'
        # the same mask is applied to data and labels, so that both have the 
        # same number of rows when they reach the estimator
        normal = y==0
        X = X[normal]
        y = y[normal]

        assert self.estimator is not None, 'estimator must be provided in OneClassSklearnComponent class'
        self.estimator.fit (X, y)

In [None]:
#show_doc (OneClassSklearnComponent, name = 'OneClassSklearnComponent', title_level=3)

In [None]:
#export
class PandasComponent (Component):
    """
    Component that keeps the DataFrame format, both for incoming data and results.

    By default, this component also writes its results in parquet format.
    The data conversion performed is described in `PandasConverter`, in 
    `core.data_conversion`.
    """
    def __init__ (self, estimator=None, data_converter=None, data_io=None, **kwargs):
        # whatever the caller does not supply is built from the same keyword 
        # arguments that are forwarded to `Component`
        data_converter = PandasConverter (**kwargs) if data_converter is None else data_converter
        data_io = PandasIO (**kwargs) if data_io is None else data_io

        super().__init__ (data_io=data_io,
                          data_converter=data_converter,
                          estimator=estimator,
                          **kwargs)

In [None]:
#show_doc (PandasComponent, name='PandasComponent', title_level=3)