In [1]:
# hide
# default_exp core.compose
import os
from nbdev.showdoc import *
# If `settings.ini` is not found in the current working directory, move one 
# level up (presumably to the project root, where nbdev keeps it -- TODO confirm).
if not os.path.exists('settings.ini'):
    os.chdir('..')

# Compose transforms

> Classes and utilities for composed transforms

In [2]:
#export
import pandas as pd

from block_types.core.block_types import Component, PandasComponent, SamplingComponent
from block_types.core.data_conversion import PandasConverter
from block_types.core.utils import PandasIO

In [3]:
#export
class Pipeline (SamplingComponent):
    """
    Pipeline composed of a list of components that run sequentially.
    
    During training, the components of the list are trained one after the other: 
    each component is fed the result of transforming the data with the 
    components located before it in the pipeline.
    
    The `Pipeline` class is a subclass of `SamplingComponent`, which itself is a 
    subclass of `Component`. This provides the functionality of `Component` 
    to any implemented pipeline, such as logging messages, loading / saving 
    results, and converting the data format so that the pipeline can work as 
    part of other pipelines with potentially other data formats.
    
    Being a subclass of `SamplingComponent`, the `transform` method 
    receives an input data `X` that contains both data and labels. 
    
    Furthermore, the Pipeline constructor sets `separate_labels=False` by default,
    which means that the `fit` method also receives an input data `X` that contains 
    not only data but also labels. This is necessary because some of the components in 
    the pipeline might be of class `SamplingComponent`, and such components 
    need the input data `X` to contain labels when calling `transform` (note that 
    this method is called when calling `fit` on a pipeline, since we do `fit_transform`
    on all the components except for the last one).
    """
    def __init__ (self, separate_labels = False, **kwargs):
        """Assigns attributes and calls parent constructor.

        Parameters
        ----------
        separate_labels: bool, optional
            whether or not the fit method receives the labels in a separate `y` vector 
            or in the same input `X`, as an additional variable. See description of 
            Pipeline class for more details.
        **kwargs:
            forwarded unchanged to the parent constructor.
        """

        # `components` must exist before calling super().__init__(): the constructor 
        # of Component calls a method that is overridden in Pipeline, and that 
        # method iterates over this field
        self.components = []

        super().__init__ (separate_labels = separate_labels, 
                          **kwargs)

        self.set_training_data_flag(False)

    def _fit (self, X, y=None):
        """
        Fit components of the pipeline, given data X and labels y.
        
        By default, y will be None, and the labels are part of `X`, as a variable.
        All the components but the last one are fitted and used to transform `X`; 
        the last one is only fitted. The training data flag is left set to True.
        """
        self.set_training_data_flag (True)
        if len(self.components) == 0:
            # nothing to fit: an empty pipeline behaves as the identity, 
            # consistently with `_predict`
            return
        for component in self.components[:-1]:
            X = component.fit_transform (X, y)
        self.components[-1].fit (X, y)

    def _predict (self, X):
        """Transform data with components of pipeline, and predict labels with last component. 
        
        In the current implementation, we consider prediction a form of mapping, 
        and therefore a special type of transformation: `transform` is called 
        on every component, including the last one."""
        self.set_training_data_flag (False)
        for component in self.components:
            X = component.transform (X)

        return X
    
    # transforming and predicting share the same implementation
    _transform = _predict

    def construct_diagram (self, training_data_flag=None, include_url=False, port=4000, project='block_types'):
        """
        Construct diagram of the pipeline components, data flow and dimensionality.
        
        By default, we use test data to show the number of observations 
        in the output of each component. This can be changed passing 
        `training_data_flag=True`

        Parameters
        ----------
        training_data_flag: bool or None, optional
            resolved with `get_training_data_flag` when None.
        include_url: bool, optional
            if True, each node links to 
            `http://localhost:{port}/{project}/{module path}.html#{node name}`
        port: int, optional
            port used for building the URLs.
        project: str, optional
            project name used for building the URLs.

        Returns
        -------
        graphviz.Digraph
            the diagram; it is returned without being rendered.
        """
        # imported here so that graphviz is only required when a diagram is built
        from graphviz import Digraph

        training_data_flag = self.get_training_data_flag (training_data_flag)

        if include_url:
            base_url = f'http://localhost:{port}/{project}'
        else:
            URL = ''

        node_name = 'data'
        output = 'train / test'

        f = Digraph('G', filename='fsm2.svg')
        f.attr('node', shape='circle')

        f.node(node_name)

        f.attr('node', shape='box')
        for component in self.components:
            last_node_name = node_name
            last_output = output
            node_name = component.model_plotter.get_node_name()
            if include_url:
                URL = f'{base_url}/{component.model_plotter.get_module_path()}.html#{node_name}'
            f.node(node_name, URL=URL)
            # the edge entering a component is labelled with the output of the previous one
            f.edge(last_node_name, node_name, label=last_output)
            output = component.model_plotter.get_edge_name(training_data_flag=training_data_flag)

        last_node_name = node_name
        node_name = 'output'
        f.attr('node', shape='circle')
        f.edge(last_node_name, node_name, label=output)

        return f

    def show_result_statistics (self, training_data_flag=None):
        """
        Show statistics about results obtained by each component. 
        
        By default, this is shown on test data, although this can change setting 
        `training_data_flag=True`
        """
        training_data_flag = self.get_training_data_flag (training_data_flag)

        for component in self.components:
            component.show_result_statistics(training_data_flag=training_data_flag)

    def show_summary (self, training_data_flag=None):
        """
        Show list of pipeline components, data flow and dimensionality.
        
        By default, we use test data to show the number of observations 
        in the output of each component. This can be changed passing 
        `training_data_flag=True`
        """
        training_data_flag = self.get_training_data_flag (training_data_flag)

        for i, component in enumerate(self.components):
            node_name = component.model_plotter.get_node_name()
            output = component.model_plotter.get_edge_name(training_data_flag=training_data_flag)
            print (f'{"-"*100}')
            print (f'{i}: {node_name} => {output}')


    def get_training_data_flag (self, training_data_flag=None):
        """
        Resolve the value of `training_data_flag`.
        
        An explicit (not None) value is returned as is. Otherwise we use 
        `self.data_io.training_data_flag` if it is not None, and False if it is.
        """
        if training_data_flag is None:
            if self.data_io.training_data_flag is not None:
                training_data_flag = self.data_io.training_data_flag
            else:
                training_data_flag = False

        return training_data_flag

    def assert_equal (self, path_reference_results, assert_equal_func=pd.testing.assert_frame_equal, **kwargs):
        """Compare results stored in current run against reference results stored in given path.
        
        The comparison is delegated to the `assert_equal` method of each component; 
        a confirmation message is logged and printed once all of them have returned."""

        for component in self.components:
            component.assert_equal (path_reference_results, assert_equal_func=assert_equal_func, **kwargs)
        self.logger.info ('both pipelines give the same results')
        print ('both pipelines give the same results')

    # *************************
    # setters: each one updates the pipeline itself through the parent class, 
    # and then propagates the same value to all the components
    # *************************
    def set_training_data_flag (self, training_data_flag):
        super().set_training_data_flag (training_data_flag)
        for component in self.components:
            component.set_training_data_flag (training_data_flag)

    def set_save_result_flag_test (self, save_result_flag_test):
        super().set_save_result_flag_test (save_result_flag_test)
        for component in self.components:
            component.set_save_result_flag_test (save_result_flag_test)

    def set_save_result_flag_training (self, save_result_flag_training):
        super().set_save_result_flag_training (save_result_flag_training)
        for component in self.components:
            component.set_save_result_flag_training (save_result_flag_training)

    def set_save_result_flag (self, save_result_flag):
        super().set_save_result_flag (save_result_flag)
        for component in self.components:
            component.set_save_result_flag (save_result_flag)

    def set_overwrite (self, overwrite):
        super().set_overwrite (overwrite)
        for component in self.components:
            component.set_overwrite (overwrite)

    def set_save_fitting (self, save_fitting):
        super().set_save_fitting (save_fitting)
        for component in self.components:
            component.set_save_fitting (save_fitting)

In [4]:
# Documentation cell, currently disabled: the `if False:` guard means that none 
# of the `show_doc` calls below are executed.
if False:
    show_doc (Pipeline, title_level=3)
    show_doc (Pipeline.__init__, name='__init__', title_level=4)
    show_doc (Pipeline.construct_diagram, name='construct_diagram', title_level=4)
    show_doc (Pipeline.show_summary, name='show_summary', title_level=4)
    show_doc (Pipeline.show_result_statistics, name='show_result_statistics', title_level=4)
    show_doc (Pipeline.assert_equal, name='assert_equal', title_level=4)

In [5]:
# export
def make_pipeline(*components, cls=Pipeline, **kwargs):
    """
    Build an object of class `cls` whose components are `components`.
    
    The object is created as `cls (**kwargs)`, and the positional arguments 
    are then stored, in the given order, as a list in its `components` field.
    """
    new_pipeline = cls (**kwargs)
    new_pipeline.components = [*components]
    return new_pipeline

In [6]:
from sklearn.preprocessing import FunctionTransformer

# two chained function transformers: add 1, then multiply by 2
tr1 = Component(FunctionTransformer (lambda x: x+1))
tr2 = Component(FunctionTransformer (lambda x: x*2))
pipeline = make_pipeline (tr1, tr2)
result = pipeline.transform (3)

# expected result: (3 + 1) * 2 == 8
print (result)
assert result == 8

applying pipeline transform
applying function_transformer transform
applying function_transformer transform


8


In [7]:
# export
def pipeline_factory (pipeline_class, **kwargs):
    """Creates a pipeline object given its class `pipeline_class`
    
    Parameters
    ----------
    pipeline_class : class or str
        Name of the pipeline class used for creating the object. 
        This can be either of type string or class. A string is evaluated 
        with `eval`, so it must be an expression that yields the class. 
    **kwargs :
        Passed to the constructor of the class.
        
    Returns
    -------
    object
        Result of calling the class with `**kwargs`.
        
    Raises
    ------
    ValueError
        If `pipeline_class` is neither a string nor a class.
    """
    # isinstance (instead of an exact type comparison) also accepts classes 
    # created with a metaclass (e.g. abstract base classes) and str subclasses
    if isinstance (pipeline_class, str):
        # NOTE(review): eval runs arbitrary expressions; only use with trusted strings
        pipeline_cls = eval(pipeline_class)
    elif isinstance (pipeline_class, type):
        pipeline_cls = pipeline_class
    else:
        raise ValueError (f'pipeline_class needs to be either string or class, we got {pipeline_class}')

    return pipeline_cls (**kwargs)

In [8]:
#export
class PandasPipeline (Pipeline):
    """
    Pipeline that saves results in parquet format, and preserves DataFrame format.
    
    See `Pipeline` class for an explanation of using `separate_labels=False`
    """
    def __init__ (self, 
                  data_converter=None,
                  data_io=None,
                  separate_labels=False,
                  **kwargs):
        """Builds default pandas converter / IO objects and calls parent constructor.
        
        Parameters
        ----------
        data_converter : object or None, optional
            If None, a `PandasConverter` is created with `separate_labels` 
            and `**kwargs`.
        data_io : object or None, optional
            If None, a `PandasIO` is created with `**kwargs`.
        separate_labels : bool, optional
            Forwarded to the default converter and to the parent constructor.
        **kwargs :
            Forwarded to the default converter / IO objects and to the 
            parent constructor.
        """
        if data_converter is None:
            data_converter = PandasConverter (separate_labels=separate_labels,
                                              **kwargs)
        if data_io is None:
            data_io = PandasIO (**kwargs)
        # `self` must not be passed explicitly to super().__init__: as an extra 
        # positional argument it would be taken as `separate_labels` by the parent
        super().__init__ (separate_labels=separate_labels,
                          data_converter=data_converter,
                          data_io=data_io,
                          **kwargs)

In [9]:
#export
class ColumnSelector (Component):
    """Component that keeps only the columns `columns` of its input."""
    def __init__ (self, 
                  columns=None,
                  **kwargs):
        """
        Parameters
        ----------
        columns : list or None, optional
            Columns to be selected. None (default) is equivalent to an 
            empty list.
        **kwargs :
            Forwarded to the parent constructor.
        """
        super().__init__ (**kwargs)
        # a fresh list per instance: a mutable default argument would be 
        # shared by all the objects created without `columns`
        self.columns = [] if columns is None else columns
    
    def _transform (self, df):
        """Returns `df[self.columns]`."""
        return df[self.columns]

In [10]:
# small DataFrame with four columns, used to check that only the requested ones are kept
df = pd.DataFrame ({'x1': list(range(5)),
                    'x2': list(range(5,10)),
                    'x3': list(range(15,20)),
                    'x4': list(range(25,30))
                   })
dfr = ColumnSelector(columns=['x2','x4']).transform(df)
# the result must be equal, element by element, to plain column indexing
assert (dfr==df[['x2','x4']]).all().all()

In [11]:
#export
class Concat (Component):
    """Component that joins the DataFrames it receives side by side (`axis=1`)."""
    def __init__ (self, **kwargs):
        super().__init__ (**kwargs)

    def _apply (self, *dfs):
        """Returns the column-wise concatenation of `dfs`, in the given order."""
        frames = [*dfs]
        return pd.concat(frames, axis=1)

In [12]:
# export
class _BaseColumnTransformer (Pipeline):
    """
    Pipeline whose components all receive the same input, in parallel branches.
    
    Unlike `Pipeline`, the components are not chained: each one is fitted on / 
    applied to the original input, and the results are joined with a `Concat` 
    component.
    """
    def __init__ (self, **kwargs):
        super().__init__ (**kwargs)
        # component used for joining the outputs of the branches
        self.concat = Concat (**kwargs)
    
    def _fit (self, df, y=None):
        """Fit each component on the same input `df`, forwarding the labels `y`.
        
        Returns `self`."""
        for component in self.components:
            # `y` is forwarded (instead of dropped) so that labels given 
            # separately also reach the branches, as done in `Pipeline._fit`
            component.fit (df, y)
        return self
    
    def _transform (self, df):
        """Transform `df` with each component and concatenate the results."""
        dfs = [component.transform (df) for component in self.components]
        df_result = self.concat.transform (*dfs)
        return df_result
    
class ColumnTransformer (_BaseColumnTransformer):
    """
    Column transformer built from `(name, transformer, columns)` tuples.
    
    Each tuple is turned into a pipeline by `make_column_transformer_pipelines`.
    """
    def __init__ (self, *transformers, **kwargs):
        # the parent constructor must run first: `Pipeline.__init__` sets 
        # `self.components = []`, which would discard components assigned before
        super().__init__ (**kwargs)
        self.components = make_column_transformer_pipelines (*transformers, **kwargs)

In [13]:
# export
class Identity (Component):
    """Component whose `_transform` returns its input unchanged."""
    def __init__ (self, **kwargs):
        super ().__init__ (**kwargs)

    def _transform (self, X):
        """Returns `X` as is."""
        return X
    
def make_column_transformer_pipelines (*transformers, **kwargs):
    """
    Create one pipeline per `(name, transformer, columns)` tuple.
    
    Each pipeline is made of a `ColumnSelector` for `columns` followed by 
    `transformer`, and is given the name `name`. The string `'passthrough'` 
    can be used as transformer, in which case an `Identity` component is used.
    
    Parameters
    ----------
    *transformers : tuples
        Tuples of the form `(name, transformer, columns)`.
    **kwargs :
        Forwarded to the `ColumnSelector`, `Identity` and pipeline objects 
        created here. A `name` entry is ignored, since each pipeline takes 
        its name from its own tuple.
        
    Returns
    -------
    list
        The pipelines, in the same order as `transformers`.
    """
    # a 'name' in kwargs would collide with the explicit `name` keyword 
    # passed to make_pipeline below (TypeError: multiple values)
    shared_kwargs = {k: v for k, v in kwargs.items() if k != 'name'}
    
    pipelines = []
    for name, transformer, columns in transformers:
        if isinstance (transformer, str) and transformer == 'passthrough':
            transformer = Identity (**shared_kwargs)
        pipeline = make_pipeline(ColumnSelector(columns, **shared_kwargs), 
                                 transformer, 
                                 name = name,
                                 **shared_kwargs)
        pipelines.append (pipeline)
    
    return pipelines


def make_column_transformer (*transformers, **kwargs):
    """
    Create a column transformer from `(transformer, columns)` tuples.
    
    A name is generated for each tuple as `{transformer_name}_{columns_name}`: 
    `transformer_name` is `'pass'` for the string `'passthrough'`, the `name` 
    attribute of the transformer if it has one, and its class name otherwise; 
    `columns_name` is made of the first character of each column name, 
    truncated to 5 characters.
    
    Parameters
    ----------
    *transformers : tuples
        Tuples of the form `(transformer, columns)`.
    **kwargs :
        Forwarded to the column transformer and to the pipelines created for 
        each tuple. A `name` entry is only applied to the column transformer, 
        since the pipelines use the generated names.
        
    Returns
    -------
    _BaseColumnTransformer
        Object whose components are the pipelines created for the tuples.
    """
    transformers_with_name = []
    for transformer, columns in transformers:
        # str() so that non-string column labels (e.g. integers) are supported
        columns_name = ''.join([str(x)[:1] for x in columns])[:5]
        if isinstance (transformer, str) and transformer == 'passthrough':
            transformer_name = 'pass'
        elif hasattr(transformer, 'name'):
            transformer_name = transformer.name
        else:
            transformer_name = transformer.__class__.__name__
        name = f'{transformer_name}_{columns_name}'
        transformers_with_name.append ((name, transformer, columns))
    
    # the generated names are used for the pipelines, so a 'name' in kwargs 
    # must not be forwarded to them
    pipeline_kwargs = {k: v for k, v in kwargs.items() if k != 'name'}
    pipelines = make_column_transformer_pipelines (*transformers_with_name, **pipeline_kwargs)
    # kwargs are also applied to the column transformer itself, so that it is 
    # configured like the pipelines it contains
    column_transformer = _BaseColumnTransformer (**kwargs)
    column_transformer.components = pipelines
    return column_transformer
    

In [47]:
import pandas as pd
from sklearn.preprocessing import FunctionTransformer

# DataFrame with four "continuous" and two "categorical" columns
df = pd.DataFrame ({'cont1': list(range(5)),
                    'cont2': list(range(5,10)),
                    'cont3': list(range(15,20)),
                    'cont4': list(range(25,30)),
                    'cat_1': list([1,2,3,2,1]),
                    'cat_2': list([0,1,1,0,0])
                    })

# tr1 adds 1; tr2 multiplies by 2 and is given explicit names for its output columns
tr1 = Component(FunctionTransformer (lambda x: x+1), name='tr1')
tr2 = PandasComponent(FunctionTransformer (lambda x: x*2), transformed_columns=['cont2_bis','cat_1'], name='tr2')

# each transformer is applied to its own subset of columns
column_transformer = make_column_transformer (
    (tr1, ['cont2', 'cont4']),
    (tr2, ['cont2', 'cat_1'])
)
dfr = column_transformer.transform(df)

# display and test
display(dfr)
# each group of output columns must match the transformer applied directly on its input columns
assert (dfr[['cont2','cont4']] == tr1(df[['cont2','cont4']])).all().all()
assert (dfr[['cont2_bis','cat_1']] == tr2(df[['cont2','cat_1']])).all().all()
# columns appear in the order of the transformers, tr2 ones with their new names
assert (dfr.columns == ['cont2','cont4', 'cont2_bis','cat_1']).all()

applying __base_column_transformer transform
applying tr1_cc transform
applying column_selector transform
applying tr1 transform
applying tr2_cc transform
applying column_selector transform
applying tr2 transform
applying concat transform


Unnamed: 0,cont2,cont4,cont2_bis,cat_1
0,6,26,10,2
1,7,27,12,4
2,8,28,14,6
3,9,29,16,4
4,10,30,18,2


applying tr1 transform
applying tr2 transform


In [49]:
# same as before, but the second group of columns is passed through unchanged
column_transformer = make_column_transformer (
    (tr1, ['cont1', 'cont4']),
    ('passthrough', ['cont2', 'cat_1'])
)
dfr = column_transformer.transform(df)

# display and test
display(dfr)
assert (dfr[['cont1','cont4']] == tr1(df[['cont1','cont4']])).all().all()
# passthrough columns must be identical to the input ones
assert (dfr[['cont2','cat_1']] == df[['cont2','cat_1']]).all().all()
assert (dfr.columns == ['cont1','cont4', 'cont2','cat_1']).all()

applying __base_column_transformer transform
applying tr1_cc transform
applying column_selector transform
applying tr1 transform
applying pass_cc transform
applying column_selector transform
applying identity transform
applying concat transform


Unnamed: 0,cont1,cont4,cont2,cat_1
0,1,26,5,1
1,2,27,6,2
2,3,28,7,3
3,4,29,8,2
4,5,30,9,1


applying tr1 transform


In [44]:
class SumTimes100 (Component):
    """
    Toy component with a fitted state, used for testing.
    
    `_fit` stores the sum of each column. `_transform` builds three new columns 
    from the two first input columns and those sums.
    """
    def _fit (self, X, y=None):
        # column-wise sums of the data seen during fit
        self.sum = X.sum(axis=0)

    def _transform (self, X):
        sums = self.sum.values
        first_column = X.iloc[:,0].values
        second_column = X.iloc[:,1].values
        new_columns = {'c1_times100': sums[0]*100 + first_column,
                       'c2_times100': sums[1]*100 + second_column,
                       'c2_times1000': sums[1]*1000 + second_column}
        return pd.DataFrame (new_columns)
        
# tr1 needs to be fitted (it stores column sums); tr2 multiplies by 2
tr1 = SumTimes100 ()
tr2 = PandasComponent(FunctionTransformer (lambda x: x*2), name='tr2')

column_transformer = make_column_transformer (
    (tr1, ['cont2', 'cont4']),
    (tr2, ['cont2', 'cat_1'])
)
# fit and transform on the same data
dfr = column_transformer.fit_transform(df)

# display & test
display(dfr)
# tr1 produces three new columns; tr2 keeps the names of its input columns
assert (dfr.columns == ['c1_times100','c2_times100', 'c2_times1000','cont2', 'cat_1']).all()
# the new columns are the fitted column sum times 100 (or 1000) plus the original values
assert (dfr['c1_times100'] == sum(df.cont2)*100+df.cont2).all()
assert (dfr['c2_times100'] == sum(df.cont4)*100+df.cont4).all()
assert (dfr['c2_times1000'] == sum(df.cont4)*1000+df.cont4).all()

fitting __base_column_transformer
fitting sum_times100_cc
fitting column_selector
applying column_selector transform
fitting sum_times100
fitting tr2_cc
fitting column_selector
applying column_selector transform
fitting tr2
applying __base_column_transformer transform
applying sum_times100_cc transform
applying column_selector transform
applying sum_times100 transform
applying tr2_cc transform
applying column_selector transform
applying tr2 transform
applying concat transform


Unnamed: 0,c1_times100,c2_times100,c2_times1000,cont2,cat_1
0,3505,13525,135025,10,2
1,3506,13526,135026,12,4
2,3507,13527,135027,14,6
3,3508,13528,135028,16,4
4,3509,13529,135029,18,2
