In [2]:
# hide
# default_exp blocks.blocks
from nbdev.showdoc import *
from block_types.utils.nbdev_utils import nbdev_setup, TestRunner

# NOTE(review): presumably prepares the notebook environment for nbdev -- confirm in block_types.utils.nbdev_utils.
nbdev_setup ()
# Test runner used by the test cells of this notebook, which call `tst.run (<test function>, tag='dummy')`.
tst = TestRunner (targets=['dummy'])

# Custom components

> Custom components, such as a split generator and a pandas-based evaluator.

In [15]:
#export
import sklearn
import numpy as np

from block_types.core.block_types import Component
from block_types.config import bt_defaults as dflt

In [4]:
#for tests
import numpy as np
import pandas as pd

import pytest 
from sklearn.model_selection import KFold

## SkSplitGenerator

In [5]:
#export
class SkSplitGenerator (Component):
    """Marks the rows of `X` as training / test using a scikit-learn style split generator.

    Parameters
    ----------
    split_generator : object
        Object exposing `split (X, y, groups, **kwargs)` that returns an iterator of 
        `(train, test)` positional index pairs (e.g. `sklearn.model_selection.KFold`).
    group_column : str or None
        Column of `X` passed as `groups` to `split_generator.split`; `None` passes no groups.
    label_column : str or None
        Column of `X` used as `y` when `y` is not given to `_fit_apply`.
    split_column : str or None
        If set, this column is written into `X` (in place) with the values 'training' / 'test'. 
        If `None`, the tuple `(X, (train, test))` is produced instead.
    use_splitter : bool
        If True, the result is additionally passed through a `Splitter` instance. `Splitter` is 
        not defined in this part of the module and must be available at construction time.

    Notes
    -----
    The constructor arguments are read later as `self.<name>` although they are not assigned 
    here. NOTE(review): presumably `Component.__init__` stores them -- confirm.
    """
    def __init__ (self, split_generator, group_column=None, label_column=None, split_column=None, 
                  use_splitter=False, **kwargs):
        super ().__init__ (**kwargs)
        self.splitter = Splitter () if use_splitter else None
        # iterator over (train, test) folds; created lazily by the first call to `_fit_apply`
        self.generator = None

    def _fit_apply (self, X, y=None, **kwargs):
        """Label `X` with the next fold produced by the split generator.

        The fold iterator is created on the first call, from that call's `X`, `y` and groups, 
        and is reused afterwards: each call consumes one fold. Once the folds are exhausted, 
        `next` raises `StopIteration`.

        Raises
        ------
        ValueError
            If `y` is not passed and `label_column` is not set.
        """
        if y is None:
            if self.label_column is None:
                raise ValueError ('either label_column must be set or y is passed')
            y = X[self.label_column]
        groups = X[self.group_column] if self.group_column is not None else None
        if self.generator is None:
            self.generator = self.split_generator.split (X, y, groups, **kwargs)
        train, test = next (self.generator)
        return self._create_split (X, train, test)

    def _apply (self, X, **kwargs):
        """Label every row of `X` as 'test' (the training index is empty)."""
        # explicit integer dtype: a bare `np.array ([])` is float64, which is not a valid 
        # positional index type
        train, test = np.array ([], dtype=int), np.arange (X.shape[0])
        return self._create_split (X, train, test)

    def _create_split (self, X, train, test):
        """Attach the split given by the positional indices `train` and `test` to `X`.

        With `split_column` set, the column is written into `X` in place: rows in `train` get 
        'training', rows in `test` get 'test' (test wins if an index is in both), and any other 
        row is left as `None`. Otherwise `(X, (train, test))` is built. In both cases the result 
        goes through `self.splitter` when `use_splitter` is True.
        """
        if self.split_column is not None:
            # Build the whole column first and assign it once. The previous chained form 
            # `X[col].iloc[idx] = value` writes to an intermediate Series, which triggers 
            # SettingWithCopyWarning and is not propagated to `X` under pandas Copy-on-Write.
            split_labels = np.full (X.shape[0], None, dtype=object)
            for positions, label in ((train, 'training'), (test, 'test')):
                positions = np.asarray (positions)
                # skip empty index arrays: nothing to label, and their dtype may be float
                if positions.size > 0:
                    split_labels[positions] = label
            X[self.split_column] = split_labels
        else:
            X = (X, (train, test))
        if self.use_splitter:
            X = self.splitter (X)
        return X

### Example

In [6]:
# exports tests.blocks.test_blocks
def test_sksplit_generator ():
    """Check the split labels produced by SkSplitGenerator over two `fit_apply` calls and one `apply` call."""
    df = pd.DataFrame ({'a': list (range (10)),
                        'b': list (range (10)),
                        'label': [0]*5 + [1]*5})
    df_original = df.copy ()

    def expected_frame (split_labels):
        # original columns plus the expected content of the 'split' column
        split_frame = pd.DataFrame ({'split': split_labels})
        return pd.concat ([df_original, split_frame], axis=1)

    generator = SkSplitGenerator (KFold (n_splits=5), 
                                  label_column='label', 
                                  split_column='split')

    # first call: the first two rows form the test fold
    first_result = generator.fit_apply (df)
    assert (expected_frame (['test']*2 + ['training']*8) == first_result).all ().all ()

    # second call on the same object: the test fold moves to the next two rows
    second_result = generator.fit_apply (df)
    assert (expected_frame (['training']*2 + ['test']*2 + ['training']*6) == second_result).all ().all ()

    # apply: every row is labelled 'test'
    applied = generator.apply (df)
    assert (expected_frame (['test']*10) == applied).all ().all ()

In [7]:
# Run the test defined above through the notebook's TestRunner, under the 'dummy' tag.
tst.run (test_sksplit_generator, tag='dummy')

running test_sksplit_generator


## Evaluator

In [21]:
#export
class PandasEvaluator (Component):
    """Computes evaluation metrics from the columns of a DataFrame and returns them as a dict.

    Parameters
    ----------
    classification_metrics : str, callable or list of them
        Metrics called as `metric (groundtruth, classification)`. Strings are looked up by 
        name in `sklearn.metrics`.
    regression_metrics : str, callable or list of them
        Metrics called as `metric (groundtruth, prediction)`. Strings are looked up by name 
        in `sklearn.metrics`.
    custom_metrics : list of callables
        Each is called as `metric (df, label_col=..., prediction_col=..., classification_col=...)` 
        and must return a dict, which is merged into the results.
    groundtruth_col, prediction_col, classification_col : str
        Names of the DataFrame columns holding the ground truth, the predictions and the 
        classification results.

    Notes
    -----
    The constructor arguments (including the resolved metric lists rebound in `__init__`) are 
    read later as `self.<name>` although they are not assigned here. NOTE(review): presumably 
    `Component.__init__` stores them -- confirm.
    The list defaults in the signature are only read, never modified.
    """
    def __init__ (self, classification_metrics='accuracy_score', regression_metrics=[], custom_metrics=[], 
                  groundtruth_col='label', prediction_col='pred', classification_col='classification', **kwargs):
        classification_metrics = self._get_metrics (classification_metrics)
        regression_metrics = self._get_metrics (regression_metrics)
        super().__init__ (**kwargs)

    def _get_metrics (self, metrics):
        """Return a new list of metric callables built from `metrics`.

        `metrics` may be a single name, a single callable, or an iterable mixing both. Names 
        are resolved with `getattr (sklearn.metrics, name)`, so an unknown name raises 
        `AttributeError`. The input is never modified.
        """
        # Imported here because a bare `import sklearn` does not guarantee that the `metrics` 
        # submodule has been loaded.
        import sklearn.metrics
        if isinstance (metrics, str) or callable (metrics):
            metrics = [metrics]
        return [getattr (sklearn.metrics, metric) if isinstance (metric, str) else metric
                for metric in metrics]

    def _apply (self, df, **kwargs):
        """Evaluate all configured metrics on `df`.

        Returns a dict keyed by `metric.__name__` for classification and regression metrics 
        (a regression metric overrides a classification metric with the same name), updated 
        with the dicts returned by the custom metrics.
        """
        dict_results = {metric.__name__: metric (df[self.groundtruth_col], df[self.classification_col]) 
                        for metric in self.classification_metrics}
        dict_results.update ({metric.__name__: metric (df[self.groundtruth_col], df[self.prediction_col]) 
                              for metric in self.regression_metrics})
        for metric in self.custom_metrics:
            dict_results.update (metric (df, label_col=self.groundtruth_col, prediction_col=self.prediction_col, 
                                         classification_col=self.classification_col))
        return dict_results

### Example

In [23]:
# exports tests.blocks.test_blocks
def test_pandas_evaluator ():
    """Check PandasEvaluator with its default metric and with a mix of classification and regression metrics."""
    # the 'classification' column differs from 'label' in exactly one of the ten rows
    columns = {'a': list (range (10)),
               'b': list (range (10)),
               'label': [0]*5 + [1]*5,
               'classification': [0]*4 + [1]*6}
    df = pd.DataFrame (columns)

    # default configuration: accuracy only
    default_result = PandasEvaluator () (df)
    assert default_result == {'accuracy_score': 0.9}

    # several metrics, with the regression metrics reading the 'classification' column
    evaluator = PandasEvaluator (classification_metrics=['accuracy_score', 'auc'], 
                                 regression_metrics=['mean_squared_error', 'max_error'],
                                 prediction_col='classification')
    expected = {'accuracy_score': 0.9, 'auc': 1.0, 'mean_squared_error': 0.1, 'max_error': 1}
    assert evaluator (df) == expected

In [24]:
# Run the test defined above through the notebook's TestRunner, under the 'dummy' tag.
tst.run (test_pandas_evaluator, tag='dummy')

running test_pandas_evaluator
