In [None]:
# hide
# default_exp blocks.blocks
from nbdev.showdoc import *
from dsblocks.utils.nbdev_utils import nbdev_setup, TestRunner

# Notebook bootstrap: `nbdev_setup` is imported from dsblocks.utils.nbdev_utils;
# what it configures is not visible from this notebook.
nbdev_setup ()
# `tst` is the runner the cells below use (through `tst.run`) to execute each test function.
tst = TestRunner (targets=[])

# Custom components

> Custom components: split generators, K-fold wrappers, and evaluators.

In [None]:
#export
import abc
import sklearn
import numpy as np

from dsblocks.core.components import Component
from dsblocks.config import bt_defaults as dflt

In [None]:
#for tests
import numpy as np
import pandas as pd

import pytest 
from sklearn.model_selection import KFold

## Splitter

In [None]:
#export
class Splitter (Component):
    """Break a DataFrame into one DataFrame per split, based on a split column.

    Parameters
    ----------
    training, validation, test : str
        Values of the split column that identify the rows of each split.
    split_col : str
        Name of the column holding the split value of each row.
    **kwargs
        Forwarded to `Component`.
    """
    def __init__ (self, training='train', validation='validation', test='test', 
                  split_col='split', **kwargs):
        # The arguments are not stored explicitly here, yet `_apply` reads them back
        # as attributes of the same name; presumably `Component.__init__` records
        # them -- TODO confirm against dsblocks.core.components.
        super().__init__ (**kwargs)
    
    def _apply (self, df):
        """Return a dict with the rows of each split, leaving out splits with no rows.

        The keys are always `'training'`, `'validation'` and `'test'` (in that order),
        regardless of the values used to mark those rows in the split column.
        """
        wanted = (('training', self.training),
                  ('validation', self.validation),
                  ('test', self.test))
        subsets = {}
        for key, value in wanted:
            rows = df[df[self.split_col]==value]
            if not rows.empty:
                subsets[key] = rows
        return subsets

### Example

In [None]:
# exports tests.blocks.test_blocks
def test_splitter ():
    """`Splitter` returns one sub-frame per split label, each holding the expected rows."""
    split_values = ['test', 'training', 'test', 'validation', 'test', 'training', 'validation',
                    'test', 'test', 'test']
    df = pd.DataFrame ({'a': list (range (10)),
                        'b': list (range (10)),
                        'split': split_values})
    dict_results = Splitter (training='training')(df)

    # Columns `a` and `b` both hold the row number, so the same list checks both.
    expected_rows = {'training': [1, 5],
                     'validation': [3, 6],
                     'test': [0, 2, 4, 7, 8, 9]}
    for split_name, rows in expected_rows.items ():
        subset = dict_results[split_name]
        assert (subset.columns == ['a','b','split']).all()
        assert (subset['split'] == split_name).all()
        assert (subset.a == rows).all()
        assert (subset.b == rows).all()

In [None]:
# Run test_splitter through the notebook's TestRunner instance (`tst`).
tst.run (test_splitter, tag='dummy')

## DoubleKFold

### DoubleKFoldBase

In [None]:
#export
class DoubleKFoldBase (metaclass=abc.ABCMeta):
    """Abstract base for splitters that yield (training, validation, test) index triplets.

    Parameters
    ----------
    cv : object
        Wrapped cross-validation object. It must provide `get_n_splits ()`, 
        which is called once, without arguments, at construction time.
    split_col : str
        Name of the column holding the split value of each row.
    label_col : str
        Name of the column holding the labels.
    group_col : str or None
        Name of the column holding the groups, if any.
    **kwargs
        Accepted and ignored.
    """
    def __init__ (self, cv, split_col='split', label_col='label', group_col=None, **kwargs):
        self.cv = cv
        self.n_splits = cv.get_n_splits ()
        self.split_col, self.label_col, self.group_col = split_col, label_col, group_col

    def get_n_splits (self):
        """Return the number of splits reported by `cv` when this object was built."""
        return self.n_splits

    @abc.abstractmethod
    def split (self, df, y=None, groups=None):
        """Generate the splits of `df`; subclasses must implement this."""
        return None

### SingleKFold

In [None]:
#export
class SingleKFold (DoubleKFoldBase):
    """Wrap a cross-validation object so that it yields (training, validation, test) triplets.

    The training and validation indices are the ones produced by `cv.split`; 
    the test indices are always an empty array.

    Parameters
    ----------
    cv : object
        Cross-validation object providing `get_n_splits ()` and 
        `split (X, y, groups=...)`.
    **kwargs
        Forwarded to `DoubleKFoldBase` (`split_col`, `label_col`, `group_col`).
    """
    def __init__ (self, cv, **kwargs):
        super().__init__ (cv, **kwargs)

    def split (self, df, y=None, groups=None):
        """Yield up to `n_splits` tuples `(training, validation, test)` of positions in `df`.

        Parameters
        ----------
        df : DataFrame
            Data to split.
        y : array-like or None
            Labels. When None, they are read from `df[label_col]`, unless 
            `label_col` is None, in which case no labels are passed to `cv`.
        groups : array-like or None
            Groups. When None, they are read from `df[group_col]`, unless
            `group_col` is None, in which case no groups are passed to `cv`.
        """
        if groups is None and self.group_col is not None:
            groups = df[self.group_col]
        # `label_col` may be None (it is the default used by SkSplitGenerator); 
        # indexing `df[None]` would raise a KeyError, so `y` is left as None then.
        if y is None and self.label_col is not None:
            y = df[self.label_col]
        self.generator = self.cv.split (df, y, groups=groups)
        # `range` goes first in `zip` so that no more than `n_splits` folds are pulled
        # from the generator. If `cv` yields fewer folds, iteration ends cleanly 
        # instead of leaking a StopIteration out of this generator (a RuntimeError 
        # under PEP 479).
        for _, (training, validation) in zip (range (self.n_splits), self.generator):
            # Integer dtype, so that the empty test set can be used as an indexer.
            yield training, validation, np.array ([], dtype=int)

#### Example / test

In [None]:
# exports tests.blocks.test_blocks
def test_single_kfold ():
    """`SingleKFold` reproduces the folds of the wrapped KFold and an empty test set."""
    df = pd.DataFrame ({'a': list (range (10)),
                        'b': list (range (10)),
                        'label': [0]*5 + [1]*5})

    folds = SingleKFold (KFold (5)).split (df)

    # Each validation fold is a pair of consecutive rows; the other rows are the training set.
    expected_validation = ([0, 1], [2, 3], [4, 5], [6, 7], [8, 9])
    for validation_rows in expected_validation:
        training_rows = [row for row in range (10) if row not in validation_rows]
        training, validation, test = next (folds)
        assert all (training == training_rows)
        assert all (validation == validation_rows)
        assert all (test == np.array ([]))

In [None]:
# Run test_single_kfold through the notebook's TestRunner instance (`tst`).
tst.run (test_single_kfold, tag='dummy')

### FixedDoubleKFold

In [None]:
#export
class FixedDoubleKFold (DoubleKFoldBase):
    """K-fold over the non-test rows of a DataFrame, with a fixed test set.

    Rows whose split column equals `input_test_label` form the test set of every 
    fold. The remaining rows are divided into training and validation by `cv`.

    Parameters
    ----------
    cv : object
        Cross-validation object providing `get_n_splits ()` and 
        `split (X, y, groups=...)`.
    input_test_label : str
        Value of the split column that marks the test rows.
    **kwargs
        Forwarded to `DoubleKFoldBase` (`split_col`, `label_col`, `group_col`).
    """
    def __init__ (self, cv, input_test_label='test', **kwargs):
        super().__init__ (cv, **kwargs)
        self.input_test_label = input_test_label

    @staticmethod
    def _take (values, positions):
        """Select `positions` (integer positions) from a Series, array or sequence."""
        if hasattr (values, 'iloc'):
            # Positional selection: plain `values[positions]` on a pandas object 
            # would look the integers up as index labels.
            return values.iloc[positions]
        return np.asarray (values)[positions]

    def split (self, df, y=None, groups=None):
        """Yield up to `n_splits` tuples `(training, validation, test)` of positions in `df`.

        Parameters
        ----------
        df : DataFrame
            Data to split; it must contain the column `split_col`.
        y : array-like or None
            Labels for all the rows of `df`. When None, they are read from 
            `df[label_col]`, unless `label_col` is None, in which case no labels 
            are passed to `cv`.
        groups : array-like or None
            Groups for all the rows of `df`. When None, they are read from 
            `df[group_col]`, unless `group_col` is None, in which case no groups 
            are passed to `cv`.
        """
        split_values = df[self.split_col]
        test = np.where (split_values == self.input_test_label)[0]
        training_cond = split_values != self.input_test_label
        training = np.where (training_cond)[0]

        # Only the non-test rows are handed to `cv`, so labels and groups are 
        # restricted to those rows as well.
        if groups is not None:
            groups = self._take (groups, training)
        elif self.group_col is not None:
            groups = df.loc[training_cond, self.group_col]
        if y is not None:
            y = self._take (y, training)
        elif self.label_col is not None:
            # `label_col` may be None (the default used by SkSplitGenerator).
            y = df.loc[training_cond, self.label_col]

        self.generator = self.cv.split (df[training_cond], y, groups=groups)

        # `range` goes first in `zip` so that no more than `n_splits` folds are pulled
        # from the generator. If `cv` yields fewer folds, iteration ends cleanly.
        for _, (fold_training, fold_validation) in zip (range (self.n_splits), self.generator):
            # `cv` returns positions relative to the non-test rows; map them back 
            # to positions in `df`.
            yield training[fold_training], training[fold_validation], test

#### Example / test

In [None]:
# exports tests.blocks.test_blocks
def test_fixed_double_kfold ():
    """`FixedDoubleKFold` keeps the test rows fixed and folds only the remaining rows."""
    df = pd.DataFrame ({'a': list (range (20)),
                        'b': list (range (20)),
                        'split': ['training', 'test']*10,
                        'label': ([0]*5 + [1]*5)*2})

    folds = FixedDoubleKFold (KFold (5)).split (df)

    # Odd rows are marked as test; even rows are folded, two consecutive ones 
    # per validation fold.
    test_rows = list (range (1, 20, 2))
    foldable_rows = list (range (0, 20, 2))
    for start in range (0, len (foldable_rows), 2):
        validation_rows = foldable_rows[start:start+2]
        training_rows = foldable_rows[:start] + foldable_rows[start+2:]
        training, validation, test = next (folds)
        assert all (training == training_rows)
        assert all (validation == validation_rows)
        assert all (test == test_rows)

In [None]:
# Run test_fixed_double_kfold through the notebook's TestRunner instance (`tst`).
tst.run (test_fixed_double_kfold, tag='dummy')

## SkSplitGenerator

In [None]:
#export
class SkSplitGenerator (Component):
    """Component that marks the rows of a DataFrame with the split they belong to.

    Each call to `_fit_apply` consumes the next fold of the wrapped split 
    generator; `_apply` marks every row as test.

    Parameters
    ----------
    split_generator : object
        Cross-validation object providing `get_n_splits ()` and 
        `split (X, y, groups=...)`, e.g. a scikit-learn `KFold`.
    group_col, label_col : str or None
        Columns holding the groups and labels passed to `split_generator`.
    split_col : str or None
        Column where the split of each row is written. When None, no column is 
        written and the indices are returned together with the data instead.
    use_splitter : bool
        If True, the result is further divided into one DataFrame per split 
        with `Splitter`. This requires `split_col` to be set.
    training_label, validation_label, test_label : str
        Values written in `split_col` for each split. With 
        `type_split='single'` the validation fold is written with `test_label`.
    type_split : {'single', 'fixed'}
        'single' wraps `split_generator` with `SingleKFold`, 'fixed' with 
        `FixedDoubleKFold`.
    input_test_label : str
        Only for 'fixed': value of `split_col` that marks the test rows in the input.
    **kwargs
        Forwarded to `Component`.

    Raises
    ------
    NotImplementedError
        If `type_split` is neither 'single' nor 'fixed'.
    """
    def __init__ (self, split_generator, group_col=None, label_col=None, split_col=None, 
                  use_splitter=False, training_label='training', validation_label='validation', 
                  test_label='test', type_split='single', input_test_label='test', **kwargs):
        # The methods below read several constructor arguments back as attributes
        # (`self.split_col`, `self.training_label`, ...) that are never assigned here;
        # presumably `Component.__init__` records them -- TODO confirm.
        super ().__init__ (**kwargs)
        if use_splitter:
            # The splitter must look for the labels and the column that this 
            # component writes. The defaults of `Splitter` differ (`training='train'`, 
            # `split_col='split'`), which would drop the training rows.
            splitter_config = dict (training=training_label, validation=validation_label, 
                                    test=test_label)
            if split_col is not None:
                splitter_config['split_col'] = split_col
            self.splitter = Splitter (**splitter_config)
        else:
            self.splitter = None
        self.generator = None
        if type_split == 'single':
            self.split_generator = SingleKFold (split_generator, split_col=split_col, label_col=label_col, 
                                                group_col=group_col)
            # With a single K-fold there is no separate test set: the held-out 
            # fold is marked with the test label.
            self.validation_label = self.test_label
        elif type_split == 'fixed':
            self.split_generator = FixedDoubleKFold (split_generator, input_test_label=input_test_label, 
                                                     split_col=split_col, label_col=label_col, 
                                                     group_col=group_col)
        else:
            raise NotImplementedError (f'type_split {type_split} not recognized')
            
    def _fit_apply (self, X, y=None, **kwargs):
        """Mark `X` with the next fold; raises StopIteration once the folds are exhausted.

        The generator is created on the first call (or the first one after 
        `reset`), from the `X`, `y` and `kwargs` given in that call.
        """
        if self.generator is None: 
            self.generator = self.split_generator.split (X, y=y, **kwargs)
        training, validation, test = next (self.generator)
        return self._create_split (X, training, validation, test)
    
    def _apply (self, X, **kwargs):
        """Mark all the rows of `X` as test."""
        no_rows = np.array ([], dtype=int)
        return self._create_split (X, no_rows, no_rows, np.arange (X.shape[0]))
    
    def _create_split (self, X, training, validation, test):
        """Write the split of each row in `X[split_col]`, or return the indices.

        `X` is modified in place when `split_col` is set. Otherwise the result 
        is the tuple `(X, (training, validation, test))`.
        """
        if self.split_col is not None:
            # The column is built as a whole and assigned once. Chained assignment 
            # (`X[col].iloc[idx] = value`) does not modify `X` when pandas 
            # copy-on-write is enabled.
            labels = np.full (X.shape[0], None, dtype=object)
            for positions, label in ((training, self.training_label), 
                                     (validation, self.validation_label), 
                                     (test, self.test_label)):
                # Empty index arrays may arrive with float dtype; cast for indexing.
                labels[np.asarray (positions, dtype=int)] = label
            X[self.split_col] = labels
        else:
            X = (X, (training, validation, test))
            
        if self.use_splitter:
            X = self.splitter (X)
        return X
    
    def reset (self):
        """Forget the current generator, so that the next `_fit_apply` starts from the first fold."""
        self.generator = None

### Example

In [None]:
# exports tests.blocks.test_blocks
def test_sksplit_generator ():
    """Successive `fit_apply` calls mark successive folds; `apply` marks every row as test."""
    df = pd.DataFrame ({'a': list (range (10)),
                        'b': list (range (10)),
                        'label': [0]*5 + [1]*5})
    df_original = df.copy()
    generator = SkSplitGenerator (KFold (n_splits=5), 
                                  label_col='label', 
                                  split_col='split')

    def with_split (split_values):
        # Expected output: the original columns plus the given split column.
        return pd.concat ([df_original, pd.DataFrame ({'split': split_values})], axis=1)

    # First fold: rows 0-1 are held out.
    reference = with_split (['test']*2 + ['training']*8)
    dfr = generator.fit_apply (df)
    assert (reference==dfr).all().all()
    
    # Second fold: rows 2-3 are held out.
    dfr = generator.fit_apply (df)
    reference = with_split (['training']*2 + ['test']*2 + ['training']*6)
    assert (reference==dfr).all().all()

    # Outside fitting, all the rows are test.
    dfr = generator.apply (df)
    reference = with_split (['test']*10)
    assert (reference==dfr).all().all()

In [None]:
# Run test_sksplit_generator through the notebook's TestRunner instance (`tst`).
tst.run (test_sksplit_generator, tag='dummy')

### Resetting generator

In [None]:
# exports tests.blocks.test_blocks
def test_sksplit_generator_reset ():
    """After the folds are exhausted, `reset` makes all of them available again."""
    df = pd.DataFrame ({'a': list (range (9)),
                        'b': list (range (9)),
                        'label': [0]*5 + [1]*4})
    generator = SkSplitGenerator (KFold (n_splits=5), 
                                  label_col='label', 
                                  split_col='split')

    def consume_all_folds ():
        # Five folds are available; asking for a sixth one must fail.
        for _ in range (5):
            generator.fit_apply (df)
        with pytest.raises (StopIteration):
            generator.fit_apply (df)

    consume_all_folds ()
    generator.reset ()
    consume_all_folds ()

In [None]:
# Run test_sksplit_generator_reset through the notebook's TestRunner instance (`tst`).
tst.run (test_sksplit_generator_reset, tag='dummy')

running test_sksplit_generator_reset


## Evaluator

### Evaluator

In [None]:
#export
class Evaluator (Component, metaclass=abc.ABCMeta):
    """Abstract component that evaluates predictions with a set of metrics.

    Parameters
    ----------
    classification_metrics, regression_metrics : str, callable or iterable of them
        Metrics to compute. A string is taken as the name of a function in 
        `sklearn.metrics`; anything else is used as given.
    custom_metrics : list
        Additional metrics; how they are called is defined by the subclasses.
    **kwargs
        Forwarded to `Component`.
    """
    def __init__ (self, classification_metrics='accuracy_score', regression_metrics=[], custom_metrics=[], 
                  **kwargs):
        # The resolved metrics are bound to the parameter names *before* calling the 
        # parent constructor, and are never assigned to `self` here. Subclasses read 
        # them as `self.classification_metrics` etc., so `Component.__init__` 
        # presumably records these names -- TODO confirm. Keep this order, and do 
        # not introduce other local variables before the `super` call.
        classification_metrics = self._get_metrics (classification_metrics)
        regression_metrics = self._get_metrics (regression_metrics)
        super().__init__ (**kwargs)
        self.apply_to_separate_splits = True
    
    def _get_metrics (self, metrics):
        """Return a new list with each metric name replaced by the `sklearn.metrics` function.

        `metrics` may be None (no metrics), a single name or callable, or an 
        iterable of names and callables. The argument itself is never modified.

        Raises
        ------
        AttributeError
            If a name is not found in `sklearn.metrics`.
        """
        # A bare `import sklearn` does not guarantee that the `metrics` submodule 
        # is loaded, so it is imported explicitly.
        import sklearn.metrics
        if metrics is None:
            return []
        if isinstance (metrics, str) or callable (metrics):
            metrics = [metrics]
        # Building a new list leaves the list passed by the caller untouched, 
        # and makes tuples acceptable too.
        return [getattr (sklearn.metrics, metric) if isinstance (metric, str) else metric 
                for metric in metrics]
    
    @abc.abstractmethod
    def _apply (self, df, **kwargs):
        """Evaluate the predictions in `df`; subclasses must implement this."""
        pass

### PandasEvaluator

In [None]:
#export
class PandasEvaluator (Evaluator):
    """Evaluator for predictions stored in the columns of a DataFrame.

    Parameters
    ----------
    groundtruth_col : str
        Column with the ground-truth values.
    prediction_col : str
        Column compared against the ground truth by the regression metrics.
    classification_col : str
        Column compared against the ground truth by the classification metrics.
    **kwargs
        Forwarded to `Evaluator`.
    """
    def __init__ (self, groundtruth_col='label', prediction_col='pred', classification_col='classification', 
                  **kwargs):
        # The column names are read back as attributes in `_apply`; they are 
        # presumably recorded by the parent constructor -- TODO confirm.
        super().__init__ (**kwargs)

    def _apply (self, df, **kwargs):
        """Return a dict with the value of each metric, keyed by the metric's `__name__`.

        Custom metrics are called with `df` and the three column names as 
        keyword arguments, and the dict each one returns is merged into the result.
        """
        scores = {}
        for metric in self.classification_metrics:
            name = metric.__name__
            scores[name] = metric (df[self.groundtruth_col], df[self.classification_col])

        regression_scores = {}
        for metric in self.regression_metrics:
            name = metric.__name__
            regression_scores[name] = metric (df[self.groundtruth_col], df[self.prediction_col])
        scores.update (regression_scores)

        column_names = dict (label_col=self.groundtruth_col, 
                             prediction_col=self.prediction_col, 
                             classification_col=self.classification_col)
        for metric in self.custom_metrics:
            scores.update (metric (df, **column_names))
        return scores

### Example

In [None]:
# exports tests.blocks.test_blocks
def test_pandas_evaluator ():
    """`PandasEvaluator` computes the requested metrics from the columns of a DataFrame."""
    df = pd.DataFrame ({'a': list (range (10)),
                        'b': list (range (10)),
                        'label': [0]*5 + [1]*5,
                        'classification': [0]*4 + [1]*6})

    # Default configuration: accuracy only. One row out of ten is misclassified.
    default_scores = PandasEvaluator ()(df)
    assert default_scores == {'accuracy_score': 0.9}

    # Several metrics by name; the regression metrics also use the classification column.
    evaluator = PandasEvaluator (classification_metrics=['accuracy_score', 'auc'], 
                                 regression_metrics=['mean_squared_error', 'max_error'],
                                 prediction_col='classification')
    expected_scores = dict (accuracy_score=0.9, auc=1.0, mean_squared_error=0.1, max_error=1)
    assert evaluator (df) == expected_scores

In [None]:
# Run test_pandas_evaluator through the notebook's TestRunner instance (`tst`).
tst.run (test_pandas_evaluator, tag='dummy')