In [1]:
# default_exp calculators

In [2]:
#export
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
import numexpr as ne



This is a slightly different approach to try. What about making a set of "calculators". It

In the abstract, an `AbstractCalculator` contains a set of expected fields and operations for calculating resulting values.
With these it can manage creating descriptions of how the fields are manipulated through a set of `AbstractOperation`s.
It can handle processing the relavant values from dataframes.
Then, it can manage the visualization of both an interactive calculator and cohort visualization.

In [3]:
from nbdev.showdoc import *


%load_ext autoreload
%autoreload 2

In [66]:
#export
class AbstractCalculator(object):
    fields = []
    operations = []
    inferred_cols = []

    def __init__(self, name, operations):
        """

        Parameters
        ----------
        fields : list[str]
        transforms : list[AbstractOperation]
        """

        self.name = name
        self.operations = operations

        fields = sum((op.fields for op in operations), start = [])
        fields = set(fields)

        inferred = sum((op.result_fields for op in operations), start = [])
        inferred = set(inferred)

        self.fields = sorted(fields-inferred)
        self.inferred_cols = sorted(inferred)

    def to_series(self, row):

        series = pd.Series(dict((field, row.get(field)) for field in self.fields))
        return series
    
    def explain(self, row):

        ins = [f'{f}:{row[f]}' for f in self.fields]
        print('Taking:', ', '.join(ins))

        res = self.process_single(row, explain=True)
        outs = [f'{f}:{res[f]}' for f in self.inferred_cols]
        print('Resulting in:', ', '.join(outs))

    def __add__(self, other):

        return AbstractCalculator(self.name, self.operations+other.operations)

    def process_single(self, row, explain=False):
        """

        Parameters
        ----------
        row : pd.Series,dict
        explain : bool

        Returns
        -------

        """

        data = self.to_series(row)
        if self.operations:
            for operation in self.operations:
                for field, val in operation(data):
                    data[field] = val
                if explain:
                    print(operation.explain(data))

            #print(data)
        return pd.Series(data)


    def process_dataframe(self, df, mapping = None):
        """

        Parameters
        ----------
        df : pd.DataFrame
        mapping : dict
        Returns
        -------
        pd.DataFrame

        """

        if mapping is not None:
            clean_data = df.rename(columns=mapping)
        else:
            clean_data = df

        #print(clean_data[self.fields])

        res = clean_data.apply(self.process_single, axis=1)
        return res


In [113]:
#export
class AbstractOperation(object):

    fields = []
    result_fields = []

    @staticmethod
    def from_config(config):
        op_classes = [EquationOperation,
                      AggregationOperation,
                      ClipOperation,
                      NormativeLookup,
                      BinnedScalingOperator,
                      RegressionNormOperator]

        for op_class in op_classes:
            op = op_class.from_config(config)
            if op is not None:
                return op
        raise NotImplementedError(f'Did not understand type: {config["type"]}')
        #return None

    def process_single(self, row):
        raise NotImplementedError

    def explain(self, row):
        raise NotImplementedError


    def to_series(self, row):

        series = pd.Series(dict((field, row.get(field)) for field in self.fields))
        return series

    def __call__(self, row):

        res = self.process_single(row)
        yield self.result_fields[0], res

Again, we'll uss the BVMT test as the example.
But we're going to back up a step. Since there are a bunch of different intermediate values, I want to calculate those using operations.
We'll also want these easily defined in yaml, so, I'm using the `numexpr` library to allow easy filtering.

In [114]:
#export

class EquationOperation(AbstractOperation):

    def __init__(self, out_field, equation, fields):

        self.fields = fields
        self.equation = equation
        self.result_fields = [out_field]

    @staticmethod
    def from_config(config):
        if config['type'] == 'equation':
            return EquationOperation(config['out_field'],
                                     config['equation'],
                                     config['fields'])
        return None

    def explain(self, row):
        res = self.process_single(row)
        return f'Used Equation: {self.equation} = {res} = {self.result_fields[0]}'

    def process_single(self, row):

        data = self.to_series(row)
        #print(data)
        if data.notnull().all():
            res = ne.evaluate(self.equation, local_dict=data)
        else:
            res = np.nan
        return res

In [115]:

row = {'trial1': 5, 'trial2': 6, 'trial3': 7,
       'delay': 7, 'hits': 6, 'false_pos': 2,
       'copy': 12}

total_op = EquationOperation('immediate', 'trial1+trial2+trial3', row.keys())
immed = total_op.process_single(row)

assert immed == 18
print(total_op.explain(row))

Used Equation: trial1+trial2+trial3 = 18 = immediate


Because of numexpr limitations if we want to do reductions, we'll need another type.

In [116]:
class AggregationOperation(AbstractOperation):

    def __init__(self, out_field, aggregation, fields):

        self.fields = fields
        self.aggregation = aggregation
        self.result_fields = [out_field]

    @staticmethod
    def from_config(config):
        if config['type'] == 'agg':
            return AggregationOperation(config['out_field'],
                                     config['method'],
                                     config['fields'])
        return None

    def explain(self, row):
        res = self.process_single(row)
        return f'Aggregation: {self.aggregation} [{", ".join(self.fields)}]  = {res}'

    def process_single(self, row):

        data = self.to_series(row)
        return data.agg(self.aggregation)


class ClipOperation(AbstractOperation):
    def __init__(self, field, lower = 0, upper=1):

        self.fields = [field]
        self.lower = lower
        self.upper = upper
        self.result_fields = [field]

    @staticmethod
    def from_config(config):
        if config['type'] == 'clip':
            return ClipOperation(config['field'],
                                 lower = config['lower'],
                                 upper = config['upper'])
        return None

    def explain(self, row):
        return f'Clipped {self.fields[0]} to [{self.lower}, {self.upper}]'

    def process_single(self, row):

        data = self.to_series(row)
        clipped = data.clip(lower=self.lower, upper=self.upper)
        return clipped[self.result_fields[0]]


In [117]:
class TestCalculcator(AbstractCalculator):

    def __init__(self, name, operations):
        super().__init__(name, operations)

    @staticmethod
    def from_config(config):
        name = config['short_name']
        ops = [AbstractOperation.from_config(c) for c in config['operations']]
        return TestCalculcator(name, ops)



In [118]:
import yaml

bvmt_calc = TestCalculcator.from_config(yaml.full_load(open('data/test_calculators/BVMT.yaml')))
bvmt_calc.explain(row)

NameError: name 'RegressionNormOperator' is not defined

In [None]:
tdf = pd.DataFrame([row]*5)
bvmt_calc.process_dataframe(tdf)

In [None]:
all_neuro = pd.read_excel('data/neuro_data.xlsx',
                          na_values=['na', '-', 'nd']).groupby(['VisitDate', 'PatientID']).first()
all_neuro.head()

In [119]:
bvmt_calc.process_dataframe(all_neuro.dropna(subset = ['BVMTtrial1', 'BVMTdelay']).head(),
                            mapping = {'BVMTtrial1': 'trial1',
                                       'BVMTtrial2': 'trial2',
                                       'BVMTtrial3': 'trial3',
                                       'BVMTdelay': 'delay',
                                       'BVMThits': 'hits',
                                       'BVMTfalsepos': 'false_pos'})

Unnamed: 0_level_0,Unnamed: 1_level_0,delay,false_pos,hits,trial1,trial2,trial3,immediate,retention_denom,retention,recognition
VisitDate,PatientID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013-10-03,A0165,6.0,0.0,5.0,6.0,6.0,7.0,19.0,6.0,1.0,5.0
2013-10-08,A0127,6.0,0.0,6.0,5.0,6.0,9.0,20.0,6.0,1.0,6.0
2013-10-08,A0164,7.0,2.0,6.0,5.0,6.0,7.0,18.0,6.0,1.0,4.0
2013-10-10,A0421,7.0,0.0,6.0,6.0,8.0,9.0,23.0,8.0,0.875,6.0
2013-10-15,A0067,4.0,1.0,4.0,2.0,4.0,4.0,10.0,4.0,1.0,3.0


Awesome, a generic method for aggregating tests.
Can we then follow this with another normalization scheme.

In [120]:
#export

class NormativeLookup(AbstractOperation):

    def __init__(self, lookup_table, filter_cols, measure_col, out_name):

        self.lookup_table = lookup_table
        self.filter_cols = filter_cols
        self.fields = filter_cols + [measure_col]
        self.result_fields = [out_name]
        self.measure_col = measure_col

    @staticmethod
    def from_config(config):

        if config['type'] == 'normative_lookup':

            return NormativeLookup(config['table'],
                                   config['filter_cols'],
                                   config['measure_col'],
                                   config['out_name'])
        return None

    def lookup_norm(self, row):

        data = self.to_series(row)
        for filt in self.lookup_table:
            if ne.evaluate(filt['filter'], local_dict=data):
                return filt['filter'], filt['mean'], filt['std']

        return None, None, None

    def explain(self, row):

        flt, mean, std = self.lookup_norm(row)

        if flt is None:
            data = self.to_series(row)
            return f'Could not find matching filter for {data[self.filter_cols]}'
        else:
            return f'Matched {flt}, Expecting {mean} +- {std}'

    def process_single(self, row):

        data = self.to_series(row)
        _, mean, std = self.lookup_norm(data)
        if mean is None:
            return np.nan
        return (data[self.measure_col] - mean)/std


In [121]:
path = 'data/norms/from_kate/BVMTR/description.yaml'

heaton_bvmt_calc = TestCalculcator.from_config(yaml.full_load(open(path)))

heaton_bvmt_calc.process_single({'Age': 22,
                                 'immediate': 30,
                                 'retention': 0.91,
                                 'delay': 11,
                                 'recognition': 2})

NameError: name 'RegressionNormOperator' is not defined

In [122]:
full_calc = bvmt_calc + heaton_bvmt_calc
full_calc.process_dataframe(all_neuro.dropna(subset = ['BVMTtrial1', 'BVMTdelay']).head(),
                            mapping = {'BVMTtrial1': 'trial1',
                                       'BVMTtrial2': 'trial2',
                                       'BVMTtrial3': 'trial3',
                                       'BVMTdelay': 'delay',
                                       'BVMThits': 'hits',
                                       'BVMTfalsepos': 'false_pos'})


Unnamed: 0_level_0,Unnamed: 1_level_0,Age,delay,false_pos,hits,trial1,trial2,trial3,immediate,retention_denom,retention,recognition,heaton_immediate,heaton_retention,heaton_delay,heaton_recognition
VisitDate,PatientID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2013-10-03,A0165,62.0,6.0,0.0,5.0,6.0,6.0,7.0,19.0,6.0,1.0,5.0,-0.679035,0.724138,-1.215962,-1.954545
2013-10-08,A0127,62.0,6.0,0.0,6.0,5.0,6.0,9.0,20.0,6.0,1.0,6.0,-0.493506,0.724138,-1.215962,0.318182
2013-10-08,A0164,56.0,7.0,2.0,6.0,5.0,6.0,7.0,18.0,6.0,1.0,4.0,-0.998106,0.711744,-0.882927,-2.948276
2013-10-10,A0421,63.0,7.0,0.0,6.0,6.0,8.0,9.0,23.0,8.0,0.875,6.0,0.172727,-0.172414,-0.622727,0.166667
2013-10-15,A0067,47.0,4.0,1.0,4.0,2.0,4.0,4.0,10.0,4.0,1.0,3.0,-2.856016,0.776084,-2.763158,-8.142857


In [123]:
#export

class BinnedScalingOperator(AbstractOperation):
    def __init__(self, bins, measure_col):

        self.fields = [measure_col]
        self.result_fields = [measure_col+'_scaled']
        self.bins = sorted(bins, key = lambda x: x['min'],
                           reverse=True)

    @staticmethod
    def from_config(config):

        if config['type'] == 'binned_scaling':
            return BinnedScalingOperator(config['bins'],
                                         config['measure_col'])
        return None

    def lookup_bin(self, row):

        data = self.to_series(row)
        val = data[self.fields[0]]
        if val == val:
            for bin in self.bins:
                if val >= bin['min']:
                    return bin['min'], bin['scaled']
            return np.nan, np.nan
        else:
            return np.nan, np.nan

    def explain(self, row):

        edge, scaled = self.lookup_bin(row)

        if edge != edge:
            data = self.to_series(row)
            return f'Could not find matching bin for {data[self.fields[0]]}'
        else:
            return f'{self.fields[0]} matched {edge}, scaled to {scaled}'

    def process_single(self, row):
        _, res = self.lookup_bin(row)
        return res

In [177]:
path = 'data/norms/norman/norman_scaling.yaml'
calc = TestCalculcator.from_config(yaml.full_load(open(path)))

norman_scaled_calc = bvmt_calc + calc
norman_scaled_calc.explain(row)


Taking: delay:7, false_pos:2, hits:6, trial1:5, trial2:6, trial3:7
Used Equation: trial1+trial2+trial3 = 18 = immediate
Aggregation: max [trial1, trial2]  = 6
Used Equation: delay/retention_denom = 1.1666666666666667 = retention
Clipped retention to [0, 1]
Used Equation: hits-false_pos = 4 = recognition
delay matched 7, scaled to 6
immediate matched 16, scaled to 6
Resulting in: delay_scaled:6, immediate:18, immediate_scaled:6, recognition:4, retention:1, retention_denom:6


In [182]:
#export

class RegressionNormOperator(AbstractOperation):

    def __init__(self, regressions, fields, out_field, result_type = 'zscale'):

        self.regressions = regressions
        self.fields = fields
        self.result_fields = [out_field]
        self.result_type = result_type

    @staticmethod
    def from_config(config):
        if config['type'] == 'regression_norm':
            return RegressionNormOperator(config['regressions'],
                                          config['fields'],
                                          config['out_field'],
                                          result_type = config['result_type'])

        return None

    def search_filters(self, row):

        data = self.to_series(row)
        check_func = lambda reg: pd.eval(reg['filter'], local_dict=data.to_dict())
        return [reg for reg in self.regressions if check_func(reg)]

    def scale_data(self, row):

        data = self.to_series(row)
        hits = self.search_filters(row)
        if hits: #Currently only implementing "first"
            reg = hits[0]
            val = pd.eval(reg['norm'], local_dict=data.to_dict())
            return reg, val
        return None, None


    def explain(self, row):

        data = self.to_series(row)
        reg, val = self.scale_data(row)

        if reg is None:
            return 'Could not find a match for regression normalization.'
        else:
            return f'Matched {reg["filter"]}, applied {reg["norm"]} = {float(val)}'


    def process_single(self, row):

        _, val = self.scale_data(row)
        return val



In [184]:
reg_calc = TestCalculcator.from_config(yaml.full_load(open('data/norms/norman/norman_bvmt_regnorm.yaml')))

full_norman = norman_scaled_calc + reg_calc
demo_info = {'age': 54,
             'education': 6,
             'race_cat': 1,
             'norman_gender': 1}

full_norman.explain({**row, **demo_info})

#reg_calc.operations[0].regressions[1]['filter']

Taking: age:54, delay:7, education:6, false_pos:2, hits:6, norman_gender:1, race_cat:1, trial1:5, trial2:6, trial3:7
Used Equation: trial1+trial2+trial3 = 18 = immediate
Aggregation: max [trial1, trial2]  = 6
Used Equation: delay/retention_denom = 1.1666666666666667 = retention
Clipped retention to [0, 1]
Used Equation: hits-false_pos = 4 = recognition
delay matched 7, scaled to 6
immediate matched 16, scaled to 6
Matched (race_cat == 1) & ((age >= 18) & (age <= 66)), applied ((immediate_scaled-(0.2834*(education-13.86)+(-0.1125)*(age-40.63)+1.0394*norman_gender + 8.0679))/2.5701)*10 + 50 = 52.42927901638068
Matched (race_cat == 1) & ((age >= 18) & (age <= 66)), applied ((delay_scaled-(0.2267*(education-13.86) + (-0.1262)*(age-40.63) + 0.8593*norman_gender + 7.691))/2.5197)*10 + 50 = 53.64668809778942
Resulting in: delay_scaled:6, immediate:18, immediate_scaled:6, norman_delay:53.64668809778942, norman_immediate:52.42927901638068, recognition:4, retention:1, retention_denom:6


Unnamed: 0,Age__gte,Age__lte,BVMTimmed__mean,BVMTimmed__std
0,18,21,28.74,4.32
1,20,23,28.44,4.38
2,24,25,27.83,4.48
3,26,29,27.52,4.54
4,30,33,26.92,4.64
