In [1]:
# default_exp calculators

In [2]:
#hide
#export
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
import numexpr as ne

from autoneuro import operators


# Calculators

This is a slightly different approach to try: what about making a set of "calculators"?

In the abstract, an `AbstractCalculator` contains a set of expected fields and operations for calculating resulting values.
With these it can manage creating descriptions of how the fields are manipulated through a set of `AbstractOperation`s.
It can handle processing the relevant values from dataframes.
Then, it can manage the visualization of both an interactive calculator and cohort visualization.

In [3]:
from nbdev.showdoc import *

%load_ext autoreload
%autoreload 2

In [4]:
#export
class AbstractCalculator(object):
    """
    Apply an ordered list of operations to a record and collect the results.

    Each operation is expected to expose ``fields`` (names it reads),
    ``result_fields`` (names it produces), to be callable on the working
    record yielding ``(field, value)`` pairs, and to provide ``explain(data)``.
    """

    # Class-level defaults; every instance replaces them in ``__init__``.
    fields = []
    operations = []
    inferred_cols = []

    def __init__(self, name, operations):
        """
        Build a calculator and work out which fields must be supplied.

        Parameters
        ----------
        name : str
            Label for this calculator.
        operations : list[AbstractOperation]
            Operations to apply, in order. Any iterable is accepted.
        """

        self.name = name
        # Materialise once: the operations are scanned twice below and
        # iterated again on every call to ``process_single``.
        self.operations = list(operations)

        # Set comprehensions accept any iterable of names (list, tuple, ...)
        # and avoid the repeated list concatenation of ``sum(..., start=[])``.
        used = {field for op in self.operations for field in op.fields}
        inferred = {field for op in self.operations for field in op.result_fields}

        # Anything produced by an operation does not need to be supplied.
        self.fields = sorted(used - inferred)
        self.inferred_cols = sorted(inferred)

    def to_series(self, row):
        """
        Pull this calculator's input fields out of ``row``.

        Parameters
        ----------
        row : pd.Series,dict
            Anything offering ``.get(field)``; absent fields come back as
            whatever ``.get`` returns for them.

        Returns
        -------
        pd.Series
            Indexed by ``self.fields``.
        """

        series = pd.Series({field: row.get(field) for field in self.fields})
        return series

    def explain(self, row):
        """
        Print the inputs, each operation's explanation and the final values.

        Parameters
        ----------
        row : pd.Series,dict
        """

        # ``.get`` mirrors ``to_series`` so a missing input is displayed
        # instead of raising before any processing starts.
        ins = [f'{f}:{row.get(f)}' for f in self.fields]
        print('Input:', ', '.join(ins))

        res = self.process_single(row, explain=True)
        outs = [f'{f}:{res[f]}' for f in self.inferred_cols]
        print('Resulting in:', ', '.join(outs))

    def __add__(self, other):
        """
        Concatenate the operations of two calculators.

        The result keeps the name of the left-hand calculator and runs its
        operations first, followed by those of ``other``.
        """

        return AbstractCalculator(self.name, self.operations+other.operations)

    def process_single(self, row, explain=False):
        """
        Run every operation, in order, against a single record.

        Parameters
        ----------
        row : pd.Series,dict
        explain : bool
            When true, print the fields each operation calculated together
            with the operation's own explanation.

        Returns
        -------
        pd.Series
            The input fields plus every value yielded by the operations.
        """

        data = self.to_series(row)
        for operation in self.operations:
            cfields = []
            # Results are written back immediately so that later operations
            # can read values produced by earlier ones.
            for field, val in operation(data):
                data[field] = val
                cfields.append(field)
            if explain:
                explanation = operation.explain(data)
                print(f'Calculating: {cfields}')
                print(explanation, '\n')

        return pd.Series(data)


    def process_dataframe(self, df, mapping = None):
        """
        Apply ``process_single`` to every row of a dataframe.

        Parameters
        ----------
        df : pd.DataFrame
        mapping : dict
            Optional ``{column name in df: field name}`` renaming applied
            before processing; ``df`` itself is not modified.

        Returns
        -------
        pd.DataFrame

        """

        if mapping is not None:
            clean_data = df.rename(columns=mapping)
        else:
            clean_data = df

        res = clean_data.apply(self.process_single, axis=1)
        return res

However, you'll probably never need to use that directly.
Instead, you'll likely use the `TestCalculator` object.


In [5]:
#export

class TestCalculator(AbstractCalculator):
    """Calculator that can be constructed from a parsed configuration mapping."""

    def __init__(self, name, operations):
        """
        Parameters
        ----------
        name : str
        operations : list[AbstractOperation]
        """
        super().__init__(name, operations)

    @classmethod
    def from_config(cls, config):
        """
        Build a calculator from a configuration mapping.

        Parameters
        ----------
        config : dict
            Must provide ``'short_name'`` (used as the calculator name) and
            ``'operations'``; each entry of ``'operations'`` is handed to
            ``operators.AbstractOperation.from_config``.

        Returns
        -------
        TestCalculator
            An instance of the class this method is called on, so that
            subclasses build instances of themselves.
        """
        name = config['short_name']
        ops = [operators.AbstractOperation.from_config(c) for c in config['operations']]
        return cls(name, ops)

The `TestCalculator` holds a sequential list of `operators.AbstractOperation` objects to perform on a single object.
While one could create these entirely in Python, it's more likely that you'll load these from a set of `yaml` files.
Here's the example for the BVMT test.

In [6]:
!cat data/test_calculators/BVMT.yaml

name: 'Brief Visiospatial Memory Test'
short_name: 'BVMT'


operations:
  - type: equation
    equation: 'trial1+trial2+trial3'
    fields: ['trial1', 'trial2', 'trial3']
    out_field: immediate
  - type: agg
    method: 'max'
    fields: ['trial2', 'trial3']
    out_field: retention_denom
  - type: equation
    equation: 'delay/retention_denom'
    fields: [ 'delay', 'retention_denom']
    out_field: retention
  - type: clip
    field: retention
    lower: 0
    upper: 1
  - type: equation
    equation: 'hits-false_pos'
    fields: ['hits', 'false_pos']
    out_field: 'recognition'



It defines how to calculate the `immediate`, `retention`, and `recognition` values from the raw measurements.
We can load this in from the `yaml` file easily.

We'll imagine an individual to test.

Measured Values:
 - `trial1` - 5
 - `trial2` - 6
 - `trial3` - 7
 - `delay` - 8
 - `hits` - 6
 - `false_pos` - 2
 - `copy` - 12


In [7]:
import yaml

DATA = {'trial1': 5, 'trial2': 6, 'trial3': 7,
        'delay': 8, 'hits': 6, 'false_pos': 2,
        'copy': 12}


# Read the config inside a context manager so the file handle is closed
# as soon as the YAML has been parsed.
with open('data/test_calculators/BVMT.yaml') as handle:
    bvmt_calc = TestCalculator.from_config(yaml.full_load(handle))
result = bvmt_calc.process_single(DATA)

assert result['immediate'] == 18
assert result['recognition'] == 4
assert result['retention'] == 1 # clipped from a raw 8/7

bvmt_calc.explain(DATA)

Taking: delay:8, false_pos:2, hits:6, trial1:5, trial2:6, trial3:7
Used Equation: trial1+trial2+trial3 = 18 = immediate
Aggregation: max [trial2, trial3]  = 7
Used Equation: delay/retention_denom = 1.1428571428571428 = retention
Clipped retention to [0, 1]
Used Equation: hits-false_pos = 4.0 = recognition
Resulting in: immediate:18.0, recognition:4.0, retention:1.0, retention_denom:7.0


Sweet! Everything got calculated automatically and I have easy descriptions of how each value was calculated.
What if I need to process a lot of data?

Put it into a `pd.DataFrame` and then use the `process_dataframe` method.

In [8]:
# Five copies of the example record: enough to show the row-wise batch path.
example_rows = [DATA for _ in range(5)]
tdf = pd.DataFrame(example_rows)
bvmt_calc.process_dataframe(tdf)

Unnamed: 0,delay,false_pos,hits,trial1,trial2,trial3,immediate,retention_denom,retention,recognition
0,8.0,2.0,6.0,5.0,6.0,7.0,18.0,7.0,1.0,4.0
1,8.0,2.0,6.0,5.0,6.0,7.0,18.0,7.0,1.0,4.0
2,8.0,2.0,6.0,5.0,6.0,7.0,18.0,7.0,1.0,4.0
3,8.0,2.0,6.0,5.0,6.0,7.0,18.0,7.0,1.0,4.0
4,8.0,2.0,6.0,5.0,6.0,7.0,18.0,7.0,1.0,4.0


Awesome, how do we deal with our full neuro data?
It uses lots of different column names.
Easy: the `process_dataframe` method accepts a `mapping` from the dataset's column names to the calculator's field names.

In [16]:
# Keep the first record of each (VisitDate, PatientID) pair and swap the
# numeric Race/Sex codes for labels. Built as one chained expression rather
# than an in-place replace, so re-running the cell always starts from the file.
all_neuro = (
    pd.read_excel('data/neuro_data.xlsx', na_values=['na', '-', 'nd'])
    .groupby(['VisitDate', 'PatientID'])
    .first()
    .replace({'Race': {2: 'AA',
                       1: 'white',
                       3: 'asian'},
              'Sex': {1: 'male', 2: 'female'}})
)
all_neuro.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Visit,Age,Sex,Race,Race_specify,Education,EngFluency,SpanFluency,MMSE,DigitBackwardSENAS,...,Professional,Homeowner,Householdsize,Earnings.Indiv,Income.Indiv,Earnings.House,Income.House,Income.House.Weighted1,Income.House.Weighted2,Effort
VisitDate,PatientID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2013-09-24,A0091,1.0,36,male,AA,,16,3.0,0.0,27.0,6.0,...,,,,,,,,,,1.0
2013-09-24,A0107,1.0,49,female,AA,,12,3.0,0.0,27.0,3.0,...,,,,,,,,,,
2013-09-24,A0138,1.0,53,female,AA,,11,3.0,0.0,,,...,,,,,,,,,,1.0
2013-10-03,A0165,1.0,62,male,AA,,12,3.0,0.0,29.0,5.0,...,,,,,,,,,,
2013-10-08,A0127,1.0,62,male,AA,,16,3.0,0.0,28.0,2.0,...,,,,,,,,,,


In [10]:
# Dataset column name -> calculator field name.
bvmt_columns = {'BVMTtrial1': 'trial1',
                'BVMTtrial2': 'trial2',
                'BVMTtrial3': 'trial3',
                'BVMTdelay': 'delay',
                'BVMThits': 'hits',
                'BVMTfalsepos': 'false_pos'}

# Only rows with both a first trial and a delay score; a handful is enough here.
bvmt_preview = all_neuro.dropna(subset = ['BVMTtrial1', 'BVMTdelay']).head()
bvmt_calc.process_dataframe(bvmt_preview, mapping = bvmt_columns)

Unnamed: 0_level_0,Unnamed: 1_level_0,delay,false_pos,hits,trial1,trial2,trial3,immediate,retention_denom,retention,recognition
VisitDate,PatientID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013-10-03,A0165,6.0,0.0,5.0,6.0,6.0,7.0,19.0,7.0,0.857143,5.0
2013-10-08,A0127,6.0,0.0,6.0,5.0,6.0,9.0,20.0,9.0,0.666667,6.0
2013-10-08,A0164,7.0,2.0,6.0,5.0,6.0,7.0,18.0,7.0,1.0,4.0
2013-10-10,A0421,7.0,0.0,6.0,6.0,8.0,9.0,23.0,9.0,0.777778,6.0
2013-10-15,A0067,4.0,1.0,4.0,2.0,4.0,4.0,10.0,4.0,1.0,3.0


Awesome, a generic method for aggregating tests.
Can we then follow this with another normalization scheme?

`TestCalculator`s can be added together to concatenate their operations.
This allows for a modular design of tests.
Since the BVMT test is common, but there are many normalization schemes, we can use the calculator multiple times.

Let's load in the `heaton` norms for BVMT test.

In [11]:
path = 'data/norms/from_kate/heaton_bvmt.yaml'
# Context manager closes the file once the YAML has been parsed.
with open(path) as handle:
    heaton_bvmt_calc = TestCalculator.from_config(yaml.full_load(handle))

heaton_bvmt_calc.explain({'age': 22,
                          'immediate': 30,
                          'retention': 0.91,
                          'delay': 11,
                          'recognition': 2})

Taking: age:22, delay:11, immediate:30, recognition:2, retention:0.91
heaton_immediate: Matched (20 <= age) & (age <= 23), Expecting 28.44 +- 4.38, Observed: 30.0
heaton_retention: Matched (20 <= age) & (age <= 23), Expecting 0.9493 +- 0.0726, Observed: 0.91
heaton_delay: Matched (20 <= age) & (age <= 23), Expecting 10.68 +- 1.41, Observed: 11.0
heaton_recognition: Matched (20 <= age) & (age <= 23), Expecting 5.92 +- 0.26, Observed: 2.0
Resulting in: heaton_delay:0.2269503546099293, heaton_immediate:0.35616438356164354, heaton_recognition:-15.076923076923077, heaton_retention:-0.5413223140495869


We can just `add` them!

In [12]:
# BVMT scoring followed by the Heaton norms, as a single calculator.
full_calc = bvmt_calc + heaton_bvmt_calc

# Dataset column name -> calculator field name (the norms also need age).
heaton_columns = {'BVMTtrial1': 'trial1',
                  'BVMTtrial2': 'trial2',
                  'BVMTtrial3': 'trial3',
                  'BVMTdelay': 'delay',
                  'BVMThits': 'hits',
                  'BVMTfalsepos': 'false_pos',
                  'Age': 'age'}

heaton_preview = all_neuro.dropna(subset = ['BVMTtrial1', 'BVMTdelay']).head()
full_calc.process_dataframe(heaton_preview, mapping = heaton_columns)


Unnamed: 0_level_0,Unnamed: 1_level_0,age,delay,false_pos,hits,trial1,trial2,trial3,immediate,retention_denom,retention,recognition,heaton_immediate,heaton_retention,heaton_delay,heaton_recognition
VisitDate,PatientID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2013-10-03,A0165,62.0,6.0,0.0,5.0,6.0,6.0,7.0,19.0,7.0,0.857143,5.0,-0.679035,-0.917898,-1.215962,-1.954545
2013-10-08,A0127,62.0,6.0,0.0,6.0,5.0,6.0,9.0,20.0,9.0,0.666667,6.0,-0.493506,-3.10728,-1.215962,0.318182
2013-10-08,A0164,56.0,7.0,2.0,6.0,5.0,6.0,7.0,18.0,7.0,1.0,4.0,-0.998106,0.711744,-0.882927,-2.948276
2013-10-10,A0421,63.0,7.0,0.0,6.0,6.0,8.0,9.0,23.0,9.0,0.777778,6.0,0.172727,-1.010536,-0.622727,0.166667
2013-10-15,A0067,47.0,4.0,1.0,4.0,2.0,4.0,4.0,10.0,4.0,1.0,3.0,-2.856016,0.776084,-2.763158,-8.142857


The `norman` regression norms have also been created in the same format.


In [15]:
path = 'data/norms/norman/norman_scaling.yaml'
# Context manager closes the file once the YAML has been parsed.
with open(path) as handle:
    norman_scale = TestCalculator.from_config(yaml.full_load(handle))

# Extra fields added to the shared example record for the scaling step.
DATA['gender'] = 'male'
DATA['race'] = 'AA'

norman_scaled_calc = bvmt_calc + norman_scale
norman_scaled_calc.explain(DATA)



Taking: delay:8, false_pos:2, gender:male, hits:6, race:AA, trial1:5, trial2:6, trial3:7
Used Equation: trial1+trial2+trial3 = 18 = immediate
Aggregation: max [trial2, trial3]  = 7
Used Equation: delay/retention_denom = 1.1428571428571428 = retention
Clipped retention to [0, 1]
Used Equation: hits-false_pos = 4 = recognition
gender:male -> norman_gender:0
race:AA -> norman_race:1
delay matched 8, scaled to 7
immediate matched 16, scaled to 6
Resulting in: delay_scaled:7, immediate:18, immediate_scaled:6, norman_gender:0, norman_race:1, recognition:4, retention:1.0, retention_denom:7


In [14]:
# Context manager closes the file once the YAML has been parsed.
with open('data/norms/norman/norman_bvmt_regnorm.yaml') as handle:
    reg_calc = TestCalculator.from_config(yaml.full_load(handle))

full_norman = norman_scaled_calc + reg_calc
# Extra fields added to the shared example record for the regression step.
DATA['education'] = 6
DATA['age'] = 32

full_norman.explain(DATA)

#reg_calc.operations[0].regressions[1]['filter']

Taking: age:32, delay:8, education:6, false_pos:2, gender:male, hits:6, race:AA, trial1:5, trial2:6, trial3:7
Used Equation: trial1+trial2+trial3 = 18 = immediate
Aggregation: max [trial2, trial3]  = 7
Used Equation: delay/retention_denom = 1.1428571428571428 = retention
Clipped retention to [0, 1]
Used Equation: hits-false_pos = 4 = recognition
gender:male -> norman_gender:0
race:AA -> norman_race:1
delay matched 8, scaled to 7
immediate matched 16, scaled to 6
Matched (norman_race == 1) & ((age >= 18) & (age <= 66)), applied ((immediate_scaled-(0.2834*(education-13.86)+(-0.1125)*(age-40.63)+1.0394*norman_gender + 8.0679))/2.5701)*10 + 50 = -0.31564958561923645
Matched (norman_race == 1) & ((age >= 18) & (age <= 66)), applied ((delay_scaled-(0.2267*(education-13.86) + (-0.1262)*(age-40.63) + 0.8593*norman_gender + 7.691))/2.5197)*10 + 50 = 0.0006969083621065408
Resulting in: delay_scaled:7, immediate:18, immediate_scaled:6, norman_delay:0.0006969083621065408, norman_gender:0, norman_i

And then we can do the full collection across the whole dataset easily.

In [22]:
# Scoring, Heaton norms, Norman scaling and Norman regression norms in one pass.
full_norman_heaton = bvmt_calc + heaton_bvmt_calc + norman_scale + reg_calc


# Dataset column name -> calculator field name.
cohort_columns = {'BVMTtrial1': 'trial1',
                  'BVMTtrial2': 'trial2',
                  'BVMTtrial3': 'trial3',
                  'BVMTdelay': 'delay',
                  'BVMThits': 'hits',
                  'BVMTfalsepos': 'false_pos',
                  'Sex': 'gender',
                  'Race': 'race',
                  'Education': 'education',
                  'Age': 'age'}

# Every row that has both a first trial and a delay score.
cohort_rows = all_neuro.dropna(subset = ['BVMTtrial1', 'BVMTdelay'])
processed_data = full_norman_heaton.process_dataframe(cohort_rows, mapping = cohort_columns)

normed_columns = ['heaton_immediate', 'heaton_retention', 'heaton_delay', 'heaton_recognition',
                  'norman_immediate', 'norman_delay']
processed_data[normed_columns].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,heaton_immediate,heaton_retention,heaton_delay,heaton_recognition,norman_immediate,norman_delay
VisitDate,PatientID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-10-03,A0165,-0.679035,-0.917898,-1.215962,-1.954545,0.72501,0.169685
2013-10-08,A0127,-0.493506,-3.10728,-1.215962,0.318182,0.283938,-0.190199
2013-10-08,A0164,-0.998106,0.711744,-0.882927,-2.948276,-0.147251,0.086104
2013-10-10,A0421,0.172727,-1.010536,-0.622727,0.166667,1.157873,0.616643
2013-10-15,A0067,-2.856016,0.776084,-2.763158,-8.142857,-0.878313,-0.798525


Awesome, pretty easy and only a few loading commands to do a whole analysis.
