In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# imported `avoidable_admissions` package take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

# HDRUK Acute Admissions Feature Engineering

This is an example using code available at https://github.com/lthtr-dst/hdruk_avoidable_admissions

This code is under active development, so expect breaking changes and bugs.

Please check the commit history, make sure you have the latest clone of the repo, and update your conda environment using the `environment.yaml` file.

Please raise an issue if you find a bug or have a question.

## Admitted Care

In [2]:
import numpy as np
import pandas as pd

import avoidable_admissions as aa

In [3]:
# Load the pickled admitted care data; the path is relative to the notebook's
# working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df = pd.read_pickle("../data/processed/admitted_care.pickle")

In [4]:
# List the column names of the loaded frame.
df.columns

Index(['patient_id', 'gender', 'ethnos', 'procodet', 'sitetret',
       'townsend_score_decile', 'admimeth', 'admisorc', 'admidate', 'admitime',
       'disdest', 'dismeth', 'length_of_stay', 'epiorder', 'admiage',
       'diag_01', 'diag_02', 'diag_03', 'diag_04', 'diag_05', 'diag_06',
       'diag_07', 'diag_08', 'diag_09', 'diag_10', 'diag_11', 'diag_12',
       'diag_13', 'diag_14', 'diag_15', 'diag_16', 'opertn_01', 'opdate_01',
       'opertn_02', 'opertn_03', 'opertn_04', 'opertn_05', 'opertn_06',
       'opertn_07', 'opertn_08', 'opertn_09', 'opertn_10', 'opertn_11',
       'opertn_12', 'opertn_13', 'opertn_14', 'opertn_15', 'opertn_16',
       'opdate_02', 'opdate_03', 'opdate_04', 'opdate_05', 'opdate_06',
       'opdate_07', 'opdate_08', 'opdate_09', 'opdate_10', 'opdate_11',
       'opdate_12', 'opdate_13', 'opdate_14', 'opdate_15', 'opdate_16',
       'visit_id'],
      dtype='object')

In [5]:
# Take a copy of `df`; the validation below operates on `dfa` rather than on
# the frame that was loaded from disk.
dfa = df.copy()

## First Validation

In [6]:
# Validate the admitted care data in `dfa`. The call returns two objects,
# unpacked here as `good` and `bad`: the cells below report their row counts as
# rows that passed / failed validation and display `bad` as a table of
# failure cases.
good, bad = aa.data.validate.validate_admitted_care_data(dfa)

Schema AdmittedCareEpisodeSchema: A total of 1 schema errors were found.

Error Counts
------------
- schema_component_check: 1

Schema Error Summary
--------------------
                                                                                failure_cases  n_failure_cases
schema_context column   check                                                                                 
Column         admitime str_matches(re.compile('2[0-3]|[01]?[0-9]:[0-5][0-9]'))         [nan]                1

Usage Tip
---------

Directly inspect all errors by catching the exception:

```
try:
    schema.validate(dataframe, lazy=True)
except SchemaErrors as err:
    err.failure_cases  # dataframe of schema errors
    err.data  # invalid dataframe
```



In [7]:
# Report row counts before and after the first validation.
# NOTE(review): `bad` looks like a table of failure cases (one row per failed
# check), so its row count equals the number of failed input rows only if no
# row fails more than one check — confirm.
print(
    f"""
Total number of rows in input data   : {dfa.shape[0]}
Number of rows that passed validation: {good.shape[0]}
Number of rows that failed validation: {bad.shape[0]}
"""
)


Total number of rows in input data   : 39168
Number of rows that passed validation: 39156
Number of rows that failed validation: 12



In [8]:
# For each validation failure, show which check failed on which column, the
# offending value, and the index it was reported at.
bad[["schema_context", "column", "check", "check_number", "failure_case", "index"]]

Unnamed: 0,schema_context,column,check,check_number,failure_case,index
0,Column,admitime,str_matches(re.compile('2[0-3]|[01]?[0-9]:[0-5...,0,,3487
1,Column,admitime,str_matches(re.compile('2[0-3]|[01]?[0-9]:[0-5...,0,,6714
2,Column,admitime,str_matches(re.compile('2[0-3]|[01]?[0-9]:[0-5...,0,,7099
3,Column,admitime,str_matches(re.compile('2[0-3]|[01]?[0-9]:[0-5...,0,,8065
4,Column,admitime,str_matches(re.compile('2[0-3]|[01]?[0-9]:[0-5...,0,,8103
5,Column,admitime,str_matches(re.compile('2[0-3]|[01]?[0-9]:[0-5...,0,,9622
6,Column,admitime,str_matches(re.compile('2[0-3]|[01]?[0-9]:[0-5...,0,,12708
7,Column,admitime,str_matches(re.compile('2[0-3]|[01]?[0-9]:[0-5...,0,,13417
8,Column,admitime,str_matches(re.compile('2[0-3]|[01]?[0-9]:[0-5...,0,,16933
9,Column,admitime,str_matches(re.compile('2[0-3]|[01]?[0-9]:[0-5...,0,,17772


## Feature Engineering

In [9]:
# Build features from a copy of `good`, i.e. the rows that passed the first
# validation when the notebook is run top to bottom.
# NOTE(review): `good` is rebound by the second validation cell further down,
# so re-running this cell after that one would start from the
# feature-validated rows instead.
dfa_features = aa.features.build_features.build_admitted_care_features(good.copy())

## Second Validation

In [10]:
# Validate the engineered features and report row counts before and after.
# `good` and `bad` are rebound here, replacing the results of the first
# validation.
# NOTE(review): `bad` looks like a table of failure cases (one row per failed
# check), so its row count equals the number of failed input rows only if no
# row fails more than one check — confirm.
good, bad = aa.data.validate.validate_admitted_care_features(dfa_features)
print(
    f"""
Total number of rows in input data   : {dfa_features.shape[0]}
Number of rows that passed validation: {good.shape[0]}
Number of rows that failed validation: {bad.shape[0]}
"""
)

Schema AdmittedCareEpisodeSchema: A total of 1 schema errors were found.

Error Counts
------------
- schema_component_check: 1

Schema Error Summary
--------------------
                                                                                                 failure_cases  n_failure_cases
schema_context column       check                                                                                              
Column         admisorc_cat isin({'Care Home', 'Residence', 'Penal', 'Unknown', 'Medical care'})      [56, 40]                2

Usage Tip
---------

Directly inspect all errors by catching the exception:

```
try:
    schema.validate(dataframe, lazy=True)
except SchemaErrors as err:
    err.failure_cases  # dataframe of schema errors
    err.data  # invalid dataframe
```


Total number of rows in input data   : 39156
Number of rows that passed validation: 39133
Number of rows that failed validation: 23



In [11]:
# Show the failure cases from the feature validation, using the same columns
# as for the first validation: failed check, column, offending value and the
# index it was reported at.
bad[["schema_context", "column", "check", "check_number", "failure_case", "index"]]

Unnamed: 0,schema_context,column,check,check_number,failure_case,index
0,Column,admisorc_cat,"isin({'Care Home', 'Residence', 'Penal', 'Unkn...",0,56,2202
12,Column,admisorc_cat,"isin({'Care Home', 'Residence', 'Penal', 'Unkn...",0,56,24719
21,Column,admisorc_cat,"isin({'Care Home', 'Residence', 'Penal', 'Unkn...",0,56,35587
20,Column,admisorc_cat,"isin({'Care Home', 'Residence', 'Penal', 'Unkn...",0,40,33618
19,Column,admisorc_cat,"isin({'Care Home', 'Residence', 'Penal', 'Unkn...",0,40,32706
18,Column,admisorc_cat,"isin({'Care Home', 'Residence', 'Penal', 'Unkn...",0,40,32444
17,Column,admisorc_cat,"isin({'Care Home', 'Residence', 'Penal', 'Unkn...",0,56,31535
16,Column,admisorc_cat,"isin({'Care Home', 'Residence', 'Penal', 'Unkn...",0,56,31422
15,Column,admisorc_cat,"isin({'Care Home', 'Residence', 'Penal', 'Unkn...",0,56,30519
14,Column,admisorc_cat,"isin({'Care Home', 'Residence', 'Penal', 'Unkn...",0,56,28639
