In [9]:
import os
import sys
import re
import time
import random
import warnings
import collections
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

# Make the project-local helpers in ../src importable; the path is relative to
# the notebook's working directory.
sys.path.append('../src')
import cb_utils

# Display configuration: seaborn "darkgrid" theme for plots, and show up to
# 500 columns when rendering wide DataFrames.
sns.set(style="darkgrid")
pd.options.display.max_columns = 500

%load_ext autoreload
%autoreload 2

In [17]:
os.getcwd()

'/Users/bp/workspace/cb/data-analytics/notebooks'

In [10]:
# SQL used to build the per-member token files.
#
# Both statements use named "pyformat" placeholders (%(name)s) that are bound
# from a dict at cursor.execute() time. They are deliberately plain strings,
# not f-strings: nothing is interpolated on the Python side, and an f-prefix
# would make any literal brace added to the SQL later a formatting error.

# All member ids belonging to one MCO. Parameters: mco_id.
member_ids_query = "SELECT id FROM cb.members m where m.mco_id = %(mco_id)s"

# One row per (member, date_from) encounter for a single member.
# Parameters: mco_id, member_id.
#
#   encounter_level   - groups the member's non-rx claims (excluding the listed
#                       service_type_ids) by date_from and collects their claim ids.
#   lagged_encounters - adds the previous encounter's date_from via LAG().
#   final SELECT      - returns member_id, date_from, days_since_last_encounter
#                       (NULL for the first encounter) and icds_by_seq, the
#                       non-null diagnosis codes ordered by claim_line_id and
#                       diag_sequence.
#
# NOTE(review): `cd.mco_id = %(mco_id)s` sits in the WHERE clause, so claims
# without any claims_diagnosis row are filtered out and the LEFT JOIN behaves
# like an inner join. Moving the predicate into the ON clause would keep those
# encounters, but icds_by_seq would then be NULL for them - confirm intent.
# NOTE(review): the joins to ref.place_of_services and ref.service_types are
# not referenced anywhere else in the statement - confirm they can be removed.
member_claims_query = """
  WITH
      encounter_level   AS ( SELECT DISTINCT
                                 member_id
                               , date_from
                               , array_agg(distinct c.id) claim_ids
                             FROM
                                 cb.claims c
                                 LEFT JOIN ref.place_of_services pos ON pos.id = c.place_of_service_id
                                 LEFT JOIN ref.service_types st ON st.id = c.service_type_id
                             WHERE
                                   c.mco_id = %(mco_id)s
                               and c.member_id = %(member_id)s
                               AND c.service_type_id NOT IN (12, 13, 17, 18, 10, 15, 16)
                               AND NOT c.is_rx
                             GROUP BY 1,2
                             )
    , lagged_encounters AS ( SELECT
                                 el.*
                               , LAG(date_from) OVER (PARTITION BY member_id ORDER BY date_from) prev_claim_date
                             FROM
                                 encounter_level el )
  SELECT
      le.member_id
    , c.date_from
    , le.date_from - prev_claim_date days_since_last_encounter
    , ARRAY_AGG(cd.diag ORDER BY c.claim_line_id, cd.diag_sequence) FILTER ( WHERE cd.diag IS NOT NULL) icds_by_seq
  FROM
      lagged_encounters le
      JOIN cb.claims c ON c.id = any(le.claim_ids)
      LEFT JOIN cb.claims_diagnosis cd ON c.id = cd.claim_id
  WHERE
       c.mco_id = %(mco_id)s
   and cd.mco_id = %(mco_id)s
   and c.member_id = %(member_id)s
  GROUP BY
      1, 2, 3
  ORDER BY
      1, 2
;
   """

In [11]:
def get_days_cat(time_to_last_claim):
    """Bucket the gap since the previous encounter into a ``ttlc_*`` token.

    Args:
        time_to_last_claim: Non-negative number; the caller passes the query's
            ``days_since_last_encounter`` column.

    Returns:
        ``'ttlc_0'`` for exactly 0; otherwise ``'ttlc_<N>'`` where N is the
        smallest of 1, 2, 5, 10, 15, 30, 60, 90, 180 that is >= the value;
        ``'ttlc_gt180'`` for anything larger.

    Raises:
        ValueError: If ``time_to_last_claim`` is negative.
    """
    if time_to_last_claim < 0:
        # Raising a bare string is not valid in Python 3 (it fails with
        # "exceptions must derive from BaseException"); use a real exception.
        raise ValueError(
            f"Got a negative time to last claim ({time_to_last_claim!r}). should never happen"
        )

    # Exact zero has its own bucket; fractional values in (0, 1] fall into ttlc_1.
    if time_to_last_claim == 0:
        return 'ttlc_0'

    # Inclusive upper bounds, ascending; the first one that fits names the bucket.
    for upper_bound in (1, 2, 5, 10, 15, 30, 60, 90, 180):
        if time_to_last_claim <= upper_bound:
            return f'ttlc_{upper_bound}'

    return 'ttlc_gt180'

In [12]:
def fetch_and_build_member_data(cur, mco_id, member_id, save_path):
    """Write one member's encounter history as a single space-separated token line.

    Executes ``member_claims_query`` for the member and writes
    ``{save_path}/{mco_id}_{member_id}.txt``. The text starts with ``xxbos``;
    for each returned row, in cursor order, it appends a ``ttlc_*`` gap token
    (skipped when the gap is NULL) followed by that row's diagnosis codes with
    duplicates removed.

    Args:
        cur: Database cursor. It is re-executed here, so any result set still
            pending on it is discarded.
        mco_id: Value bound to ``%(mco_id)s``; also part of the file name.
        member_id: Value bound to ``%(member_id)s``; also part of the file name.
        save_path: Directory to write into.
    """
    cur.execute(member_claims_query, {'mco_id': mco_id, 'member_id': member_id})

    tokens = ['xxbos']
    # Row columns: member_id, date_from, days_since_last_encounter, icds_by_seq.
    # The row's member_id is unpacked under a throwaway name so it cannot
    # shadow the parameter that names the output file.
    for _row_member_id, _date_from, ttlc, icds_by_seq in cur:
        if ttlc is not None:
            tokens.append(get_days_cat(ttlc))
        # dict.fromkeys de-duplicates while keeping first-seen order. The
        # aggregate is NULL (None) when a row has no non-null diagnosis code;
        # treat that as "no codes" instead of crashing.
        tokens.extend(dict.fromkeys(icds_by_seq or ()))

    file_name = f'{save_path}/{mco_id}_{member_id}.txt'

    with open(file_name, 'w') as f:
        f.write(' '.join(tokens))

In [18]:
def build_language_model_data_for_mco(mco_id, save_path='./data/just_icds'):
    """Generate one token file per member of an MCO.

    Fetches every member id for ``mco_id`` and calls
    ``fetch_and_build_member_data`` for each, showing a tqdm progress bar.

    Args:
        mco_id: MCO whose members are exported.
        save_path: Output directory; created if it does not exist. Defaults to
            the location the modelling cells read from.
    """
    os.makedirs(save_path, exist_ok=True)

    conn = cb_utils.get_conn()
    try:
        cur = conn.cursor()
        try:
            cur.execute(member_ids_query, {'mco_id': mco_id})
            # Materialise the ids before looping: the same cursor is re-executed
            # for every member, which would discard this result set.
            member_ids = [row[0] for row in cur]

            for member_id in tqdm(member_ids):
                fetch_and_build_member_data(cur, mco_id, member_id, save_path)
        finally:
            cur.close()
    finally:
        # NOTE(review): assumes cb_utils.get_conn() hands out a fresh connection
        # per call; if it returns a shared/cached one, drop this close - confirm.
        conn.close()

In [19]:
build_language_model_data_for_mco(2)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29670/29670 [57:38<00:00,  8.58it/s]


In [20]:
build_language_model_data_for_mco(1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7438/7438 [11:23<00:00, 10.87it/s]


In [21]:
build_language_model_data_for_mco(4)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13915/13915 [38:39<00:00,  6.00it/s]


In [22]:
build_language_model_data_for_mco(5)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 22251/22251 [1:27:33<00:00,  4.24it/s]


In [23]:
build_language_model_data_for_mco(6)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5900/5900 [34:35<00:00,  2.84it/s]


In [25]:
build_language_model_data_for_mco(7)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14045/14045 [16:51<00:00, 13.88it/s]


In [27]:
build_language_model_data_for_mco(8)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17679/17679 [28:23<00:00, 10.38it/s]


In [28]:
build_language_model_data_for_mco(9)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2185/2185 [04:44<00:00,  7.69it/s]


In [29]:
build_language_model_data_for_mco(12)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7706/7706 [33:07<00:00,  3.88it/s]


### Modelling

In [24]:
from fastai.text.all import *

ModuleNotFoundError: No module named 'fastai'

In [80]:
path = Path('./data')

In [81]:
files = get_text_files(path, folders = ['just_icds'])

In [82]:
len(files)

244687

In [85]:
# Peek at one generated file; read_text() opens and closes the file itself
# instead of leaving the handle from open() to the garbage collector.
txt = files[10].read_text(); txt[:75]

'xxbos ttlc_0 j40 j441 j40 j441 ttlc_90 j40 j441 ttlc_60 j40 j441 ttlc_30 j4'

In [96]:
# Split the first 2000 files into tokens on single spaces (the files are already
# space-separated tokens). read_text() closes each file as soon as it is read,
# rather than leaving 2000 open() handles to be collected.
txts = L(o.read_text().split(' ') for o in files[:2000])

In [97]:
num = Numericalize()
num.setup(txts)

In [98]:
coll_repr(num.vocab, 20)

"(#5944) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','ttlc_0','ttlc_1','i10','r5381','n186','e119','j449','r6889','r53','d631','n2581'...]"

In [107]:
nums = txts.map(num)

In [108]:
' '.join(num.vocab[o] for o in nums[0])

'xxbos ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311

In [109]:
dl = LMDataLoader(nums)

In [110]:
x,y = first(dl)

In [112]:
x.shape, y.shape

(torch.Size([64, 72]), torch.Size([64, 72]))

In [113]:
' '.join(num.vocab[o] for o in x[0][:20])

'xxbos ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630'

In [114]:
' '.join(num.vocab[o] for o in y[0][:20])

'ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634 d631 ttlc_0 i1311 n185 z905 r630 r634'

In [120]:
dl?

[0;31mType:[0m        LMDataLoader
[0;31mString form:[0m <fastai.text.data.LMDataLoader object at 0x1583917c0>
[0;31mLength:[0m      463
[0;31mFile:[0m        ~/.local/share/virtualenvs/data-analytics-1yVNxZKx/lib/python3.8/site-packages/fastai/text/data.py
[0;31mDocstring:[0m   A `DataLoader` suitable for language modeling


In [121]:
# Language-model DataLoaders over every generated token file, holding out 10%
# of the files for validation. A fixed seed keeps the random train/valid split
# identical between runs so metrics are comparable (it was seed=None before).
dls = TextDataLoaders.from_folder(
    path / 'just_icds',
    valid_pct=.1,
    seed=42,
    is_lm=True,
    tok_tfm=None,
    seq_len=72,
    backwards=False,
    bs=64,
    val_bs=None,
    shuffle=True,
    device=None,
)


In [124]:
# AWD-LSTM language model over the ICD token vocabulary. pretrained=False
# because the tokens are diagnosis codes and ttlc_* gap markers, not English
# text. Reports next-token accuracy and perplexity; to_fp16() switches the
# learner to mixed-precision training.
learn = language_model_learner(
    dls, AWD_LSTM, drop_mult=0.3, 
    pretrained=False,
    metrics=[accuracy, Perplexity()]).to_fp16()



In [125]:
learn.fit_one_cycle(1, 

[0;31mSignature:[0m
[0mlearn[0m[0;34m.[0m[0mfit_one_cycle[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_epoch[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlr_max[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdiv[0m[0;34m=[0m[0;36m25.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdiv_final[0m[0;34m=[0m[0;36m100000.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpct_start[0m[0;34m=[0m[0;36m0.25[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mwd[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmoms[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcbs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreset_opt[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Fit `self.model` for `n_epoch` using the 1cycle policy.
[0;31mFile:[0m      ~/.local/share/virtualenvs/data-analytics-1yVNxZKx/lib/python3

In [130]:
learn.lr_find()