In [6]:
import os
import sys
import re
import time
import random
import warnings
import collections
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

sys.path.append('../src')
import cb_utils

# Notebook-wide display settings: seaborn's dark-grid theme for plots and
# up to 500 columns shown when a DataFrame is rendered.
sns.set(style="darkgrid")
pd.options.display.max_columns = 500
# Print the working directory; the relative paths used in this notebook
# ('../src', './data/...') are resolved against it.
print('pwd', os.getcwd())

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. cb_utils under ../src) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

pwd /Users/bp/workspace/cb/data-analytics/notebooks


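### Try combining claim-level chunks that take the form
`ttlc claim_service_type cpt_1 dx1 dx2 dx3 cpt_2 dx4 dx5`

Here `ttlc` is the bucketed time to the last claim. The cells below build the Rx-only variant of this format: `ttlc ndc_1 ndc_2 ...`.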

In [20]:
# Ids of every member that belongs to one MCO. `%(mco_id)s` is a driver-side
# (pyformat) bind parameter, so these are plain strings rather than f-strings.
member_ids_query = "SELECT id FROM cb.members m where m.mco_id = %(mco_id)s"

# One row per distinct (member, fill date, NDC) pharmacy claim for a single
# member, ordered by date and then NDC, together with the gap to the previous row.
#
# The LAG window is ordered by (date_from, rx_ndc_code) -- the same order as the
# final ORDER BY. Ordering the window by date_from alone leaves rows that share a
# date as peers whose relative order is unspecified, so the row carrying the real
# gap to the previous fill date could be any of them. With the tie-breaker the
# first NDC row of each date always carries that gap (NULL on the member's very
# first row) and the remaining rows of the same date carry a gap of 0.
member_claims_query = """
  WITH
      encounter_level   AS ( SELECT DISTINCT
                                    c.member_id
                                  , c.date_from
                                  , c.rx_ndc_code
                             FROM
                                 cb.claims c
                             WHERE
                                   c.mco_id = %(mco_id)s
                               and c.member_id = %(member_id)s
                               AND c.is_rx
                               and c.rx_ndc_code is not null
                             )
    , lagged_encounters AS ( SELECT
                                 el.*
                               , LAG(date_from) OVER (PARTITION BY member_id ORDER BY date_from, rx_ndc_code) prev_claim_date
                             FROM
                                 encounter_level el )

  SELECT distinct
    le.member_id
  , le.date_from
  , le.rx_ndc_code
  , le.date_from - prev_claim_date                                                                    days_since_last_encounter
FROM
    lagged_encounters le
ORDER BY
    1,2,3
;
   """

In [21]:
def get_days_cat(time_to_last_claim):
    """Bucket a gap since the previous claim into a coarse ``ttlc_*`` token.

    Args:
        time_to_last_claim: Non-negative number giving the time since the
            previous claim (the caller passes the day difference returned by
            the claims query).

    Returns:
        ``'ttlc_<bound>'`` for the smallest bound in
        ``0, 1, 2, 5, 10, 15, 30, 60, 90, 180`` that is greater than or equal to
        the input, or ``'ttlc_gt180'`` when the input exceeds 180.

    Raises:
        ValueError: If ``time_to_last_claim`` is negative.
    """
    if time_to_last_claim < 0:
        # Raising a bare string is itself a TypeError in Python 3, which would
        # hide this message; raise a real exception instead.
        raise ValueError(
            f"Got a negative time to last claim ({time_to_last_claim!r}). should never happen"
        )

    # Inclusive upper bounds of each bucket, in ascending order.
    for upper_bound in (0, 1, 2, 5, 10, 15, 30, 60, 90, 180):
        if time_to_last_claim <= upper_bound:
            return f'ttlc_{upper_bound}'

    return 'ttlc_gt180'

In [22]:
def fetch_and_build_member_data(cur, mco_id, member_id, save_path):
    """Write one member's NDC token sequence to ``{save_path}/{mco_id}_{member_id}.txt``.

    Runs ``member_claims_query`` for the member and emits, in row order, every
    NDC code. Each time the fill date changes, a ``ttlc_*`` token (from
    ``get_days_cat``) is emitted before that date's first NDC, provided the row
    has a non-NULL day gap. Every token is preceded by a single space, so a
    non-empty file starts with a space; a member with no rows gets an empty file.

    Args:
        cur: Open DB cursor; it is re-executed, so any pending result set on it
            is discarded.
        mco_id: MCO id bound into the query and used in the file name.
        member_id: Member id bound into the query and used in the file name.
        save_path: Existing directory the file is written into.
    """
    cur.execute(member_claims_query, {'mco_id': mco_id, 'member_id': member_id})

    tokens = []
    prev_date_from = None
    # The row's own member_id column is deliberately ignored so that it cannot
    # shadow the `member_id` argument used for the file name below.
    for _row_member_id, date_from, ndc, ttlc in cur:
        if date_from != prev_date_from:
            # ttlc is NULL on the member's first row: there is no previous claim.
            if ttlc is not None:
                tokens.append(get_days_cat(ttlc))
            # Advance the date marker even when no gap token was emitted;
            # otherwise the next NDC on the same first date would look like a
            # new encounter and get a spurious 'ttlc_0' in front of it.
            prev_date_from = date_from

        tokens.append(ndc)

    file_name = f'{save_path}/{mco_id}_{member_id}.txt'

    with open(file_name, 'w') as f:
        f.write(''.join(' ' + token for token in tokens))

In [23]:
def build_language_model_data_for_mco(mco_id, save_path='./data/just_ndcs'):
    """Write one NDC token file per member of ``mco_id`` into ``save_path``.

    Opens a connection using the ``db_connection_string_ro`` secret, lists the
    MCO's member ids, and calls ``fetch_and_build_member_data`` for each one
    behind a tqdm progress bar. The cursor and connection are closed on exit,
    including when a member fails part-way through.

    Args:
        mco_id: MCO whose members are exported.
        save_path: Output directory; created if missing. Defaults to the
            directory this notebook has always written to.
    """
    os.makedirs(save_path, exist_ok=True)

    conn = cb_utils.get_conn(cb_utils.get_secrets()['db_connection_string_ro'])
    try:
        cur = conn.cursor()
        try:
            cur.execute(member_ids_query, {'mco_id': mco_id})
            # Materialise the ids before the loop: the same cursor is re-executed
            # for every member, which discards this result set.
            member_ids = [row[0] for row in cur]

            for member_id in tqdm(member_ids):
                fetch_and_build_member_data(cur, mco_id, member_id, save_path)
        finally:
            cur.close()
    finally:
        conn.close()

In [24]:
# Export the per-member NDC files for MCO 1 (the recorded run below covered 7438 members in about 6 minutes).
build_language_model_data_for_mco(1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7438/7438 [06:11<00:00, 20.00it/s]


In [25]:
# Export the remaining MCOs one after another, announcing each so the
# progress bars in the output can be told apart.
remaining_mco_ids = (2, 4, 5, 6, 7, 8, 9, 12)
for current_mco in remaining_mco_ids:
    print('Pulling mco: ', current_mco)
    build_language_model_data_for_mco(current_mco)

Pulling mco:  2


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29753/29753 [26:02<00:00, 19.04it/s]


Pulling mco:  4


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14519/14519 [13:47<00:00, 17.55it/s]


Pulling mco:  5


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22251/22251 [29:34<00:00, 12.54it/s]


Pulling mco:  6


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5969/5969 [07:11<00:00, 13.84it/s]


Pulling mco:  7


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14045/14045 [13:18<00:00, 17.58it/s]


Pulling mco:  8


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17679/17679 [16:33<00:00, 17.79it/s]


Pulling mco:  9


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2185/2185 [02:19<00:00, 15.63it/s]


Pulling mco:  12


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7851/7851 [09:13<00:00, 14.19it/s]


In [21]:
# Single-MCO run for MCO 2; the `for mco_id in [...]` loop cell earlier in this notebook also covers MCO 2.
build_language_model_data_for_mco(2)

In [None]:
# Single-MCO run for MCO 4; the `for mco_id in [...]` loop cell earlier in this notebook also covers MCO 4.
build_language_model_data_for_mco(4)

In [None]:
# Single-MCO run for MCO 5; the `for mco_id in [...]` loop cell earlier in this notebook also covers MCO 5.
build_language_model_data_for_mco(5)

In [None]:
# Single-MCO run for MCO 6; the `for mco_id in [...]` loop cell earlier in this notebook also covers MCO 6.
build_language_model_data_for_mco(6)

In [None]:
# Single-MCO run for MCO 7; the `for mco_id in [...]` loop cell earlier in this notebook also covers MCO 7.
build_language_model_data_for_mco(7)

In [None]:
# Single-MCO run for MCO 8; the `for mco_id in [...]` loop cell earlier in this notebook also covers MCO 8.
build_language_model_data_for_mco(8)

In [None]:
# Single-MCO run for MCO 9; the `for mco_id in [...]` loop cell earlier in this notebook also covers MCO 9.
build_language_model_data_for_mco(9)

In [None]:
# Single-MCO run for MCO 12; the `for mco_id in [...]` loop cell earlier in this notebook also covers MCO 12.
build_language_model_data_for_mco(12)