In [1]:
import os
import sys
import re
import time
import random
import warnings
import collections
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

# Make modules under ../src importable; the path is relative to the kernel's
# working directory (printed below). Presumably cb_utils lives there.
sys.path.append('../src')
import cb_utils

# Plot styling and a wide column limit so wide frames are not truncated on display.
sns.set(style="darkgrid")
pd.options.display.max_columns = 500
# Show the working directory: '../src' above and the relative output path used
# later in this notebook are resolved against it.
print('pwd', os.getcwd())

# Reload edited modules automatically so changes to local helpers are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

pwd /Users/bp/workspace/cb/data-analytics/notebooks


### Try combining claim-level chunks that take the form
`ttlc claim_service_type cpt_1 dx1 dx2 dx3 cpt_2 dx4 dx5`

**Note:** the resulting texts were too big. Need to try limiting each text to a single year.

In [4]:
# Both queries are pyformat templates: the %(name)s placeholders are bound by
# cur.execute(query, params). They are deliberately plain (non-f) strings so
# that nothing is ever interpolated into the SQL text on the Python side.

# All member ids belonging to one MCO.
member_ids_query = "SELECT id FROM cb.members m where m.mco_id = %(mco_id)s"

# One row per claim line for a single member of a single MCO.
#
# * encounter_level: the distinct (member_id, mco_id, date_from) combinations
#   of the member's non-rx claims, skipping the listed service_type_ids
#   (the meaning of those ids is not visible here -- TODO document them).
# * lagged_encounters: adds prev_claim_date, the previous encounter date for
#   the member, via LAG ordered by date_from.
# * final SELECT: joins the encounters back to cb.claims on date/member/mco and
#   aggregates the distinct, alphabetically ordered diagnosis codes per row.
#   days_since_last_encounter is NULL for the member's first encounter.
#
# NOTE(review): `cd.mco_id = %(mco_id)s` in the outer WHERE discards rows where
# the LEFT JOIN found no diagnosis, so the join effectively behaves like an
# inner join -- confirm that dropping diagnosis-less claim lines is intended.
# NOTE(review): the outer SELECT does not repeat the service_type_id exclusion,
# so excluded-type claims on an encounter date are still returned -- confirm.
member_claims_query = """
  WITH
      encounter_level   AS ( SELECT
                                 member_id
                               , mco_id
                               , date_from
                             FROM
                                 cb.claims c
                                 LEFT JOIN ref.service_types st ON st.id = c.service_type_id
                             WHERE
                                   c.mco_id = %(mco_id)s
                               and c.member_id = %(member_id)s
                               AND c.service_type_id NOT IN (12, 13, 17, 18, 10, 15, 16)
                               AND NOT c.is_rx
                             GROUP BY 1,2,3
                             )
    , lagged_encounters AS ( SELECT
                                 el.*
                               , LAG(date_from) OVER (PARTITION BY member_id ORDER BY date_from) prev_claim_date
                             FROM
                                 encounter_level el )

  SELECT
    le.member_id
  , c.date_from
  , date_part('year', c.date_from) yr
  , c.source_claim_id
  , c.claim_line_id
  , c.procedure_code
  , c.service_type_id
  , le.date_from - prev_claim_date                                                                    days_since_last_encounter
  , ARRAY_AGG(DISTINCT cd.diag ORDER BY cd.diag) FILTER ( WHERE cd.diag IS NOT NULL) icds_by_alpha
FROM
    lagged_encounters le
    JOIN cb.claims c ON c.date_from = le.date_from and c.member_id = le.member_id and c.mco_id = le.mco_id
    LEFT JOIN cb.claims_diagnosis cd ON c.id = cd.claim_id
WHERE
       c.mco_id = %(mco_id)s
   and cd.mco_id = %(mco_id)s
   and c.member_id = %(member_id)s
   and not c.is_rx -- remove for ndc language model
GROUP BY
    1, 2, 3, 4, 5, 6, 7, 8
ORDER BY
    1, 2,3,4,5;
   """

In [5]:
# Inclusive upper bounds of the finite buckets, in ascending order. Each bound
# doubles as the suffix of the token that is emitted for its bucket.
_TTLC_UPPER_BOUNDS = (0, 1, 2, 5, 10, 15, 30, 60, 90, 180)


def get_days_cat(time_to_last_claim):
    """Bucket the gap since the previous encounter into a ``ttlc_*`` token.

    Args:
        time_to_last_claim: Non-negative number; the caller passes the
            query's ``days_since_last_encounter`` column.

    Returns:
        ``'ttlc_<bound>'`` for the smallest bound in ``_TTLC_UPPER_BOUNDS``
        that is >= the value, or ``'ttlc_gt180'`` when the value exceeds 180.

    Raises:
        ValueError: If the value is negative. (The original ``raise "..."``
            was not a valid Python 3 raise and surfaced as a confusing
            ``TypeError`` instead of the intended message.)
    """
    if time_to_last_claim < 0:
        raise ValueError(
            f"Got a negative time to last claim. should never happen: {time_to_last_claim!r}"
        )

    # Bounds are ascending, so the first one that fits is the tightest bucket.
    for upper_bound in _TTLC_UPPER_BOUNDS:
        if time_to_last_claim <= upper_bound:
            return f'ttlc_{upper_bound}'

    return 'ttlc_gt180'

In [11]:
def write_member_file(save_path, yr, mco_id, member_id, body):
    """Write one text body to ``{save_path}/{mco_id}_{yr}_{member_id}.txt``.

    Args:
        save_path: Directory that receives the file; created if missing.
        yr: Year component of the file name, interpolated as-is.
        mco_id: MCO id component of the file name.
        member_id: Member id component of the file name.
        body: Text to write; an existing file is overwritten.

    NOTE(review): if the database driver hands back ``yr`` as a float
    (e.g. ``2020.0``), the file name will contain ``.0`` -- confirm that
    downstream readers expect that before normalising it here.
    """
    # Create the output directory on demand so a fresh checkout does not fail
    # with FileNotFoundError on the first member.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    file_name = f'{save_path}/{mco_id}_{yr}_{member_id}.txt'

    # Explicit encoding keeps the output independent of the platform default.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(body)

In [16]:
def fetch_and_build_member_data(cur, mco_id, member_id, save_path):
    """Run the member claims query and write one token file per calendar year.

    Rows are consumed in the order the query returns them. For every row the
    following tokens are appended to the current year's text:

    * ``ttlc_*``  -- once per new ``date_from`` that has a non-NULL gap,
    * ``st_<id>`` -- once per claim (only on ``claim_line_id == 1``),
    * ``cpt_<code>`` -- when a procedure code is present,
    * ``icd_<code>`` -- one per distinct diagnosis code, in query order.

    Whenever the year changes, the accumulated text is written through
    ``write_member_file`` and a new text is started. Every token is preceded
    by a single space, so a written body starts with a space.

    Args:
        cur: DB-API cursor; it is re-executed, so any pending result set on
            it must already have been consumed by the caller.
        mco_id: MCO id bound into the query and used in the file name.
        member_id: Member id bound into the query and used in the file name.
        save_path: Directory handed to ``write_member_file``.
    """
    cur.execute(member_claims_query, {'mco_id': mco_id, 'member_id': member_id})

    tokens = []
    current_yr = None
    prev_date_from = None

    # The row's own member id is ignored: the query is already filtered to
    # `member_id`, and unpacking into that name would shadow the parameter.
    for _row_member_id, date_from, yr, _source_claim_id, claim_line_id, cpt, st_id, ttlc, icds_by_alpha in cur:
        if current_yr is None:
            current_yr = yr

        if yr != current_yr:
            # Year boundary: flush what was collected for the previous year.
            if tokens:
                write_member_file(save_path, current_yr, mco_id, member_id, ' ' + ' '.join(tokens))
            current_yr = yr
            tokens = []

        # Emit the gap token once per encounter date; ttlc is None when the
        # query found no previous encounter.
        if date_from != prev_date_from and ttlc is not None:
            tokens.append(get_days_cat(ttlc))
            prev_date_from = date_from

        # add the service type once per claim
        if claim_line_id == 1 and st_id is not None:
            tokens.append('st_' + str(st_id))

        if cpt is not None:
            tokens.append('cpt_' + cpt)

        # The aggregate is NULL (None) when a row has no non-NULL diagnosis;
        # treat that like an empty list instead of crashing in dict.fromkeys.
        # dict.fromkeys de-duplicates while keeping the incoming order.
        tokens.extend('icd_' + icd for icd in dict.fromkeys(icds_by_alpha or ()))

    if tokens:
        write_member_file(save_path, current_yr, mco_id, member_id, ' ' + ' '.join(tokens))

In [17]:
def build_language_model_data_for_mco(mco_id, save_path='./data/service_type_cpt_icds'):
    """Export the language-model text files for every member of one MCO.

    Opens a connection using the ``db_connection_string_ro`` secret, lists the
    MCO's member ids and calls ``fetch_and_build_member_data`` for each one.

    Args:
        mco_id: MCO whose members are exported.
        save_path: Output directory handed to ``fetch_and_build_member_data``.
            Defaults to the location the notebook has always used.

    The cursor and the connection are closed even when a member fails, so an
    interrupted or crashing run does not leak database connections.
    """
    conn = cb_utils.get_conn(cb_utils.get_secrets()['db_connection_string_ro'])
    try:
        cur = conn.cursor()
        try:
            cur.execute(member_ids_query, {'mco_id': mco_id})

            # Materialise the ids before looping: the same cursor is re-executed
            # for every member, so its pending result set must be read first.
            member_ids = [row[0] for row in cur]

            for member_id in tqdm(member_ids):
                fetch_and_build_member_data(cur, mco_id, member_id, save_path)
        finally:
            cur.close()
    finally:
        conn.close()

In [18]:
# build_language_model_data_for_mco(1)

In [19]:
# Export every MCO in a single pass; each call opens (and closes) its own
# database connection. This is the long-running cell of the notebook.
# Earlier partial runs used the subsets [8,9,12] and [5,6,7,8,9,12].
for mco_id in [2,4,5,6,7,8,9,12]:
    print('Pulling mco: ', mco_id)
    build_language_model_data_for_mco(mco_id)

Pulling mco:  2


100%|█████████████████████████████████████| 29753/29753 [54:58<00:00,  9.02it/s]


Pulling mco:  4


100%|█████████████████████████████████████| 14519/14519 [31:00<00:00,  7.81it/s]


Pulling mco:  5


100%|███████████████████████████████████| 22469/22469 [1:07:44<00:00,  5.53it/s]


Pulling mco:  6


100%|███████████████████████████████████████| 7929/7929 [22:00<00:00,  6.00it/s]


Pulling mco:  7


100%|█████████████████████████████████████| 14045/14045 [22:56<00:00, 10.20it/s]


Pulling mco:  8


100%|█████████████████████████████████████| 17679/17679 [33:20<00:00,  8.84it/s]


Pulling mco:  9


100%|███████████████████████████████████████| 2185/2185 [05:54<00:00,  6.16it/s]


Pulling mco:  12


100%|███████████████████████████████████████| 7851/7851 [21:36<00:00,  6.05it/s]
