In [None]:
import os
import sys
import time
import random
import warnings
import collections
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

# Make the project-local helper module under ../src importable.
sys.path.append('../src')
import cb_utils

# Plot theme, and allow up to 500 columns when a frame is displayed.
sns.set(style="darkgrid")
pd.options.display.max_columns = 500

%load_ext autoreload
%autoreload 2

In [None]:
# One workbook, three sheets: ICD, CPT and NDC suspecting rules.
path = '/Users/bp/Downloads/all_suspect_logic.xlsx'
icd_suspects, cpt_suspects, ndc_suspect = (
    pd.read_excel(path, sheet_name=sheet, header=0)
    for sheet in ('simple_icd_match', 'simple_cpt', 'drug_level_ndc')
)

In [None]:
icd_suspects.head()
# icd_suspects.loc[icd_suspects.melinda_edits != 'REMOVE'].head()

In [None]:
cpt_suspects.head()

In [None]:
ndc_suspect.head(10)

In [None]:
# ndc_suspect.dtypes

In [None]:
ndc_suspect.loc[~ndc_suspect.melinda_flag_to_remove].head()

In [None]:
# Database engine for the 'msh_analytics' source; the uploads in the following cells write through it.
eng = cb_utils.get_engine(source='msh_analytics')

In [None]:
# Replace ref.hcc_icd_suspects with the ICD rules, leaving out rows marked 'REMOVE'.
icd_rows_to_keep = icd_suspects.melinda_edits != 'REMOVE'
icd_suspects.loc[icd_rows_to_keep].to_sql(
    'hcc_icd_suspects', eng, schema='ref', if_exists='replace', index=False,
)

In [None]:
# Replace ref.hcc_cpt_suspects with the full, unfiltered CPT rule sheet.
cpt_suspects.to_sql(
    name='hcc_cpt_suspects',
    con=eng,
    schema='ref',
    if_exists='replace',
    index=False,
)

In [None]:
# Replace ref.hcc_ndc_suspects with the NDC rules that are not flagged for removal.
ndc_rows_to_keep = ~ndc_suspect.melinda_flag_to_remove
ndc_suspect.loc[ndc_rows_to_keep].to_sql(
    'hcc_ndc_suspects', eng, schema='ref', if_exists='replace', index=False,
)

# Import External EMR data

In [None]:
def format_icd10(df, col):
    """Normalize the ICD-10 codes in ``df[col]`` in place and return ``df``.

    Every non-null value is cast to ``str``, has its dots removed, and is
    lower-cased and stripped of surrounding whitespace. Null entries stay
    null instead of becoming the literal text ``'nan'`` / ``'None'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    col : str
        Name of the column to normalize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    raw = df[col]
    cleaned = raw.astype(str).str.replace('.', '', regex=False).str.lower().str.strip()
    # astype(str) stringifies missing values too; blank those positions out again.
    df[col] = cleaned.where(raw.notna())
    return df

In [None]:
# Directory of external EMR extracts; only files from the 20211020 drop are picked up.
base_dir = '/Users/bp/msh/external_emr_data/'
# Skip '~$' files: Excel writes those lock files next to a workbook that is open,
# and they cannot be read as workbooks.
file_names = [f for f in os.listdir(base_dir) if '20211020' in f and '~$' not in f]
file_names

In [None]:
# Load each selected EMR workbook into its own junk.external_emr_data_<file name> table.
eng = cb_utils.get_engine(source='msh_analytics')  # one engine serves every file
emr_cols = ['golgi_id', 'patient_name_last', 'patient_name_first', 'dob', 'location', 'captured_icd_10', 'date_captured', 'recapture_icd_10', 'date_last_captured', 'suspect_icd_10', 'notes']
icd_cols = ['captured_icd_10', 'recapture_icd_10', 'suspect_icd_10']

for file_name in file_names:
    emr_data = pd.read_excel(os.path.join(base_dir, file_name), sheet_name='Sheet1', header=0)
    # .copy() gives format_icd10 an independent frame to write into, rather than a
    # slice of the raw sheet (which triggers SettingWithCopyWarning).
    emr_data = emr_data[emr_cols].copy()
    for col in icd_cols:
        emr_data = format_icd10(emr_data, col)

    # Table suffix is the file name without its '.xlsx' extension.
    table = file_name.replace('.xlsx', '')
    emr_data.to_sql(f'external_emr_data_{table}', eng, schema='junk', if_exists='replace', method='multi', index=False)

In [None]:
emr_data.head(100)

In [None]:
emr_data.replace({np.nan: None}).head()

In [None]:
# Read the 20211006 Medina extract from the current base_dir.
file_name = '20211006_medina.xlsx'
medina_workbook = f'{base_dir}/{file_name}'
emr_data = pd.read_excel(medina_workbook, sheet_name='Sheet1', header=0)

In [None]:
emr_data.head()

In [None]:
# Spreadsheet header -> snake_case column name.
# Alternative header seen for the last entry: 'Historical ICD10-before 2021'.
name_mapping = {
    'Patient Name': 'patient_name',
    'Golgi ID': 'golgi_id',
    'IT Upload Date': 'it_upload_date',
    'DOB': 'dob',
    'Location': 'location',
    'Insurance': 'insurance',
    'Billed ICD10s 2021': 'billed_icd10_2021',
    'DOS': 'dos',
    'Suspected ICD10': 'suspected_icd10',
    'DOS of Suspected': 'date_of_suspected_icd10',
    'Historical ICD10': 'historical_icd10',
}
emr_data = emr_data.rename(name_mapping, axis='columns')
emr_data.head()

In [None]:
def format_icd10(df, col):
    """Normalize the ICD-10 codes in ``df[col]`` in place and return ``df``.

    This redefinition replaces the helper of the same name defined earlier in
    the notebook. Values are cast to ``str`` first so that columns that are
    not string-typed (all-null or numeric) do not make the ``.str`` accessor
    raise. Dots are then removed and the text is lower-cased and stripped.
    Null entries stay null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    col : str
        Name of the column to normalize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    raw = df[col]
    cleaned = raw.astype(str).str.replace('.', '', regex=False).str.lower().str.strip()
    # Keep missing values missing rather than the text 'nan' / 'None'.
    df[col] = cleaned.where(raw.notna())
    return df

In [None]:
# ICD-10 columns of the Medina extract that get normalized before upload.
icd_cols = [
    'suspected_icd10',
    'billed_icd10_2021',
    'historical_icd10',
    'missed_icd10',
]
for icd_col in icd_cols:
    emr_data = format_icd10(emr_data, icd_col)

In [None]:
emr_data.head()
list(emr_data.columns)

In [None]:
# Columns of emr_data that are uploaded to raw.external_emr_data.
# NOTE(review): 'missed_icd10' and 'dos_of_missed' are not targets of the rename
# mapping earlier in the notebook; presumably the sheet already uses those
# names — TODO confirm.
cols = [
 'patient_name',
 'golgi_id',
 'location',
 'insurance',
 'billed_icd10_2021',
 'dos',
 'suspected_icd10',
#  'date_of_suspected_icd10',
 'historical_icd10',
 'missed_icd10',
 'dos_of_missed',
]

In [None]:
# Append the selected columns to raw.external_emr_data; every run of this cell adds the rows again.
eng = cb_utils.get_engine(source='msh_analytics')
emr_upload = emr_data[cols]
emr_upload.to_sql(
    'external_emr_data', eng, schema='raw', if_exists='append', method='multi', index=False,
)

# Import Labs

In [None]:
def rename_lab_cols(labs):
    """Return a copy of ``labs`` whose mixed-case lab columns are lower-cased.

    Only the three columns named below are renamed; every other column keeps
    its name. The input frame is not modified.
    """
    mixed_case_cols = (
        "creatinine_eGFR_nonafricn_am",
        "creatinine_eGFR_africn_am",
        "hemoglobin_A1c",
    )
    return labs.rename(columns={name: name.lower() for name in mixed_case_cols})

In [None]:
# Lab workbooks from the 20211020 drop, ignoring any '~$' files.
base_dir = '/Users/bp/msh/labs/'
file_names = list(
    filter(lambda name: '20211020' in name and '~$' not in name, os.listdir(base_dir))
)
file_names

In [None]:
# Reshape each lab workbook to one row per (patient, collection date, lab type)
# and append it to raw.hcc_suspecting_lab_data. Re-running this cell appends the rows again.
eng = cb_utils.get_engine(source='msh_analytics')  # one engine serves every file
lab_id_cols = ['patient_id', 'patient_first_name', 'patient_last_name', 'date_of_birth', 'date_of_lab_collection']

for file_name in file_names:
    lab_data = pd.read_excel(os.path.join(base_dir, file_name), sheet_name='next 8 days', header=0)
    lab_data = rename_lab_cols(lab_data)
    # Encode the HIV result as 0/1 before it is melted into lab_value.
    lab_data = lab_data.replace({'hiv_ag_ab_with_reflex': {'Non Reactive': 0, 'Reactive': 1}})
    # Every column that is not an identifier becomes a (lab_type, lab_value) row.
    lab_data = pd.melt(lab_data, id_vars=lab_id_cols, var_name='lab_type', value_name='lab_value', ignore_index=True)

    lab_data.to_sql('hcc_suspecting_lab_data', eng, schema='raw', if_exists='append', method='multi', index=False)

In [None]:
lab_data.dtypes

In [None]:
lab_data.head(100)

In [None]:
# lab_data is in long form after the melt, so the HIV results are the lab_value
# entries of the rows whose lab_type is 'hiv_ag_ab_with_reflex'.
lab_data.loc[lab_data.lab_type == 'hiv_ag_ab_with_reflex', 'lab_value'].value_counts()

# Old

In [None]:
# NDC crosswalk workbook; header=1 takes the column names from the sheet's second row.
base_dir = '/Users/bp/msh/suspecting_analysis'
ndc_workbook = f'{base_dir}/Xwalk_Master_Final_NDC.xlsx'
ndcs = pd.read_excel(ndc_workbook, sheet_name='RB NDC updates', header=1)

In [None]:
ndcs.head()

In [None]:
# The first column of the sheet is used as the HCC identifier; rows where it is null are dropped.
hcc_col = ndcs.columns[0]

# .copy() so that later column assignments write to an independent frame, not a filtered slice.
ndcs = ndcs.loc[ndcs[hcc_col].notna()].copy()

In [None]:
# One row per ICD-10 code: split each cell on newlines or commas, then explode.
ndcs['ICD10s'] = ndcs['ICD10s'].str.split('\n|,')
ndcs = ndcs.explode('ICD10s')
# Normalize first, then drop empties, so whitespace-only fragments (e.g. from ", \n")
# are removed as well instead of surviving as '' after the strip.
ndcs['ICD10s'] = ndcs['ICD10s'].str.replace('.', '', regex=False).str.lower().str.strip()
ndcs = ndcs.query("ICD10s != ''")

In [None]:
# Give the seven crosswalk columns short names, tidy the medication text and
# add a dash-free copy of the NDC.
xwalk_cols = ['hcc', 'hcc_descr', 'coefficient', 'icd10', 'med', 'ndc', 'non_hcc_codes']
ndcs.columns = xwalk_cols
ndcs = ndcs.assign(
    med=ndcs['med'].str.lower().str.strip(),
    ndc_code=ndcs['ndc'].str.replace('-', '', regex=False),
)

In [None]:
ndcs.head()

In [None]:
ndcs.shape

In [None]:
# Engine from cb_utils with no explicit source argument (the helper's default applies).
eng = cb_utils.get_engine()

In [None]:
# Write the crosswalk to junk.ndc_xwalk, replacing the table if it already exists.
ndcs.to_sql('ndc_xwalk', eng, schema='junk', if_exists='replace', index=False)

In [None]:
# Cleaned-up crosswalk; the 'ndc' column is passed through str() as it is read.
base_dir = '/Users/bp/msh/suspecting_analysis'
clean_workbook = f'{base_dir}/Xwalk_Master_Final_NDC_cleanedup.xlsx'
ndcs = pd.read_excel(
    clean_workbook,
    sheet_name='verified_ndc',
    header=0,
    converters={'ndc': str},
)

In [None]:
ndcs.head()

In [None]:
# Write the cleaned crosswalk to junk.ndc_xwalk_clean, replacing the table if it already exists.
ndcs.to_sql('ndc_xwalk_clean', eng, schema='junk', if_exists='replace', index=False)

In [None]:
# Pull the cb.ds_tall_wide rows whose ds_batch_id matches the one recorded on the cb.mcos row for mco_id.
# NOTE(review): mco_id and use_cache are not defined anywhere in the visible notebook;
# this cell needs them set beforehand — TODO confirm where they come from.
query = f"SELECT dtw.* FROM cb.ds_tall_wide dtw JOIN cb.mcos m ON m.id = {mco_id} AND m.ds_batch_id = dtw.ds_batch_id;"
ds_tall_wide = cb_utils.sql_query_to_df(query, use_cache=use_cache)
ds_tall_wide = ds_tall_wide.drop(columns=['created_at'])
# Replace None entries with 0; the `is None` test leaves any other value (including NaN) untouched.
ds_tall_wide.transplant_ddos = [0 if r is None else r for r in ds_tall_wide.transplant_ddos]

In [None]:
# Pull the cb.ds_vaps rows whose ds_batch_id matches the one recorded on the cb.mcos row for mco_id.
# NOTE(review): mco_id and use_cache are not defined anywhere in the visible notebook — TODO confirm.
query = f"SELECT v.* FROM cb.ds_vaps v JOIN cb.mcos m on m.id = {mco_id} and m.ds_batch_id = v.ds_batch_id;"
# query = "SELECT * FROM cb.ds_vaps v WHERE v.ds_batch_id = 18;"
vap = cb_utils.sql_query_to_df(query, use_cache=use_cache)