## Doctor Right


#### Import Library

In [0]:
# Install keras, optree and tensorflow into the environment of the running kernel.
# NOTE(review): versions are unpinned, so a fresh run may resolve different releases — consider pinning.
%pip install keras optree tensorflow

In [0]:
import sys
from pathlib import Path

sys.path.append("../modules")
from eda import EDAAnalyzer
from spark_session import SparkManager
from feature_engineering import FeatureEngineer

In [0]:
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# modules before each cell executes so edits to the local modules under
# ../modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

#### Constants and config

In [0]:
# --- Input locations (sample parquet exports) ---
mx_submits_path = "../data_sample/mx_submits.parquet/"
mx_submits_line_path = "../data_sample/mx_submitsline.parquet/"

# Key passed to SparkManager to select the cohort.
cohort_key = "767ef4cac69e8a0c77384f6e1414364b"

# Patient id used near the end of the notebook to pull one patient's rows.
sample_patient_id = (
    "8aad41f612a7095449888c8050abaeb05fdee65643caa3033542610421d8bd1daaa2c4ce1757401003a1bbcd60948a7aa13eba507a676dea80e0cf76b77dbc95"
)

# Columns retained for feature engineering; commented-out entries are currently disabled.
features_cols = [
    "secondary_payer_state",
    "billing_provider_address_precision",
    "billing_provider_address_region",
    "claim_filing_indicator_pay_type",
    "claim_institutional_or_professional",
    "facility_provider_address_precision",
    "facility_provider_address_region",
    "inpatient_discharge_status_code",
    "organization_npi_type_code",
    "organization_sourced_from",
    "organization_taxonomy_group",
    "patient_gender",
    "patient_id",
    "patient_location_residential_region",
    "primary_payer_pay_type",
    "primary_payer_plan_type",
    "principal_diagnosis_body_part",
    "principal_diagnosis_category",
    "principal_diagnosis_code_set",
    "principal_procedure_code_set",
    "referring_provider_taxonomy_group",
    "rendering_provider_npi_type_code",
    "rendering_provider_taxonomy_group",
    "secondary_payer_claim_filing_indicator_code",
    "secondary_payer_pay_type",
    # "secondary_payer_plan_type",
    "claim_all_diagnosis_codes",
    "claim_total_charge_amount",
    "previous_diagnosis_ohe",
    # "claim_all_diagnosis_ohe",
]

# Columns passed as exclude_cols to FeatureEngineer.preprocess_data.
exclude_cols = ["patient_id"]

# Placeholder; overwritten in the EDA section once the frequent diagnosis codes are known.
most_repeated_diagnosis_list = []

### Spark Session

In [0]:
# Create the SparkManager for the MX submits data, selected by the cohort key from the config cell.
mx_submits_spark_manager = SparkManager(cohort_key=cohort_key)
# NOTE(review): the line-level manager below is disabled, yet the "MX SUBMITS LINE" section still
# references mx_submits_line_spark_manager — that section raises NameError on a fresh kernel.
# mx_submits_line_spark_manager = SparkManager(mx_submits_line_path)

### MX SUBMITS

#### EDA

In [0]:
# Wrap the MX submits Spark manager in an EDAAnalyzer; the exploration cells below all go through it.
mx_submits_eda = EDAAnalyzer(mx_submits_spark_manager)

In [0]:
# Preview the data (presumably the first rows — see EDAAnalyzer.display_head in the eda module).
mx_submits_eda.display_head()

In [0]:
# Report the dataset dimensions (presumably rows x columns — see EDAAnalyzer.display_shape).
mx_submits_eda.display_shape()

#### Type conversion

In [0]:
# Cast the charge amount column to float before profiling it.
# NOTE(review): the return value is ignored, so this presumably updates the analyzer's data in place — confirm.
mx_submits_eda.convert_columns_to_float(["claim_total_charge_amount"])

In [0]:
# Build the per-column summary for MX submits and persist it for offline review.
column_info_submits = mx_submits_eda.display_column_info()
# to_csv does not create missing parent directories, so make sure ../output exists
# (e.g. on a fresh checkout) before writing.
Path("../output").mkdir(parents=True, exist_ok=True)
column_info_submits.to_csv("../output/column_info_submits.csv")
column_info_submits

In [0]:
# Show only the summary row for claim_total_charge_amount, the column converted to float above.
column_info_submits[column_info_submits["Column Name"]=="claim_total_charge_amount"]

In [0]:
# Plot a percentile-based cutoff for the diagnosis-code column.
# NOTE(review): 90 is presumably the percentile and cutoff_length=5 the diagnosis code length of
# interest — both are magic numbers; confirm against EDAAnalyzer and consider naming them in the config cell.
mx_submits_eda.plot_percentile_based_cutoff("claim_all_diagnosis_codes",90,cutoff_length=5)

In [0]:
# Repeat counts for the values in claim_all_diagnosis_codes (no explicit n is passed, so the method's default applies).
# The cells below rely on the result having "diagnosis_code", "diagnosis_code_length" and "count" columns.
mx_submits_claim_all_diagnosis_codes_repeat_count = mx_submits_eda.get_top_n_repeated_values("claim_all_diagnosis_codes")
mx_submits_claim_all_diagnosis_codes_repeat_count

In [0]:
# Keep the diagnosis codes that are exactly 5 characters long and occur more than 461 times,
# then collect those codes into most_repeated_diagnosis_list.
repeat_counts = mx_submits_claim_all_diagnosis_codes_repeat_count
has_length_five = repeat_counts["diagnosis_code_length"] == 5
exceeds_count_threshold = repeat_counts["count"] > 461
most_repeated_diagnosis = repeat_counts[has_length_five & exceeds_count_threshold]
most_repeated_diagnosis_list = most_repeated_diagnosis["diagnosis_code"].tolist()
most_repeated_diagnosis

In [0]:
# Codes of any length that occur at least 531 times.
# Note: this rebinds most_repeated_diagnosis; most_repeated_diagnosis_list keeps the earlier selection.
most_repeated_diagnosis = mx_submits_claim_all_diagnosis_codes_repeat_count.loc[
    mx_submits_claim_all_diagnosis_codes_repeat_count["count"] >= 531
]
most_repeated_diagnosis

 - (ICD10, Z0001, 1): Encounter for general adult medical examination with abnormal findings
 - (ICD10, F4323, 1): Adjustment disorder with mixed anxiety and depressed mood
 - (ICD10, Z452, 1): Encounter for adjustment and management of vascular access device
 - (ICD10, J441, 1): Chronic obstructive pulmonary disease with acute exacerbation

In [0]:
# Same repeat-count query as above with a second argument of 5 (presumably n, i.e. the top 5 values — confirm).
mx_submits_eda.get_top_n_repeated_values("claim_all_diagnosis_codes",5)

In [0]:
# Fill counts for each unique value of type_of_bill_facility_description
# (presumably how well-populated the other columns are per value — see EDAAnalyzer).
type_of_bill_facility_description_eda = mx_submits_eda.get_fill_counts_for_unique_values("type_of_bill_facility_description")
type_of_bill_facility_description_eda

In [0]:
# Narrow the fill-count table to the rows whose "Column" is facility_provider_address_city.
type_of_bill_facility_description_eda[type_of_bill_facility_description_eda["Column"]=='facility_provider_address_city']

### MX SUBMITS LINE

#### EDA

In [0]:
# Wrap the line-level Spark manager in an EDAAnalyzer for the exploration cells in this section.
# NOTE(review): mx_submits_line_spark_manager is never assigned in this notebook (its only
# definition is a commented-out line in the Spark Session cell), so this raises NameError on
# Restart & Run All. Restore that definition or disable this section.
mx_submits_line_eda = EDAAnalyzer(mx_submits_line_spark_manager)

In [0]:
# Preview the line-level data (presumably the first rows — see EDAAnalyzer.display_head).
mx_submits_line_eda.display_head()

In [0]:
# Report the dimensions of the line-level data (presumably rows x columns — see EDAAnalyzer.display_shape).
mx_submits_line_eda.display_shape()

In [0]:
# Build the per-column summary for the line-level data and persist it for offline review.
column_info_submits_line = mx_submits_line_eda.display_column_info()
# to_csv does not create missing parent directories, so make sure ../output exists
# (e.g. on a fresh checkout) before writing.
Path("../output").mkdir(parents=True, exist_ok=True)
column_info_submits_line.to_csv("../output/column_info_submits_line.csv")
column_info_submits_line

### Feature Engineering

In [0]:
# Create the FeatureEngineer on top of the same Spark manager used for the MX submits EDA.
# All feature-engineering cells below call methods on this object.
mx_submits_fe=FeatureEngineer(mx_submits_spark_manager)

In [0]:
# Dimensions before any feature-engineering step runs; compare with the display_shape() call at the end of the notebook.
mx_submits_fe.display_shape()

In [0]:
# Transform the claim_all_diagnosis_codes column (details live in FeatureEngineer).
# NOTE(review): the return value is ignored, so this presumably mutates the engineer's data in place —
# if so, re-running the cell may not be idempotent; confirm.
mx_submits_fe.transform_claim_all_diagnosis_codes()

In [0]:
# Add the comorbidity feature; per the method name it is an exponentially decayed sparse vector
# (presumably the source of the previous_diagnosis_ohe column listed in features_cols — confirm).
mx_submits_fe.add_comorbidities_with_exponential_decay_sparse_vector()

In [0]:
# Add the continuous_visit_years feature, then show the top rows for that column to sanity-check it.
# NOTE(review): continuous_visit_years is not in features_cols, so the retain_columns(features_cols) call further down presumably drops it — confirm that is intended.
mx_submits_fe.add_continuous_visit_years()
mx_submits_fe.display_top_rows_as_pandas("continuous_visit_years")

In [0]:
# mx_submits_fe.remove_diagnosis_codes(most_repeated_diagnosis_list)

In [0]:
# Restrict the data to the columns listed in features_cols from the config cell
# (presumably every other column is dropped — see FeatureEngineer.retain_columns).
mx_submits_fe.retain_columns(features_cols)

In [0]:
# Cast the charge amount to float on the feature-engineering side as well, then run preprocessing
# with patient_id excluded (exclude_cols comes from the config cell).
mx_submits_fe.convert_columns_to_float(["claim_total_charge_amount"])
# Note: the result shares its name with the method that produced it.
preprocess_data = mx_submits_fe.preprocess_data(exclude_cols=exclude_cols)
preprocess_data

In [0]:
# Inspect the columns present after preprocessing and fetch the engineered feature columns.
columns_df = mx_submits_fe.get_columns_as_pandas_df()
# NOTE(review): this list is neither assigned nor the cell's last expression, so it is computed
# and silently discarded — assign or display it if it was meant to be shown.
columns_df['Column Names'].to_list()
# NOTE(review): feature_cols (here) is easy to confuse with features_cols (config cell); they are different objects.
feature_cols = mx_submits_fe.get_feature_columns()
feature_cols

In [0]:
# mx_submits_fe.display_head()

In [0]:
# mx_submits_fe.reduce_dataframe_size()
# Train the autoencoder (presumably on the preprocessed features — see FeatureEngineer.train_autoencoder).
# NOTE(review): no random seed is set anywhere in this notebook; if training is stochastic,
# results will differ between runs.
mx_submits_fe.train_autoencoder()

In [0]:
# Plot the feature correlation heatmap and persist the returned correlation table.
corr_features = mx_submits_fe.plot_correlation_heatmap()
# to_csv does not create missing parent directories, so make sure ../output exists
# (e.g. on a fresh checkout) before writing.
Path("../output").mkdir(parents=True, exist_ok=True)
corr_features.to_csv("../output/feature_correlation.csv")

In [0]:
# Show the first 20 rows of the sorted feature correlations (sort order is defined by FeatureEngineer).
mx_submits_fe.get_sorted_feature_correlations().head(20)

In [0]:
# NOTE(review): this cell is an exact duplicate of the previous one and recomputes the same table.
mx_submits_fe.get_sorted_feature_correlations().head(20)

In [0]:
# Pull every row for the sample patient defined in the config cell and persist it for inspection.
sample_patient_info = mx_submits_fe.get_rows_by_column_value("patient_id",sample_patient_id)
# to_csv does not create missing parent directories, so make sure ../output exists
# (e.g. on a fresh checkout) before writing.
Path("../output").mkdir(parents=True, exist_ok=True)
# NOTE(review): this writes patient-level rows to disk — keep ../output out of version control.
sample_patient_info.to_csv("../output/sample_patient_info.csv")
sample_patient_info

In [0]:
# Re-check the dimensions after the feature-engineering steps above, for comparison with the initial display_shape() call.
mx_submits_fe.display_shape()

In [0]:
# NOTE(review): autoreload mode 2 is already enabled in the setup cell near the top of the
# notebook; this trailing repeat looks like a leftover and can probably be removed.
%autoreload 2