# Doctor Right - Model building

In [1]:
import sys
sys.path.append("../modules")
from eda import EDAAnalyzer
from spark_session import SparkManager
from feature_engineering import FeatureEngineer
from ml_developer import XGBoostModelBuilder
from ml_developer import MLPModelBuilder
from ml_developer import KMeansBuilder

In [2]:
# Load IPython's autoreload extension. Mode 2 reloads modules before each cell
# executes, so edits to the local helper modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

#### Constants and config

In [3]:
# --- Input data locations (relative to the notebook's working directory) ---
# Alternative input kept for reference:
# mx_submits_path = "../data_sample/mx_submits_all/"
mx_submits_path = "../data_sample/mx_submits.parquet/"
mx_submits_line_path = "../data_sample/mx_submitsline.parquet/"

# --- Identifiers ---
# NOTE(review): these look like hashed identifiers — confirm they are safe to commit.
cohort_key = "767ef4cac69e8a0c77384f6e1414364b"
sample_patient_id = "8aad41f612a7095449888c8050abaeb05fdee65643caa3033542610421d8bd1daaa2c4ce1757401003a1bbcd60948a7aa13eba507a676dea80e0cf76b77dbc95"

# --- Modelling columns ---
# Raw input columns kept for feature engineering.
features_cols = [
    "facility_provider_address_region",
    "patient_gender",
    "principal_diagnosis_body_part",
    "principal_diagnosis_category",
    "claim_all_diagnosis_codes",
    "previous_diagnosis_ohe",
]
# Regression target.
label_column = "claim_total_charge_amount"
# Columns passed as `exclude_cols` to FeatureEngineer.preprocess_data.
exclude_cols = ["patient_id"]
most_repeated_diagnosis_list = []

In [4]:
# Create the project's SparkManager for the mx_submits parquet path.
# NOTE(review): presumably this starts the Spark session and loads the data — confirm in spark_session.py.
mx_submits_spark_manager = SparkManager(mx_submits_path)

24/10/22 00:25:18 WARN Utils: Your hostname, Sureshs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.20.9.214 instead (on interface en0)
24/10/22 00:25:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/22 00:25:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

### Feature Engineering

In [5]:
# Feature-engineering wrapper; every transformation below is applied through it.
mx_submits_fe = FeatureEngineer(mx_submits_spark_manager)

In [6]:
# Derive the "continuous_visit_years" column, then display the count and
# percentage of rows for each of its values.
mx_submits_fe.add_continuous_visit_years()
mx_submits_fe.get_unique_value_counts("continuous_visit_years")

                                                                                

+----------------------+------+--------------------+
|continuous_visit_years| count|          percentage|
+----------------------+------+--------------------+
|                     1|224597|   90.80716763566676|
|                     2| 19491|   7.880436979954232|
|                     3|  2506|  1.0132048161595253|
|                     4|   530| 0.21428513669774474|
|                     5|   115|0.046495831547623864|
|                     6|    35|0.014150905253624654|
|                     7|    16|0.006468985258799842|
|                     9|     8|0.003234492629399921|
|                     8|     8|0.003234492629399921|
|                    10|     6|0.002425869472049...|
|                    11|     5| 0.00202155789337495|
|                    12|     4|0.001617246314699...|
|                    13|     4|0.001617246314699...|
|                    14|     3|0.001212934736024...|
|                    15|     3|0.001212934736024...|
|                    18|     1|4.0431157867499

In [7]:
# Drop rows with fewer than 2 continuous visit years (see the printed message).
# NOTE(review): 2 is a magic number — consider a named constant in the config cell.
mx_submits_fe.filter_by_continuous_visit_years(2)



Dataframe post removing less than 2 continuous visits - Shape: 22737 rows, 131 columns


                                                                                

In [8]:
# Add the comorbidity feature; the displayed preview shows the resulting
# "previous_diagnosis_ohe" vector column.
# NOTE(review): the exponential-decay weighting lives in FeatureEngineer — not visible here.
mx_submits_fe.add_comorbidities_with_exponential_decay_sparse_vector()

                                                                                

Unnamed: 0,previous_diagnosis_ohe
0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [9]:
# Keep only the raw feature columns plus the label.
mx_submits_fe.retain_columns([*features_cols, label_column])

In [10]:
# Cast the label to float before preprocessing. The shared `label_column`
# constant is used instead of repeating the literal column name, so changing
# the target in the config cell keeps this step in sync.
mx_submits_fe.convert_columns_to_float([label_column])
# One-hot encode the categorical columns and assemble the feature vector
# (see the log output); `exclude_cols` is passed through to preprocess_data.
preprocess_data = mx_submits_fe.preprocess_data(exclude_cols=exclude_cols)
preprocess_data

Casted claim_total_charge_amount to float


                                                                                

One-Hot Encoding applied successfully to column: facility_provider_address_region
One-Hot Encoding applied successfully to column: patient_gender
One-Hot Encoding applied successfully to column: principal_diagnosis_body_part
One-Hot Encoding applied successfully to column: principal_diagnosis_category
Assembling all features into a vector with 6 columns.


[Stage 98:>                                                         (0 + 1) / 1]

Preprocessing complete. Feature vector created.


                                                                                

In [11]:
# Names of the expanded feature columns used for modelling. The list is passed
# to expand_features, retain_columns, preprocess_features and the model
# builders further down this notebook.
# Entries that are commented out are not part of the list; they are kept here
# for reference only.
# NOTE(review): the `Diagnosis_<code>` entries look like ICD-10 diagnosis codes — confirm.
# NOTE(review): list order presumably determines feature-vector positions —
# check preprocess_features before reordering.
model_feature_col = [
    'principal_diagnosis_category_Factors influencing health status and contact with health services',
    # 'patient_location_residential_region_Northeast',
    'principal_diagnosis_category_Diseases of the circulatory system',
    'principal_diagnosis_category_Diseases of the respiratory system',
    'principal_diagnosis_category_Diseases of the musculoskeletal system and connective tissue',
    # 'principal_diagnosis_category_unknown',
    # 'patient_location_residential_region_West',
    'patient_gender_F',
    'principal_diagnosis_category_Endocrine nutritional and metabolic diseases',
    # 'principal_diagnosis_body_part_unknown',
    'principal_diagnosis_body_part_Spine',
    'principal_diagnosis_category_Diseases of the eye and adnexa',
    'principal_diagnosis_category_Diseases of the genitourinary system',
    'principal_diagnosis_category_Injury poisoning and certain other consequences of external causes',
    # 'patient_location_residential_region_South',
    # 'patient_location_residential_region_unknown',
    # 'facility_provider_address_region_unknown',
    'patient_gender_M',
    'principal_diagnosis_category_Symptoms signs and abnormal clinical laboratory findings not elsewhere classified',
    'principal_diagnosis_category_Mental Behavioral and Neurodevelopmental disorders',
    # 'patient_gender_U',
    # 'secondary_payer_state_unknown',
    'principal_diagnosis_category_Diseases of the nervous system',
    'Diagnosis_I10',
    # 'patient_location_residential_region_Midwest',
    'principal_diagnosis_body_part_Knee',
    'principal_diagnosis_category_Diseases of the skin and subcutaneous tissue',
    'principal_diagnosis_category_Diseases of the ear and mastoid process',
    'facility_provider_address_region_Northeast',
    'principal_diagnosis_body_part_Eye',
    'principal_diagnosis_category_Neoplasms',
    'facility_provider_address_region_South',
    'principal_diagnosis_body_part_Heart',
    'principal_diagnosis_body_part_Ear',
    'principal_diagnosis_body_part_Shoulder',
    'Diagnosis_E785',
    'principal_diagnosis_category_External causes of morbidity',
    'principal_diagnosis_category_Diseases of the digestive system',
    'facility_provider_address_region_West',
    'principal_diagnosis_body_part_Lung',
    'facility_provider_address_region_Midwest',
    'Diagnosis_N179',
    'Diagnosis_E119',
    'Diagnosis_R079',
    'Diagnosis_Z23',
    'principal_diagnosis_body_part_Hip',
    'Diagnosis_F200',
    'principal_diagnosis_category_Pregnancy childbirth and puerperium',
    'Diagnosis_Z87891',
    'principal_diagnosis_body_part_Foot',
    'Diagnosis_I129',
    'Diagnosis_F331',
    'Diagnosis_M109',
    'principal_diagnosis_category_Certain infections and parasitic diseases',
    'Diagnosis_J90',
    'principal_diagnosis_body_part_Leg non-joint',
    'Diagnosis_R000',
    'Diagnosis_R739',
    'Diagnosis_K219',
    'Diagnosis_Z951',
    'Diagnosis_R32',
    'principal_diagnosis_body_part_Foot and ankle',
    'Diagnosis_I509',
    'Diagnosis_E875',
    'Diagnosis_N281',
    # 'Diagnosis_S2242XA',
    'Diagnosis_I130',
    'principal_diagnosis_category_Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism',
    'principal_diagnosis_body_part_Hand',
    'Diagnosis_F17210',
    'Diagnosis_I214',
    'Diagnosis_Z931',
    'Diagnosis_Q909',
    'Diagnosis_I739',
    'Diagnosis_Z743',
    'Diagnosis_F418',
    'Diagnosis_F329',
    # 'secondary_payer_state_UT',
    'principal_diagnosis_body_part_Wrist',
    'Diagnosis_F17200',
    'Diagnosis_F209',
    'Diagnosis_M545',
    # 'secondary_payer_state_KY',
    'Diagnosis_E872',
    # 'secondary_payer_state_TX',
    'Diagnosis_A419',
    'principal_diagnosis_body_part_Finger',
    'Diagnosis_J189',
    'Diagnosis_Z794',
    'Diagnosis_I252',
    'Diagnosis_R262',
    'Diagnosis_D631',
    'Diagnosis_I82411',
    'Diagnosis_D638',
    'Diagnosis_R918',
    # 'secondary_payer_state_MO',
    'Diagnosis_N189',
    'Diagnosis_N186',
    'principal_diagnosis_category_Congenital malformations deformations and chromosomal abnormalities',
    'principal_diagnosis_body_part_Elbow',
    'Diagnosis_Z955',
    'Diagnosis_J810',
    'Diagnosis_I69322',
    'Diagnosis_I69351',
    'Diagnosis_R278',
    # 'Diagnosis_M479',
    # 'secondary_payer_state_GA',
    'Diagnosis_R279',
    # 'Diagnosis_S0101XA',
    # 'Diagnosis_S130XXA',
    'Diagnosis_F10229',
    'Diagnosis_I69959',
    'Diagnosis_D509',
    'Diagnosis_I361',
    'Diagnosis_N184',
    'Diagnosis_I110',
    'Diagnosis_M542',
    'Diagnosis_E669',
    'Diagnosis_G894',
    # 'Diagnosis_R578',
    # 'Diagnosis_S01112A',
    'Diagnosis_F840',
    'Diagnosis_Z00129',
    'Diagnosis_G309',
    'Diagnosis_G319',
    'Diagnosis_J441',
    # 'Diagnosis_S14125A',
    'Diagnosis_I712',
    # 'Diagnosis_S12500A',
    # 'Diagnosis_S240XXA',
    # 'Diagnosis_S12400A',
    # 'Diagnosis_S14123A',
    'Diagnosis_R202',
    'Diagnosis_K222',
    'Diagnosis_D649',
    'Diagnosis_Z452',
    # 'Diagnosis_V784XXA',
    'principal_diagnosis_category_Certain conditions originating in the perinatal period',
    'Diagnosis_K743',
    # 'Diagnosis_G904',
    'Diagnosis_J449',
    # 'Diagnosis_S0990XA',
    'Diagnosis_R620',
    'Diagnosis_Z789',
    # 'Diagnosis_S0191XA',
    'Diagnosis_E861',
    'Diagnosis_Z992',
    'Diagnosis_M549',
    'Diagnosis_I469',
    # 'Diagnosis_S1093XA',
    'Diagnosis_Z713',
    'Diagnosis_D72829',
    'Diagnosis_D62',
    # 'Diagnosis_M341',
    'Diagnosis_Z20822',
    'Diagnosis_R569',
    'Diagnosis_Z113',
    'Diagnosis_I447',
    'Diagnosis_E871',
    'Diagnosis_I480',
    'Diagnosis_N390',
    'principal_diagnosis_body_part_Arm non-joint',
    'principal_diagnosis_body_part_Ankle',
    'principal_diagnosis_body_part_Head',
    'Diagnosis_E440',
    # 'Diagnosis_R579',
    'Diagnosis_Z79899',
    'Diagnosis_M25551',
    'Diagnosis_R64',
    'Diagnosis_F251',
    'Diagnosis_H524',
    'principal_diagnosis_body_part_Toe',
    'Diagnosis_R55',
    'Diagnosis_Z993',
    'Diagnosis_Z95810',
    'Diagnosis_R634',
    'principal_diagnosis_body_part_Stomach',
    'Diagnosis_D508',
    'Diagnosis_R531',
    'principal_diagnosis_body_part_Various',
    'Diagnosis_H903',
    'Diagnosis_F39',
    # 'Diagnosis_S2191XA',
    'Diagnosis_X58XXXA',
    'Diagnosis_I120',
    'Diagnosis_M329',
    'Diagnosis_R54',
    'Diagnosis_Z139',
    'Diagnosis_J431',
    'Diagnosis_F250',
    'Diagnosis_C50511',
    # 'Diagnosis_S1091XA',
    # 'Diagnosis_L89310',
    'Diagnosis_F419',
    'Diagnosis_I959',
    'principal_diagnosis_body_part_Leg',
    # 'Diagnosis_S31119A',
    'Diagnosis_J309',
    'Diagnosis_E11621',
    'Diagnosis_N529',
    # 'Diagnosis_R402432',
    'Diagnosis_M25571',
    'Diagnosis_I253',
    'Diagnosis_N939',
    'Diagnosis_S31020A',
    'Diagnosis_N401',
    'Diagnosis_R69',
    'Diagnosis_Z95828',
    # 'secondary_payer_state_MA',
    'Diagnosis_R410',
    'Diagnosis_R600',
    'Diagnosis_E782',
    'Diagnosis_R52',
    'Diagnosis_M546',
    'Diagnosis_Z888'
]


In [12]:
# Sanity check: number of active (uncommented) feature names.
len(model_feature_col)

176

In [13]:
# Create one named column per entry in model_feature_col; the log lists each
# "Created OHE column" together with its index.
mx_submits_fe.expand_features(model_feature_col)

Created OHE column: facility_provider_address_region_Northeast (index: 1)
Created OHE column: facility_provider_address_region_South (index: 2)
Created OHE column: facility_provider_address_region_Midwest (index: 3)
Created OHE column: facility_provider_address_region_West (index: 4)
Created OHE column: patient_gender_F (index: 0)
Created OHE column: patient_gender_M (index: 1)
Created OHE column: principal_diagnosis_body_part_Spine (index: 1)
Created OHE column: principal_diagnosis_body_part_Eye (index: 2)
Created OHE column: principal_diagnosis_body_part_Knee (index: 3)
Created OHE column: principal_diagnosis_body_part_Ear (index: 4)
Created OHE column: principal_diagnosis_body_part_Shoulder (index: 5)
Created OHE column: principal_diagnosis_body_part_Heart (index: 6)
Created OHE column: principal_diagnosis_body_part_Hip (index: 7)
Created OHE column: principal_diagnosis_body_part_Foot (index: 8)
Created OHE column: principal_diagnosis_body_part_Foot and ankle (index: 9)
Created OHE 

In [14]:
# Keep only the expanded model features plus the label.
mx_submits_fe.retain_columns([*model_feature_col, label_column])

In [15]:
# Inspect the columns that remain after the retain step.
mx_submits_fe.dataframe.columns

['principal_diagnosis_category_Factors influencing health status and contact with health services',
 'principal_diagnosis_category_Diseases of the circulatory system',
 'principal_diagnosis_category_Diseases of the respiratory system',
 'principal_diagnosis_category_Diseases of the musculoskeletal system and connective tissue',
 'patient_gender_F',
 'principal_diagnosis_category_Endocrine nutritional and metabolic diseases',
 'principal_diagnosis_body_part_Spine',
 'principal_diagnosis_category_Diseases of the eye and adnexa',
 'principal_diagnosis_category_Diseases of the genitourinary system',
 'principal_diagnosis_category_Injury poisoning and certain other consequences of external causes',
 'patient_gender_M',
 'principal_diagnosis_category_Symptoms signs and abnormal clinical laboratory findings not elsewhere classified',
 'principal_diagnosis_category_Mental Behavioral and Neurodevelopmental disorders',
 'principal_diagnosis_category_Diseases of the nervous system',
 'Diagnosis_I

In [16]:
# Build the modelling DataFrame. Per the displayed schema it has a `features`
# vector column and the float label column.
model_data = mx_submits_fe.preprocess_features(model_feature_col, label_column)
model_data

DataFrame[features: vector, claim_total_charge_amount: float]

# Model Training

## XGB Model

In [17]:
# XGBoost builder over the modelling DataFrame, the feature names and the label.
xgb_model = XGBoostModelBuilder(model_data, model_feature_col, label_column)

In [18]:
# Split the data into train and test sets.
# NOTE(review): split ratio and seed are set inside XGBoostModelBuilder — not visible here.
train_df, test_df = xgb_model.split_data()

In [19]:
# Train the model, then save it. The "_cont_2" suffix presumably refers to the
# 2-continuous-visit-years filter applied earlier — TODO confirm.
xgb_model.train_model()
xgb_model.save_model("../output/model/XGB_model_cont_2")

24/10/18 02:52:49 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
2024-10-18 02:52:49,585 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 1 workers with
	booster params: {'objective': 'reg:squarederror', 'device': 'cpu', 'max_depth': 3, 'eta': 0.1, 'num_round': 100, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
[02:52:59] task 0 got new rank 0                                    (0 + 1) / 1]
Parameters: { "num_round" } are not used.

2024-10-18 02:53:01,123 INFO XGBoost-PySpark: _fit Finished xgboost training!   


Model 'XGB_model' saved to ../output/model/XGB_model_cont_2


In [20]:
# Optional: reload a previously saved model instead of retraining (left disabled).
# NOTE(review): this path ("../output/model/XGB_model") differs from the path passed
# to save_model in the training cell ("../output/model/XGB_model_cont_2") — confirm
# which one is intended before enabling.
# xgb_model = xgb_model.load_model(model_data, model_feature_col, label_column, xgb_model.model_name,path="../output/model/XGB_model")

In [21]:
# Evaluate on the training split; a single score is displayed.
# NOTE(review): the metric is not named here — presumably RMSE, given the
# reg:squarederror objective in the training log; confirm in XGBoostModelBuilder.
xgb_model.evaluate_model(type="Train")

24/10/18 02:53:03 WARN DAGScheduler: Broadcasting large task binary with size 1094.4 KiB
INFO:XGBoost-PySpark:Do the inference on the CPUs                  (0 + 8) / 12]
2024-10-18 02:53:06,515 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:07,259 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:07,359 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:07,508 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:07,648 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:07,932 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:08,366 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:09,013 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:09,638 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:09,740 INFO XGBoost-PySpark: p

8378.840657194729

In [22]:
# Evaluate on the held-out test split, using the same metric as the Train call.
xgb_model.evaluate_model(type="Test")

24/10/18 02:53:13 WARN DAGScheduler: Broadcasting large task binary with size 1094.4 KiB
2024-10-18 02:53:14,651 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:14,832 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:16,715 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:17,647 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:18,192 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                  (5 + 7) / 12]
2024-10-18 02:53:19,097 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:20,259 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:20,474 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:20,658 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:21,181 INFO XGBoost-PySpark: p

6872.892209855708

In [23]:
# Mean absolute percentage error on the training split.
# NOTE(review): the displayed value is very large for a percentage — check for
# near-zero label values, and consider reporting the Test split as well.
xgb_model.calculate_mape(type="Train")

24/10/18 02:53:24 WARN DAGScheduler: Broadcasting large task binary with size 1089.2 KiB
2024-10-18 02:53:25,594 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:25,626 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:25,832 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:26,132 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:26,347 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:26,545 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:26,740 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:28,045 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:29,190 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:29,446 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-10-18 02:53:29,452 INFO XGBoost-PySp

12575.212458085285

In [24]:
# Show (feature name, score) pairs; the displayed list is in descending score order.
# NOTE(review): the importance type (weight/gain/cover) is chosen inside the builder — confirm.
xgb_model.feature_importance()

[('Diagnosis_F331', 83.0),
 ('patient_gender_F', 54.0),
 ('Diagnosis_X58XXXA', 39.0),
 ('Diagnosis_D638', 32.0),
 ('principal_diagnosis_category_Diseases of the circulatory system', 28.0),
 ('patient_gender_M', 27.0),
 ('principal_diagnosis_category_Factors influencing health status and contact with health services',
  25.0),
 ('principal_diagnosis_category_Pregnancy childbirth and puerperium', 18.0),
 ('principal_diagnosis_body_part_Leg non-joint', 17.0),
 ('Diagnosis_I739', 17.0),
 ('Diagnosis_N189', 14.0),
 ('principal_diagnosis_category_Endocrine nutritional and metabolic diseases',
  13.0),
 ('principal_diagnosis_category_Diseases of the skin and subcutaneous tissue',
  13.0),
 ('principal_diagnosis_category_Mental Behavioral and Neurodevelopmental disorders',
  12.0),
 ('principal_diagnosis_category_Diseases of the respiratory system', 11.0),
 ('principal_diagnosis_category_Diseases of the genitourinary system', 11.0),
 ('facility_provider_address_region_Midwest', 11.0),
 ('princ

## KMeans — k search and k = 500 model

In [19]:
# KMeans builder over the same modelling DataFrame used for XGBoost.
kmeans_builder = KMeansBuilder(model_data=model_data)

In [18]:
# Evaluate the candidate cluster counts; the call prints an "Optimal k" and
# returns one value per candidate.
# NOTE(review): the variable name suggests within-cluster sum of squares — confirm in KMeansBuilder.
wcss_values = kmeans_builder.optimal_k(clusters=[300, 700, 1000])
wcss_values

24/10/21 02:28:59 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/10/21 02:29:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/10/21 02:29:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/10/21 02:29:26 WARN DAGScheduler: Broadcasting large task binary with size 1394.9 KiB
24/10/21 02:29:30 WARN DAGScheduler: Broadcasting large task binary with size 1364.1 KiB
24/10/21 02:30:02 WARN DAGScheduler: Broadcasting large task binary with size 1880.2 KiB
24/10/21 02:30:09 WARN DAGScheduler: Broadcasting large task binary with size 1849.4 KiB
24/10/21 02:30:49 WARN DAGScheduler: Broadcasting large task binary with size 1976.2 KiB

Optimal k  : 300


24/10/21 02:30:54 WARN DAGScheduler: Broadcasting large task binary with size 1945.4 KiB
                                                                                

[1173.8905668725401, 71.6404543466004, 0.0005893665601544039]

In [20]:
# Fit KMeans with 500 clusters.
# NOTE(review): 500 is neither one of the candidates searched above (300, 700, 1000)
# nor the reported "Optimal k : 300" — document why it was chosen.
kmeans_builder.fit_model(k=500)

24/10/22 00:27:49 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/10/22 00:28:10 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/10/22 00:28:10 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/10/22 00:28:14 WARN DAGScheduler: Broadcasting large task binary with size 1685.1 KiB
24/10/22 00:28:18 WARN DAGScheduler: Broadcasting large task binary with size 1643.5 KiB
                                                                                

KMeansModel: uid=KMeans_aa06e29c2f10, k=500, distanceMeasure=euclidean, numFeatures=176

In [22]:
# Save the fitted k=500 model.
kmeans_builder.save_model("../output/model/KMeans_500")

In [23]:
# Fetch the model summary; the displayed repr is a pyspark KMeansSummary object.
kmeans_builder.get_model_summary()

<pyspark.ml.clustering.KMeansSummary at 0x175155bd0>

In [24]:
# Clustering metrics for the test split (inertia and silhouette score).
# NOTE(review): the returned dict is keyed 'Train : inertia' even for type="Test",
# with the same inertia value as the Train call — verify the labelling in KMeansBuilder.
kmeans_builder.evaluate_model(type="Test")

24/10/22 02:20:04 WARN DAGScheduler: Broadcasting large task binary with size 1693.1 KiB
24/10/22 02:20:11 WARN DAGScheduler: Broadcasting large task binary with size 1704.8 KiB
                                                                                

{'Train : inertia': 222.41416860375665, 'silhouette_score': 0.957319478711742}

In [28]:
# Clustering metrics for the training split (inertia and silhouette score).
kmeans_builder.evaluate_model(type="Train")

24/10/22 02:23:10 WARN DAGScheduler: Broadcasting large task binary with size 1693.1 KiB
24/10/22 02:23:18 WARN DAGScheduler: Broadcasting large task binary with size 1704.8 KiB
                                                                                

{'Train : inertia': 222.41416860375665, 'silhouette_score': 0.9756933869133365}

In [36]:
# Per-cluster average of the label on the train and test splits, with cluster
# sizes, the difference and the percent difference (see the displayed columns).
# The shared `label_column` constant replaces the repeated literal column name
# so this stays in sync with the config cell.
cluster_averages = kmeans_builder.get_cluster_averages(column=label_column)
cluster_averages

24/10/22 02:34:28 WARN DAGScheduler: Broadcasting large task binary with size 1690.8 KiB
24/10/22 02:34:33 WARN DAGScheduler: Broadcasting large task binary with size 1653.3 KiB
24/10/22 02:34:35 WARN DAGScheduler: Broadcasting large task binary with size 1690.8 KiB

Cluster averages for claim_total_charge_amount:
     cluster_number  claim_total_charge_amount_avg_train  train_size  \
76              423                           532.322495           8   
345             420                           841.000000           1   
308             382                           997.792222          18   
17              428                          1855.752005          10   
335             469                          2451.799998           6   
..              ...                                  ...         ...   
226             158                           131.259995           1   
58              229                           416.180801          25   
333             497                           122.750000           1   
332             487                            60.000000           1   
281              56                           505.482999          10   

     claim_total_charge_amount_avg_test  test_size    difference  percent_diff  
76    

24/10/22 02:34:39 WARN DAGScheduler: Broadcasting large task binary with size 1653.3 KiB
                                                                                

Unnamed: 0,cluster_number,claim_total_charge_amount_avg_train,train_size,claim_total_charge_amount_avg_test,test_size,difference,percent_diff
76,423,532.322495,8,2.580000,1,529.742495,99.515331
345,420,841.000000,1,21.000000,1,820.000000,97.502973
308,382,997.792222,18,43.040001,2,954.752221,95.686477
17,428,1855.752005,10,92.000000,1,1763.752005,95.042441
335,469,2451.799998,6,132.000000,1,2319.799998,94.616200
...,...,...,...,...,...,...,...
226,158,131.259995,1,1520.000000,1,-1388.740005,-1058.007057
58,229,416.180801,25,5617.500000,2,-5201.319199,-1249.773942
333,497,122.750000,1,2755.500000,1,-2632.750000,-2144.806517
332,487,60.000000,1,1557.000000,1,-1497.000000,-2495.000000


In [37]:
# Last 30 rows of the table. In the displayed output these are the most negative
# percent_diff values, i.e. clusters where the test average exceeds the train average.
cluster_averages.iloc[-30:]

Unnamed: 0,cluster_number,claim_total_charge_amount_avg_train,train_size,claim_total_charge_amount_avg_test,test_size,difference,percent_diff
201,312,7176.682449,12,20645.365234,2,-13468.682785,-187.672826
59,126,413.734761,21,1191.942497,4,-778.207736,-188.09339
322,401,400.458003,15,1163.25,3,-762.791997,-190.479899
285,114,126.75,4,369.0,2,-242.25,-191.12426
338,426,186.0,2,546.0,1,-360.0,-193.548387
125,125,1059.068344,42,3150.092802,7,-2091.024458,-197.439992
88,90,498.911114,45,1502.8775,12,-1003.966386,-201.231514
265,93,339.233333,27,1266.8,5,-927.566667,-273.430284
302,267,463.668572,7,1780.0,3,-1316.331428,-283.8949
4,471,403.92,7,1555.174988,4,-1151.254987,-285.020545


## KMeans — k = 100 model

In [58]:
# Fresh builder for the second experiment. This rebinds `kmeans_builder`, so the
# k=500 builder above is no longer reachable by name.
kmeans_builder = KMeansBuilder(model_data=model_data)

In [59]:
# Fit KMeans with 100 clusters, to compare against the k=500 run.
kmeans_builder.fit_model(k=100)

24/10/22 03:08:25 WARN DAGScheduler: Broadcasting large task binary with size 1128.8 KiB
24/10/22 03:08:28 WARN DAGScheduler: Broadcasting large task binary with size 1087.2 KiB
                                                                                

KMeansModel: uid=KMeans_7b8ca69ef8a6, k=100, distanceMeasure=euclidean, numFeatures=176

In [60]:
# Save the fitted k=100 model.
kmeans_builder.save_model("../output/model/KMeans_100")

In [61]:
# Clustering metrics for the test split (inertia and silhouette score).
# NOTE(review): the returned dict is keyed 'Train : inertia' even for type="Test" —
# verify the labelling in KMeansBuilder.
kmeans_builder.evaluate_model(type="Test")

24/10/22 03:08:32 WARN DAGScheduler: Broadcasting large task binary with size 1133.7 KiB
24/10/22 03:08:37 WARN DAGScheduler: Broadcasting large task binary with size 1145.4 KiB
                                                                                

{'Train : inertia': 5759.416619951578, 'silhouette_score': 0.6742656743045236}

In [62]:
# Clustering metrics for the training split (inertia and silhouette score).
kmeans_builder.evaluate_model(type="Train")

24/10/22 03:08:43 WARN DAGScheduler: Broadcasting large task binary with size 1133.7 KiB
24/10/22 03:08:49 WARN DAGScheduler: Broadcasting large task binary with size 1145.4 KiB
                                                                                

{'Train : inertia': 5759.416619951578, 'silhouette_score': 0.6733231336282451}

In [63]:
# Per-cluster average of the label on the train and test splits for the k=100
# model. The shared `label_column` constant replaces the repeated literal
# column name so this stays in sync with the config cell.
cluster_averages = kmeans_builder.get_cluster_averages(column=label_column)
cluster_averages

24/10/22 03:08:56 WARN DAGScheduler: Broadcasting large task binary with size 1131.5 KiB
24/10/22 03:09:00 WARN DAGScheduler: Broadcasting large task binary with size 1093.9 KiB
24/10/22 03:09:02 WARN DAGScheduler: Broadcasting large task binary with size 1131.4 KiB
24/10/22 03:09:06 WARN DAGScheduler: Broadcasting large task binary with size 1093.9 KiB
                                                                                

Cluster averages for claim_total_charge_amount:
    cluster_number  claim_total_charge_amount_avg_train  train_size  \
18              69                          1523.133555          96   
0               12                          3027.764033         256   
41              89                          2179.858748          56   
82               4                           947.683616         312   
95              77                           629.672222          18   
..             ...                                  ...         ...   
75              19                           414.872791          43   
69              52                           800.243999         113   
47              51                           880.345253          61   
49              39                           406.620019          83   
76              36                          1655.510262          77   

    claim_total_charge_amount_avg_test  test_size   difference  percent_diff  
18                  

Unnamed: 0,cluster_number,claim_total_charge_amount_avg_train,train_size,claim_total_charge_amount_avg_test,test_size,difference,percent_diff
18,69,1523.133555,96,279.498236,17,1243.635320,81.649788
0,12,3027.764033,256,853.283150,60,2174.480882,71.818043
41,89,2179.858748,56,624.823895,18,1555.034853,71.336496
82,4,947.683616,312,297.621476,61,650.062140,68.594848
95,77,629.672222,18,207.500000,2,422.172222,67.046347
...,...,...,...,...,...,...,...
75,19,414.872791,43,871.822500,8,-456.949710,-110.142125
69,52,800.243999,113,1830.529652,27,-1030.285653,-128.746439
47,51,880.345253,61,2093.925419,11,-1213.580166,-137.852753
49,39,406.620019,83,1544.487699,13,-1137.867680,-279.835627


In [64]:
# Last 20 rows of the table. In the displayed output these are the most negative
# percent_diff values, i.e. clusters where the test average exceeds the train average.
cluster_averages.iloc[-20:]

Unnamed: 0,cluster_number,claim_total_charge_amount_avg_train,train_size,claim_total_charge_amount_avg_test,test_size,difference,percent_diff
25,56,2358.151166,329,3310.972444,81,-952.821277,-40.405437
30,71,544.511538,143,768.78067,30,-224.269132,-41.187214
62,11,876.888913,193,1247.652646,49,-370.763733,-42.281722
94,30,410.688292,41,590.340999,10,-179.652706,-43.744297
31,44,654.826855,415,955.233836,86,-300.406981,-45.875788
55,58,299.646551,29,437.207003,10,-137.560452,-45.90757
83,79,3384.609294,87,5194.354023,23,-1809.744729,-53.469827
90,96,380.250197,51,604.222222,18,-223.972025,-58.901225
8,16,1248.048749,661,2160.511159,161,-912.46241,-73.111119
54,9,891.904834,95,1575.520707,28,-683.615873,-76.646728


In [65]:
# Centroid-based feature importance; the output reports the mean and standard
# deviation of the importance for each feature.
kmeans_builder.get_feature_importance(model_feature_col)

Feature Importance based on cluster centroids:
                                                                                              feature  \
Diagnosis_F331                                                                         Diagnosis_F331   
patient_gender_F                                                                     patient_gender_F   
patient_gender_M                                                                     patient_gender_M   
facility_provider_address_region_South                         facility_provider_address_region_South   
principal_diagnosis_category_Diseases of the mu...  principal_diagnosis_category_Diseases of the m...   
...                                                                                               ...   
Diagnosis_Z789                                                                         Diagnosis_Z789   
Diagnosis_R620                                                                         Diagnosis_R620   
Diagnosi

Unnamed: 0,feature,mean_importance,std_dev_importance
Diagnosis_F331,Diagnosis_F331,0.446396,0.488664
patient_gender_F,patient_gender_F,0.426507,0.434166
patient_gender_M,patient_gender_M,0.392921,0.426514
facility_provider_address_region_South,facility_provider_address_region_South,0.193407,0.393248
principal_diagnosis_category_Diseases of the musculoskeletal system and connective tissue,principal_diagnosis_category_Diseases of the m...,0.153141,0.357240
...,...,...,...
Diagnosis_Z789,Diagnosis_Z789,0.000000,0.000000
Diagnosis_R620,Diagnosis_R620,0.000000,0.000000
Diagnosis_J449,Diagnosis_J449,0.000000,0.000000
Diagnosis_K743,Diagnosis_K743,0.000000,0.000000


## MLP Classifier

In [14]:
# MLP builder over the same modelling DataFrame, feature names and label.
mlp_builder = MLPModelBuilder(model_data, model_feature_col, label_column)

In [15]:
# Display the feature list given to the MLP builder.
# NOTE(review): the saved output lists columns (e.g.
# 'patient_location_residential_region_Northeast') that are commented out in the
# current model_feature_col definition, so it appears stale — re-run this cell.
model_feature_col

['principal_diagnosis_category_Factors influencing health status and contact with health services',
 'patient_location_residential_region_Northeast',
 'principal_diagnosis_category_Diseases of the circulatory system',
 'principal_diagnosis_category_Diseases of the respiratory system',
 'principal_diagnosis_category_Diseases of the musculoskeletal system and connective tissue',
 'principal_diagnosis_category_unknown',
 'patient_location_residential_region_West',
 'patient_gender_F',
 'principal_diagnosis_category_Endocrine nutritional and metabolic diseases',
 'principal_diagnosis_body_part_unknown',
 'principal_diagnosis_body_part_Spine',
 'principal_diagnosis_category_Diseases of the eye and adnexa',
 'principal_diagnosis_category_Diseases of the genitourinary system',
 'principal_diagnosis_category_Injury poisoning and certain other consequences of external causes',
 'patient_location_residential_region_South',
 'patient_location_residential_region_unknown',
 'facility_provider_addre

In [23]:
# Split the data and store the two returned objects back on the builder as
# train_df / test_df (presumably Spark DataFrames — confirm in ml_developer).
# NOTE(review): this cell's stored execution count (23) is out of sequence
# with its neighbours (15 and 17); verify the notebook with
# Restart Kernel -> Run All.
mlp_builder.train_df, mlp_builder.test_df = mlp_builder.split_data()

In [17]:
# Discretise the label into 3 bins. The later tables show a `label_binned`
# column with values 0.0, 1.0 and 2.0.
# NOTE(review): the same literal 3 is passed again to the
# average_claim_by_predicted_bin_* calls below — keep them in sync.
mlp_builder.bin_labels(num_bins=3)

                                                                                

In [18]:
# Training is currently disabled: both statements below are commented out, so
# running this cell does nothing. The output stored under the cell
# (numLayers=4, numClasses=3, numFeatures=215) comes from an earlier run.
# Layer spec when enabled: input width = number of feature columns, followed
# by layers of 5, 4 and 3 units (the final 3 matches num_bins=3).
# NOTE(review): the log timestamps in the stored output span roughly 25
# minutes, so training was presumably slow — confirm before re-enabling.
# layers = [len(mlp_builder.feature_columns), 5, 4, 3] 
# mlp_builder.train_model(layers)

24/10/15 01:25:50 WARN DAGScheduler: Broadcasting large task binary with size 1733.4 KiB
24/10/15 01:50:03 WARN DAGScheduler: Broadcasting large task binary with size 1736.4 KiB
24/10/15 01:50:04 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/10/15 01:50:04 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/10/15 01:50:04 WARN DAGScheduler: Broadcasting large task binary with size 1737.4 KiB
24/10/15 01:50:05 WARN DAGScheduler: Broadcasting large task binary with size 1736.4 KiB
24/10/15 01:50:05 WARN DAGScheduler: Broadcasting large task binary with size 1737.4 KiB
24/10/15 01:50:05 WARN DAGScheduler: Broadcasting large task binary with size 1736.4 KiB
24/10/15 01:50:05 WARN DAGScheduler: Broadcasting large task binary with size 1737.4 KiB
24/10/15 01:50:06 WARN DAGScheduler: Broadcasting large task binary with size 1736.4 KiB
24/10/15 01:50:06 WARN DAGScheduler: Broadcasting large task binary wit

MultilayerPerceptronClassificationModel: uid=MultilayerPerceptronClassifier_1d369a285971, numLayers=4, numClasses=3, numFeatures=215

In [20]:
# Saving is currently disabled (commented out). The load_model call that
# follows reads from this same path, so "../output/model/MLPModel" must
# already exist on disk for the notebook to run from a fresh kernel.
# mlp_builder.save_model("../output/model/MLPModel")

In [20]:
# Load the previously saved model from "../output/model/MLPModel" and rebind
# `mlp_builder` to whatever load_model returns.
# NOTE(review): if load_model constructs a new builder, state set by earlier
# cells (train_df / test_df from split_data, the binned labels) may not carry
# over — confirm against ml_developer.MLPModelBuilder.load_model.
mlp_builder = mlp_builder.load_model(model_data, model_feature_col, label_column,path = "../output/model/MLPModel")

In [21]:
# Evaluate the loaded model on the training split. The returned score is the
# cell output (about 0.418 in the stored run).
# NOTE(review): which metric evaluate_model returns is not visible here —
# presumably accuracy; confirm in ml_developer.
mlp_builder.evaluate_model(type="Train")

24/10/15 05:11:58 WARN DAGScheduler: Broadcasting large task binary with size 1732.4 KiB
                                                                                

0.4183189785586058

In [22]:
# Evaluate the loaded model on the test split. The returned score is the
# cell output (about 0.416 in the stored run, close to the training score).
# NOTE(review): which metric evaluate_model returns is not visible here —
# presumably accuracy; confirm in ml_developer.
mlp_builder.evaluate_model(type="Test")

24/10/15 05:33:57 WARN DAGScheduler: Broadcasting large task binary with size 1732.4 KiB
                                                                                

0.41645249459470485

In [23]:
# Average claim amount per actual label bin on the training split.
# The result is printed with .show() as a table with columns
# `label_binned` and `average_claim_amount`.
avg_claim_train = mlp_builder.average_claim_by_bin_train()
avg_claim_train.show()

24/10/15 05:57:04 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/10/15 05:57:05 WARN DAGScheduler: Broadcasting large task binary with size 1707.3 KiB
24/10/15 06:22:22 WARN DAGScheduler: Broadcasting large task binary with size 1675.8 KiB
                                                                                

+------------+--------------------+
|label_binned|average_claim_amount|
+------------+--------------------+
|         1.0|  242.79556904862184|
|         0.0|    83.5966601299861|
|         2.0|  3583.0215044044608|
+------------+--------------------+



In [25]:
# Average claim amount per actual label bin on the test split.
# The result is printed with .show() as a table with columns
# `label_binned` and `average_claim_amount`.
avg_claim_test = mlp_builder.average_claim_by_bin_test()
avg_claim_test.show()

24/10/15 09:31:15 WARN DAGScheduler: Broadcasting large task binary with size 1707.3 KiB

+------------+--------------------+
|label_binned|average_claim_amount|
+------------+--------------------+
|         1.0|  242.14374536013332|
|         0.0|    84.0827847817373|
|         2.0|  3487.9150747244967|
+------------+--------------------+



24/10/15 09:57:26 WARN DAGScheduler: Broadcasting large task binary with size 1675.8 KiB
                                                                                

In [27]:
# Average claim amount per *predicted* bin on the training split, printed
# with .show() as a table with columns `predicted_bin` and
# `average_claim_amount`.
# NOTE(review): the stored output contains a single row (predicted_bin 0),
# which suggests the model assigned every training row to one class —
# verify the predictions before relying on this model.
avg_claim_pred_train = mlp_builder.average_claim_by_predicted_bin_train(num_bins=3)
avg_claim_pred_train.show()

24/10/15 14:16:42 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/10/15 14:16:45 WARN DAGScheduler: Broadcasting large task binary with size 1757.2 KiB
24/10/15 14:22:45 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/10/15 14:22:46 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/10/15 14:45:45 WARN DAGScheduler: Broadcasting large task binary with size 1723.9 KiB
                                                                                

+-------------+--------------------+
|predicted_bin|average_claim_amount|
+-------------+--------------------+
|            0|  1307.2307887574923|
+-------------+--------------------+



In [28]:
# Average claim amount per *predicted* bin on the test split, printed with
# .show() as a table with columns `predicted_bin` and `average_claim_amount`.
# NOTE(review): the stored output contains a single row (predicted_bin 0),
# which suggests the model assigned every test row to one class — verify
# the predictions before relying on this model.
avg_claim_pred_test = mlp_builder.average_claim_by_predicted_bin_test(num_bins=3)
avg_claim_pred_test.show()

24/10/15 14:45:51 WARN DAGScheduler: Broadcasting large task binary with size 1756.9 KiB
24/10/15 15:09:33 WARN DAGScheduler: Broadcasting large task binary with size 1722.1 KiB
                                                                                

+-------------+--------------------+
|predicted_bin|average_claim_amount|
+-------------+--------------------+
|            0|  1272.9569344277793|
+-------------+--------------------+

