In [0]:
%python
# Load the gold-layer lookup, dimension and fact tables as Spark DataFrames.
# Replace the table/view names below to match your environment.
# NOTE(review): in the cells visible in this notebook the aggregation query reads
# its tables by fully qualified name through spark.sql, so these handles are not
# referenced again — confirm whether they are still needed.
df_hospital = spark.table("ushealthcaredynamics.gold.hospitallookup")
df_cpt = spark.table("ushealthcaredynamics.gold.cptcode_lookup")
df_date = spark.table("ushealthcaredynamics.gold.dimdate")
# NOTE(review): the table name is spelled "physcian_lookup"; confirm it matches the catalog.
df_physician = spark.table("ushealthcaredynamics.gold.physcian_lookup")
df_payer = spark.table("ushealthcaredynamics.gold.payer_lookup")
df_diag = spark.table("ushealthcaredynamics.gold.diagnosiscode_lookup")
# The speciality lookup is currently disabled.
#df_speciality = spark.table("ushealthcaredynamics.gold.speciality_lookup")
df_transaction = spark.table("ushealthcaredynamics.gold.transaction_lookup")
df_fact = spark.table("ushealthcaredynamics.gold.facttable_lookup")
df_patient = spark.table("ushealthcaredynamics.gold.patient_lookup")


In [0]:
# Aggregate the fact table into one row per distinct combination of
# (dim_location_pk, location_name, date_year, date_month_number, dim_date_service_pk),
# carrying payment / AR / gross-expense totals plus two derived ratios:
#   iptp_ratio = insurance payments / (insurance + patient payments); 0 unless that sum is > 0
#   arge_ratio = AR / gross expenses; 0 unless gross expenses are > 0
# Both joins are LEFT JOINs, so fact rows without a matching hospital or date row
# are kept and their location_name / date_year / date_month_number come back NULL.
# NOTE(review): total_revenue is SUM(f.insurance_payment), i.e. the same expression
# as total_insurance_payment — confirm whether patient payments were meant to be included.
df = spark.sql('''SELECT
  f.dim_location_pk,
  h.location_name,
  f.dim_date_service_pk,
  dt.date_year,
  dt.date_month_number,
  SUM(f.insurance_payment) AS total_insurance_payment,
  SUM(f.patient_payment) AS total_patient_payment,
  SUM(f.ar) AS total_ar,
  SUM(f.gross_expenses) AS total_gross_charge,
  SUM(f.insurance_payment) AS total_revenue,
  CASE
    WHEN SUM(f.insurance_payment) + SUM(f.patient_payment) > 0
      THEN SUM(f.insurance_payment) / (SUM(f.insurance_payment) + SUM(f.patient_payment))
    ELSE 0
  END AS iptp_ratio,
  CASE
    WHEN SUM(f.gross_expenses) > 0
      THEN SUM(f.ar) / SUM(f.gross_expenses)
    ELSE 0
  END AS arge_ratio
FROM ushealthcaredynamics.gold.facttable_lookup f
LEFT JOIN ushealthcaredynamics.gold.hospitallookup h ON f.dim_location_pk = h.dim_location_pk
LEFT JOIN ushealthcaredynamics.gold.dimdate dt ON f.dim_date_service_pk = dt.date

GROUP BY f.dim_location_pk, h.location_name, dt.date_year, dt.date_month_number, f.dim_date_service_pk''')


In [0]:
# Preview the aggregated per-location, per-service-date result.
display(df)

In [0]:
from pyspark.sql import functions as F

# Approximate median of total_revenue (0.5 quantile, 1% relative error).
# approxQuantile ignores nulls and returns an empty list when the column has no
# non-null values, so check the result before indexing instead of failing with
# an opaque "list index out of range".
revenue_quantiles = df.approxQuantile(
    "total_revenue",
    [0.5],
    0.01
)
if not revenue_quantiles:
    raise ValueError(
        "Cannot compute the median of total_revenue: df has no non-null values in that column"
    )
median_val = revenue_quantiles[0]

# Binary label: 1 when the row's total_revenue is at or above the median, else 0.
# Rows with a NULL total_revenue fall through to the otherwise() branch and get 0.
df = df.withColumn(
    "performance",
    F.when(
        F.col("total_revenue") >= median_val,
        F.lit(1)
    ).otherwise(F.lit(0))
)
display(df)

In [0]:
# Keep just the numeric aggregates, the two ratios and the label.
selected_cols = [
    'total_insurance_payment',
    'total_patient_payment',
    'total_ar',
    'total_gross_charge',
    'total_revenue',
    'iptp_ratio',
    'arge_ratio',
    'performance',
]
df_selected = df.select(*selected_cols)

# Seeded random split, weighted roughly two thirds train / one third test.
train_df, test_df = df_selected.randomSplit(weights=[0.67, 0.33], seed=42)

# Show both partitions, training rows first.
display(train_df)
display(test_df)

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

# Assemble features into a single vector column.
# 'total_revenue' is deliberately NOT a feature: the 'performance' label is defined
# as total_revenue >= median, so feeding it to the model lets the tree recover the
# label with a single threshold (target leakage). This matches the feature list
# used by the scikit-learn cells further down.
# NOTE(review): in the aggregation query total_insurance_payment is the same
# SUM(f.insurance_payment) expression as total_revenue, so it still carries the
# label until the revenue definition is revisited — confirm the intended formula.
assembler = VectorAssembler(
    inputCols=[
        'total_insurance_payment', 'total_patient_payment', 'total_ar',
        'total_gross_charge', 'iptp_ratio', 'arge_ratio'
    ],
    outputCol='features'
)
train_df_assembled = assembler.transform(train_df)

# Fit a shallow (depth-2) decision tree on the training rows.
dt = DecisionTreeClassifier(
    labelCol='performance',
    featuresCol='features',
    maxDepth=2
)
model = dt.fit(train_df_assembled)

display(model)

In [0]:
# Assemble features in the test set with the same assembler used for the training rows
test_df_assembled = assembler.transform(test_df)

# Generate predictions with the fitted decision tree model
predictions = model.transform(test_df_assembled)

# Rename the model's 'prediction' output column to 'predicted_performance'
result_df = predictions.withColumnRenamed('prediction', 'predicted_performance')

# Show the held-out rows together with their predicted label
display(result_df)

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Three hand-entered sample rows mirroring the columns of the aggregated Spark frame.
data = pd.DataFrame({
    'dim_location_pk': [785724, 785623, 786431],
    'location_name': ['Big Heart Community Hospital', 'Twin Mountains Hospital', 'Guardian Medical Clinic'],
    'dim_date_service_pk': ['2020-01-21', '2020-04-20', '2020-05-31'],
    'total_insurance_payment': [14914, 12, 0],
    'total_patient_payment': [2466.08, 0, 22],
    'total_ar': [6074.13, 1.01, 186.74],
    'total_gross_charge': [23454.19, 13.01, 208.74],
    'total_revenue': [14914, 12, 0],
    'iptp_ratio': [0.8581, 1, 0],
    'arge_ratio': [0.2590, 0.0776, 0.8946]
})

# Label: 1 when total_revenue is at or above the sample median, else 0.
data['performance'] = (data['total_revenue'] >= data['total_revenue'].median()).astype(int)
# total_revenue itself is left out of the features because it defines the label.
features = ['total_insurance_payment', 'total_patient_payment', 'total_ar', 'total_gross_charge', 'iptp_ratio', 'arge_ratio']

X = data[features]
y = data['performance']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fix random_state so the fit is reproducible: the tree permutes features at every
# split, and on a frame this small several splits are equally good, so an unseeded
# tree can pick a different one on each run.
clf = DecisionTreeClassifier(max_depth=2, random_state=42)
clf.fit(X_train, y_train)

# Write predictions back only for the held-out rows; training rows stay NaN.
y_pred = clf.predict(X_test)
data.loc[X_test.index, 'predicted_performance'] = y_pred

print(data)


In [0]:
from sklearn.metrics import accuracy_score

# Refit on the training split and score the held-out rows in one chained call
# (fit returns the estimator itself).
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Share of held-out labels predicted correctly, shown to two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


In [0]:
# Demo only: fit on every row and predict those same rows, so the reported
# accuracy is a training-set figure rather than a held-out estimate.
y_pred = clf.fit(X, y).predict(X)

accuracy = accuracy_score(y_true=y, y_pred=y_pred)
print(f"Model Accuracy (on all data): {accuracy:.2f}")

# Attach the predictions to the frame and render it.
data['predicted_performance'] = y_pred
display(data)


In [0]:
from sklearn.model_selection import cross_val_score
# 2-fold cross-validation of clf on X / y; prints the mean of the per-fold scores.
# NOTE(review): if X / y are still the three-row demo frame, each fold trains on
# only one or two rows, so the number is illustrative at best.
scores = cross_val_score(clf, X, y, cv=2)
print(f"Cross-validated accuracy: {scores.mean():.2f}")

In [0]:
# One hypothetical hospital record, built records-style (a single row dict),
# with the same six feature columns the classifier was trained on.
# NOTE(review): the ratio values do not follow from the amounts under the
# aggregation query's formulas (500 / (500 + 200) is about 0.71, not 0.27;
# 400 / 200 is 2.0, not 0.90) — confirm they are intentional.
test_record = pd.DataFrame([
    {
        'total_insurance_payment': 500,
        'total_patient_payment': 200,
        'total_ar': 400,
        'total_gross_charge': 200,
        'iptp_ratio': 0.27,
        'arge_ratio': 0.90,
    }
])


In [0]:
# Classify the single test record with the current clf and print a readable
# label: class 1 is reported as 'Top performing', anything else as 'Lower performing'.
y_pred_test = clf.predict(test_record)
print(f"Predicted Performance for test record: {'Top performing' if y_pred_test[0] == 1 else 'Lower performing'}")


In [0]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# 1. Replace this with your real or simulated dataset
data = pd.DataFrame({
    'location_name': ['Big Heart Community Hospital', 'Twin Mountains Hospital', 'Guardian Medical Clinic'],
    'total_insurance_payment': [14914, 12, 0],
    'total_patient_payment': [2466, 0, 22],
    'total_ar': [6074, 1, 186],
    'total_gross_charge': [23454, 13, 208],
    'total_revenue': [14914, 12, 0],
    'iptp_ratio': [0.86, 1.0, 0.0],
    'arge_ratio': [0.26, 0.08, 0.89]
})

# 2. Candidate labels for "top performer".
# Both are computed up front and the active one is chosen explicitly below.
# Previously the business-rule label was assigned to data['performance'] and then
# immediately overwritten by the revenue-rank label, so it never had any effect.

# 2a. Business logic for top performers (change thresholds as needed)
business_rule_label = (
    (data['iptp_ratio'] > 0.85)
    & (data['arge_ratio'] < 0.25)
    & (data['total_revenue'] > data['total_revenue'].median())
).astype(int)

# 2b. Top N by revenue for demo. rank(method='max') gives tied rows the highest
# rank of their group, so a tie that straddles the cut-off excludes all tied rows.
top_n = 1
top_by_revenue = data['total_revenue'].rank(method='max', ascending=False) <= top_n

label_candidates = {
    'business_rules': business_rule_label,
    'top_by_revenue': top_by_revenue.astype(int),
}

# Active labelling strategy; 'top_by_revenue' reproduces the previous behaviour.
label_strategy = 'top_by_revenue'
if label_strategy not in label_candidates:
    raise ValueError(
        f"Unknown label_strategy {label_strategy!r}; expected one of {sorted(label_candidates)}"
    )
# Assign 'performance' before 'top_by_revenue' to keep the original column order.
data['performance'] = label_candidates[label_strategy]
data['top_by_revenue'] = top_by_revenue

# 3. Features and label
features = ['total_insurance_payment', 'total_patient_payment', 'total_ar', 'total_gross_charge', 'iptp_ratio', 'arge_ratio']
X = data[features]
y = data['performance']

# 4. Train model on all available data (tiny dataset!)
clf = DecisionTreeClassifier(max_depth=2, random_state=42)
clf.fit(X, y)
y_pred = clf.predict(X)

# 5. Show accuracy (for demo) — measured on the training rows themselves,
# so it is not an estimate of performance on unseen data.
accuracy = accuracy_score(y, y_pred)
print(f"Model Accuracy (all data): {accuracy:.2f}")

# 6. Display predictions with data
data['predicted_performance'] = y_pred
print(data)

# 7. To test a new hospital record (sample test input)
test_record = pd.DataFrame({
    'total_insurance_payment': [500],
    'total_patient_payment': [1500],
    'total_ar': [200],
    'total_gross_charge': [200],
    'iptp_ratio': [0.60],
    'arge_ratio': [0.88]
})
test_pred = clf.predict(test_record)
print(f"Predicted Performance for test record: {'Top' if test_pred[0] == 1 else 'Lower'}")

# Use display(data) in Databricks or st.table(data) in Streamlit to show results.
