In [4]:
# loads all necessary libraries for the project.

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


import joblib

import warnings
warnings.filterwarnings('ignore')

import kagglehub

import gradio as gr
import joblib

import os

import shap

from google.colab import drive


print('All neccesary libriaries are imported')

All neccesary libriaries are imported


In [20]:
# Mount Google Drive and load the saved artifacts the Gradio app relies on.
DRIVE_MOUNT_POINT = '/content/drive'
ARTIFACT_DIR = f'{DRIVE_MOUNT_POINT}/MyDrive'

drive.mount(DRIVE_MOUNT_POINT)

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load artifacts that you produced yourself.

# Model used by predict_cost to produce the prediction.
model = joblib.load(f'{ARTIFACT_DIR}/staking_model.joblib')

# Background data; its column means seed the slider defaults in the UI.
df = joblib.load(f'{ARTIFACT_DIR}/X_background.pkl')
# Fitted preprocessor used to transform a new row before the SHAP explanation.
preprocessor = joblib.load(f'{ARTIFACT_DIR}/preprocessor.pkl')
# Pipeline whose "classifier" step is the tree model explained with SHAP.
rf_pipeline = joblib.load(f'{ARTIFACT_DIR}/rf_model.joblib')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# create function for gradio
def predict_cost(
    gender, physical_activity_level, insurance_type, city_type,
    smoker, diabetes, hypertension, heart_disease, asthma, age, bmi,
    daily_steps, sleep_hours, stress_level,
    doctor_visits_per_year, hospital_admissions,
    medication_count, insurance_coverage_pct, previous_year_cost
):
    """Predict the cost for one person and list the factors that raised it.

    The prediction comes from the module-level ``model``. The explanation is
    built from SHAP values of the tree model stored in ``rf_pipeline`` under
    the step name ``"classifier"``, computed on the input row after it has
    been transformed by the module-level ``preprocessor``.

    Parameters
    ----------
    gender, physical_activity_level, insurance_type, city_type, smoker
        Copied unchanged into the input row.
    diabetes, hypertension, heart_disease, asthma
        ``"Yes"`` / ``"No"`` answers, converted to 1 / 0. Any other value
        becomes ``None``.
    age, bmi, daily_steps, sleep_hours, stress_level, doctor_visits_per_year,
    hospital_admissions, medication_count, insurance_coverage_pct,
    previous_year_cost
        Copied unchanged into the input row.

    Returns
    -------
    tuple
        ``(prediction, explanation)`` where ``prediction`` is the model output
        as a float rounded to 2 decimals, and ``explanation`` is a string with
        one bullet line per feature whose SHAP impact is at least 0.5% of the
        prediction (largest first), or a fixed message when there are none.
    """

    def yn_to_binary(x):
        # "Yes" -> 1, "No" -> 0; any other value falls through and yields None.
        if x == "Yes":
            return 1
        elif x == "No":
            return 0

    diabetes_val = yn_to_binary(diabetes)
    hypertension_val = yn_to_binary(hypertension)
    heart_disease_val = yn_to_binary(heart_disease)
    asthma_val = yn_to_binary(asthma)

    # -------------------- Build input --------------------
    # Single-row dataframe matching the training schema
    X_new = pd.DataFrame([{
        "gender": gender,
        "smoker": smoker,
        "physical_activity_level": physical_activity_level,
        "insurance_type": insurance_type,
        "city_type": city_type,
        "age": age,
        "bmi": bmi,
        "diabetes": diabetes_val,
        "hypertension": hypertension_val,
        "heart_disease": heart_disease_val,
        "asthma": asthma_val,
        "daily_steps": daily_steps,
        "sleep_hours": sleep_hours,
        "stress_level": stress_level,
        "doctor_visits_per_year": doctor_visits_per_year,
        "hospital_admissions": hospital_admissions,
        "medication_count": medication_count,
        "insurance_coverage_pct": insurance_coverage_pct,
        "previous_year_cost": previous_year_cost
    }])

    # -------------------- Predict --------------------
    # Use the full ensemble for maximum accuracy
    prediction = model.predict(X_new)[0]

    # -------------------- SHAP explanation --------------------
    # Transform input using the already fitted preprocessor
    X_new_tr = preprocessor.transform(X_new)

    # Densify a sparse result so it can be indexed row-wise below
    if hasattr(X_new_tr, "toarray"):
        X_new_tr = X_new_tr.toarray()

    # Explain the RandomForest component (interpretable surrogate)
    rf_model = rf_pipeline.named_steps["classifier"]
    explainer = shap.TreeExplainer(rf_model)
    shap_values = explainer.shap_values(X_new_tr)

    feature_names = preprocessor.get_feature_names_out()

    shap_df = pd.DataFrame({
        "feature": feature_names,
        "impact": shap_values[0]
    })

    # Keep only features that are active for this observation, i.e. whose
    # transformed value is non-zero (drops one-hot columns of other categories).
    # NOTE(review): a numeric column that transforms to exactly 0 is dropped
    # as well - confirm that this is intended.
    active_mask = pd.Series(X_new_tr[0] != 0, index=shap_df.index)
    # .copy() so the column added below is written to an independent frame
    # rather than to a slice of the unfiltered one.
    shap_df = shap_df[active_mask].copy()

    # Express each SHAP impact as a percentage of the predicted cost
    shap_df["pct_contribution"] = (shap_df["impact"] / prediction) * 100

    # Keep only factors that increase the cost by at least 0.5%
    positive_drivers = shap_df[
        shap_df["pct_contribution"] >= 0.5
    ].sort_values(by="pct_contribution", ascending=False)

    # -------------------- Human-readable formatting --------------------
    def prettify_name(name):
        return (
            name.replace("_", " ")
                .replace("pct", "(%)")
                .title()
        )

    def humanize_feature_auto(feature_name):
        # Strip the transformer prefixes from the generated feature name
        feature = feature_name.replace("num__", "").replace("cat__", "")
        if feature_name.startswith("cat__"):
            # Split at the last underscore: "<column>_<category>"
            base, category = feature.rsplit("_", 1)
            return f"{prettify_name(base)} = {category}"
        return prettify_name(feature)

    if positive_drivers.empty:
        explanation = "No factors significantly increased the cost."
    else:
        # "\u2022" is the bullet character; written as an escape so the text
        # does not depend on the encoding the notebook file is saved in.
        explanation = "\n".join(
            f"\u2022 {humanize_feature_auto(r.feature)} contributed "
            f"+{r.pct_contribution:.1f}% to the total cost"
            for r in positive_drivers.itertuples(index=False)
        )

    return round(float(prediction), 2), explanation



In [7]:
# One pre-filled example row for the Gradio interface.
# The dict keys only label the values; the row itself is the values in
# insertion order, which is the order the interface inputs are declared in.
examples = [list({
    "gender": "Male",                 # Dropdown
    "physical_activity_level": "Medium",
    "insurance_type": "Private",
    "city_type": "Urban",
    "smoker": "No",                   # Radio
    "diabetes": "No",
    "hypertension": "No",
    "heart_disease": "No",
    "asthma": "No",
    "age": 30,                        # Slider
    "bmi": 24,
    "daily_steps": 8000,
    "sleep_hours": 7,
    "stress_level": 4,
    "doctor_visits_per_year": 2,
    "hospital_admissions": 0,
    "medication_count": 3,
    "insurance_coverage_pct": 80,
    "previous_year_cost": 12000,
}.values())]

In [22]:
# Build the Gradio UI around predict_cost.
# The components in `inputs` are listed in the same order as the parameters
# of predict_cost and as the values in each row of `examples`.
# Slider defaults for most numeric inputs come from the column means of `df`.
gradio_interface = gr.Interface(
    fn=predict_cost,
    inputs=[
        gr.Dropdown(["Male", "Female"], label="Gender", value="Male"),
        gr.Dropdown(["Low", "Medium", "High"], label="Physical Activity Level", value="Medium"),
        gr.Dropdown(["Private", "Government", "Unknown"], label="Insurance Type", value="Private"),
        gr.Dropdown(["Semi-Urban", "Urban", "Rural"], label="City Type", value="Urban"),
        gr.Radio(["Yes", "No"], label="Smoker", value="No"),
        gr.Radio(["Yes", "No"], label="Diabetes", value="No"),
        gr.Radio(["Yes", "No"], label="Hypertension", value="No"),
        gr.Radio(["Yes", "No"], label="Heart Disease", value="No"),
        gr.Radio(["Yes", "No"], label="Asthma", value="No"),
        gr.Slider(0, 100, label="Age", value=df['age'].mean().round(-1), step=1),
        gr.Slider(0, 50, label="BMI", value=24),
        gr.Slider(0, 20000, label="Daily Steps", value=df['daily_steps'].mean().round(-2)),
        gr.Slider(0, 24, label="Sleep Hours", value=df['sleep_hours'].mean().round(0)),
        gr.Slider(0, 10, label="Stress Level", value=df['stress_level'].mean().round(0)),
        # step=1 keeps the count-type inputs on whole numbers, like Age above,
        # instead of relying on Gradio's default step for the slider range.
        gr.Slider(0, 20, label="Doctor Visits Per Year",
                  value=df['doctor_visits_per_year'].mean().round(0), step=1),
        gr.Slider(0, 20, label="Hospital Admissions",
                  value=df['hospital_admissions'].mean().round(0), step=1),
        gr.Slider(0, 100, label="Medication Count",
                  value=df['medication_count'].mean().round(0), step=1),
        gr.Slider(0, 100, label="Insurance Coverage %",
                  value=df['insurance_coverage_pct'].mean().round(-1)),
        gr.Slider(0, 100000, label="Previous Year Cost",
                  value=df['previous_year_cost'].mean().round(-2)),
    ],
    outputs=[
        gr.Number(label='Cost prediction'),
        gr.Textbox(label='Explanation', lines=5),
    ],
    title='Annual Medication Cost Prediction',
    examples=examples,
)

# share=True publishes a temporary public URL; debug=True keeps the cell
# running so errors and logs stay visible until it is interrupted.
gradio_interface.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://71e32482e88a324983.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7862 <> https://71e32482e88a324983.gradio.live


