<a href="https://colab.research.google.com/github/Jiayu415/Biostat285-Project/blob/main/Biostats_285_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Preprocessing (filtered_chronic_disease_patients.csv)

In [None]:
import pandas as pd
import numpy as np

# ✅ Load datasets
# NOTE(review): the column names used below (anchor_age, icd_code, race) look like
# MIMIC-IV's patients/admissions/diagnoses_icd tables -- confirm against the data.
patients_df = pd.read_csv('patients.csv.gz')
admissions_df = pd.read_csv('admissions.csv.gz')
diagnoses_df = pd.read_csv('diagnoses_icd.csv.gz')  # Ensure this dataset contains 'subject_id'

# ✅ Debug: Print column names to ensure they are correctly loaded
print("\n📌 Patients DF Columns:", patients_df.columns.tolist())
print("\n📌 Diagnoses DF Columns:", diagnoses_df.columns.tolist())

# ✅ Ensure column names have no extra spaces
# All three tables are normalised the same way so the merges below agree on key names.
patients_df.columns = patients_df.columns.str.strip().str.lower()
admissions_df.columns = admissions_df.columns.str.strip().str.lower()
diagnoses_df.columns = diagnoses_df.columns.str.strip().str.lower()

# ✅ Verify 'subject_id' exists in both datasets
if 'subject_id' not in diagnoses_df.columns:
    raise KeyError("❌ 'subject_id' is missing in diagnoses_df! Check if you have the right dataset.")

# ✅ Check correct column names in patients_df
actual_age_col = 'anchor_age' if 'anchor_age' in patients_df.columns else 'age'
actual_gender_col = 'gender' if 'gender' in patients_df.columns else 'sex'

# ✅ Chronic disease ICD codes with corresponding disease names
# Each entry is a list of code *prefixes*; numeric prefixes are ICD-9 style and
# letter-led prefixes are ICD-10 style.
disease_codes = {
    'diabetes': ['250', 'E08', 'E09', 'E10', 'E11', 'E13'],
    'heart_disease': ['410', '411', '412', '413', '414', 'I20', 'I21', 'I22', 'I23', 'I24', 'I25'],
    'kidney_disease': ['580', '581', '585', 'N17', 'N18', 'N19'],
    'asthma': ['493', 'J45'],
    'depression': ['296.2', '296.3', '300.4', 'F32', 'F33', 'F34'],
    'arthritis': ['710', '711', '712', '713', '714', 'M05', 'M06', 'M07', 'M15', 'M16', 'M17', 'M18', 'M19'],
    # 140-145 plus C00-C97, leaving out C27-C29 and C59 exactly as the hand-written list did.
    'cancer': ['140', '141', '142', '143', '144', '145']
              + [f'C{n:02d}' for n in range(98) if n not in (27, 28, 29, 59)],
    'stroke': ['430', '431', '434', '436', 'I60', 'I61', 'I62', 'I63', 'I64', 'I69']
}

# ✅ Initialize empty list to collect filtered disease data
filtered_diseases_df_list = []

# Compare on a dot-free, whitespace-free string form of the code. The dotted
# prefixes above ('296.2', '296.3', '300.4') can only match if the data also
# stores the dot; stripping dots on both sides makes them match either
# representation. astype(str) also keeps missing codes from producing NaN in
# the boolean mask.
icd_codes = diagnoses_df['icd_code'].astype(str).str.strip().str.replace('.', '', regex=False)

# ✅ Loop through each disease group and filter diagnoses
for disease, codes in disease_codes.items():
    prefixes = tuple(code.replace('.', '') for code in codes)
    # .copy() so the new column is written to an independent frame rather than
    # to a slice of diagnoses_df (avoids SettingWithCopyWarning).
    disease_data = diagnoses_df[icd_codes.str.startswith(prefixes)].copy()
    disease_data['disease_name'] = disease
    filtered_diseases_df_list.append(disease_data)

# ✅ Concatenate all disease data into a single DataFrame
filtered_diseases_df = pd.concat(filtered_diseases_df_list, axis=0)

# ✅ Merge with patient demographics
patient_data = pd.merge(filtered_diseases_df,
                        patients_df[['subject_id', actual_gender_col, actual_age_col]],
                        on='subject_id', how='left')

# ✅ Merge with race from admissions
# Keep one race row per subject before merging. Joining the raw admissions
# table would pair every diagnosis row with every admission of that subject,
# only for the extra rows to be thrown away by the de-duplication below.
race_by_subject = admissions_df[['subject_id', 'race']].drop_duplicates(subset=['subject_id'])
patient_data = pd.merge(patient_data, race_by_subject,
                        on='subject_id', how='left')

print(f"\n✅ Patient Data (after merge) Shape: {patient_data.shape}")

# ✅ Drop duplicates
# One row per subject: a subject matching several disease groups keeps only the
# first match, in the order the groups appear in disease_codes.
patient_data = patient_data.drop_duplicates(subset=['subject_id'])

# ✅ Randomly generate vitals & exercise data
# These columns are synthetic: they are drawn independently of the diagnosis.
np.random.seed(42)  # Reproducibility
patient_data['systolic_bp'] = np.random.randint(90, 180, size=len(patient_data))
patient_data['diastolic_bp'] = np.random.randint(60, 120, size=len(patient_data))
patient_data['heart_rate'] = np.random.randint(50, 120, size=len(patient_data))
patient_data['blood_sugar'] = np.random.randint(70, 200, size=len(patient_data))
patient_data['pulse'] = np.random.randint(60, 100, size=len(patient_data))
patient_data['exercise'] = np.random.choice([0, 1], size=len(patient_data))  # 0 = No, 1 = Yes
patient_data['family history'] = np.random.choice([0, 1], size=len(patient_data))  # 0 = No, 1 = Yes

# ✅ Keep only final columns
final_columns = ['subject_id', actual_gender_col, actual_age_col, 'race', 'disease_name',
                 'systolic_bp', 'diastolic_bp', 'heart_rate', 'blood_sugar', 'pulse', 'exercise', 'family history']
final_data = patient_data[final_columns]

# ✅ Save final dataset
final_data.to_csv('filtered_chronic_disease_patients.csv', index=False)

# ✅ Print sample data
print("\n✅ Final Merged Data Sample:")
print(final_data.head())

## Patients Health Overview Dashboard

In [None]:
import ipywidgets as widgets
from IPython.display import display, HTML

def assess_health_status(blood_sugar=None, systolic_bp=None, diastolic_bp=None, heart_rate=None):
    """Build an HTML banner summarising the supplied vital signs.

    Each reading that is given is classified as normal, at risk (yellow) or
    critical (red), and a sentence about it is added to the explanation. The
    banner shows the most severe finding; when several findings share the top
    severity, the one checked last (blood sugar, then blood pressure, then
    heart rate) is shown.

    Args:
        blood_sugar: Blood sugar reading; skipped when None.
        systolic_bp: Systolic pressure; blood pressure is only assessed when
            both systolic and diastolic values are given.
        diastolic_bp: Diastolic pressure; see ``systolic_bp``.
        heart_rate: Heart rate reading; skipped when None.

    Returns:
        An ``IPython.display.HTML`` object with the coloured status banner
        followed by the per-reading explanation.
    """
    status = "Normal"
    color = "green"
    explanation = ""

    # Abnormal readings as (severity, status, colour); 1 = at risk, 2 = critical.
    findings = []

    if blood_sugar is not None:
        if blood_sugar > 180:
            findings.append((2, "Diabetes (Critical)", "red"))
            explanation += "Blood Sugar is critically high. Possible Diabetes.<br>"
        elif blood_sugar > 140:
            findings.append((1, "Pre-Diabetes (At Risk)", "yellow"))
            explanation += "Blood Sugar is elevated. Risk of Diabetes.<br>"
        else:
            explanation += "Blood Sugar is in the normal range.<br>"

    if systolic_bp is not None and diastolic_bp is not None:
        if systolic_bp > 160 or diastolic_bp > 100:
            findings.append((2, "Hypertension (Critical)", "red"))
            explanation += "Blood Pressure is critically high. Possible Hypertension.<br>"
        elif systolic_bp > 140 or diastolic_bp > 90:
            findings.append((1, "Pre-Hypertension (At Risk)", "yellow"))
            explanation += "Blood Pressure is elevated. Risk of Hypertension.<br>"
        else:
            explanation += "Blood Pressure is in the normal range.<br>"

    if heart_rate is not None:
        if heart_rate > 110:
            findings.append((2, "Tachycardia (Critical)", "red"))
            explanation += "Heart Rate is critically high. Possible Tachycardia.<br>"
        elif heart_rate > 100:
            findings.append((1, "Elevated Heart Rate (At Risk)", "yellow"))
            explanation += "Heart Rate is slightly elevated.<br>"
        else:
            explanation += "Heart Rate is in the normal range.<br>"

    # Only let a finding replace the banner if it is at least as severe as the
    # one already chosen, so a later "at risk" reading can never hide an
    # earlier "critical" one.
    worst = 0
    for severity, finding_status, finding_color in findings:
        if severity >= worst:
            worst = severity
            status = finding_status
            color = finding_color

    html = f"""
    <div style='padding: 15px; border-radius: 5px; background-color: {color}; color: white; font-weight: bold; text-align: center;'>
        Health Status: {status}
    </div>
    <p>{explanation}</p>
    """
    return HTML(html)


# Size the label to its text; with the default label width a description as
# long as "Blood Sugar (mg/dL):" gets cut off.
_label_style = {'description_width': 'initial'}

blood_sugar_input = widgets.FloatText(value=88, description="Blood Sugar (mg/dL):", step=1, style=_label_style)
systolic_bp_input = widgets.FloatText(value=120, description="Systolic BP:", step=1, style=_label_style)
diastolic_bp_input = widgets.FloatText(value=80, description="Diastolic BP:", step=1, style=_label_style)
heart_rate_input = widgets.FloatText(value=75, description="Heart Rate:", step=1, style=_label_style)

# Dedicated output area for the banner, so every check replaces the previous
# result instead of stacking a new banner under the cell.
status_output = widgets.Output()


def update_dashboard(_):
    """Re-assess the current widget values and show the result in ``status_output``.

    The single argument is whatever the caller passes (the clicked button when
    used as an ``on_click`` handler) and is ignored.
    """
    # wait=True defers the clear until the new banner arrives, avoiding flicker.
    status_output.clear_output(wait=True)
    with status_output:
        display(assess_health_status(
            blood_sugar=blood_sugar_input.value,
            systolic_bp=systolic_bp_input.value,
            diastolic_bp=diastolic_bp_input.value,
            heart_rate=heart_rate_input.value
        ))


button = widgets.Button(description="Check Health Status")
button.on_click(update_dashboard)


display(HTML("<h2>🏥 Health Status Overview Dashboard</h2>"))
display(blood_sugar_input, systolic_bp_input, diastolic_bp_input, heart_rate_input, button, status_output)


# Show the assessment for the default values straight away.
update_dashboard(None)

Patient Vitals Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta


# Fixed seed so the simulated readings are the same on every run.
np.random.seed(42)

# Seven consecutive daily timestamps, the last one being "now".
dates = list(pd.date_range(end=datetime.today(), periods=7))

# One simulated reading per day for each vital sign. The draws happen in the
# order listed here, which determines the values produced under the seed.
data = {
    'date': dates,
    **{
        vital: np.random.randint(lowest, upper_bound, size=7)
        for vital, lowest, upper_bound in (
            ('systolic_bp', 110, 160),
            ('diastolic_bp', 70, 100),
            ('heart_rate', 60, 110),
            ('blood_sugar', 80, 180),
            ('pulse', 60, 100),
        )
    },
}

df = pd.DataFrame(data)


def plot_vital_trends(df):
    """Plot every vital sign in ``df`` against ``df['date']`` and show the figure.

    Draws five line charts (systolic BP, diastolic BP, heart rate, blood sugar,
    pulse) in the first five cells of a 3x2 subplot grid.

    Args:
        df: DataFrame with a 'date' column and the columns 'systolic_bp',
            'diastolic_bp', 'heart_rate', 'blood_sugar' and 'pulse'.
    """
    # (column, legend label, title, y-axis unit, line colour).
    # 'C0' is the first colour of the active colour cycle, i.e. what a fresh
    # axes uses when no colour is given.
    panels = [
        ('systolic_bp', 'Systolic BP', 'Systolic BP Trend', 'mmHg', 'C0'),
        ('diastolic_bp', 'Diastolic BP', 'Diastolic BP Trend', 'mmHg', 'orange'),
        ('heart_rate', 'Heart Rate', 'Heart Rate Trend', 'bpm', 'green'),
        ('blood_sugar', 'Blood Sugar', 'Blood Sugar Trend', 'mg/dL', 'red'),
        ('pulse', 'Pulse', 'Pulse Trend', 'bpm', 'purple'),
    ]

    plt.figure(figsize=(12, 8))

    for position, (column, label, title, unit, color) in enumerate(panels, start=1):
        plt.subplot(3, 2, position)
        plt.plot(df['date'], df[column], marker='o', linestyle='-', label=label, color=color)
        plt.title(title)
        plt.ylabel(unit)
        plt.xticks(rotation=45)
        plt.grid(True)

    plt.tight_layout()
    plt.show()


plot_vital_trends(df)

 Medication Adherence Tracking and Visualization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Simulated medication adherence for every day of February 2024.
np.random.seed(42)
dates = pd.date_range("2024-02-01", "2024-02-29", freq='D')  # one entry per calendar day
# 1 = dose taken, 0 = dose missed; each day is drawn with an 80% chance of "taken".
adherence = np.random.choice([1, 0], size=len(dates), p=[0.8, 0.2])

med_adherence_df = pd.DataFrame(dict(date=dates, adherence=adherence))

# 1️⃣ Bar Chart - Adherence Summary
def plot_adherence_bar_chart(med_adherence_df):
    """Show a bar chart of how many days the medication was missed vs. taken.

    Args:
        med_adherence_df: DataFrame with an 'adherence' column holding 1 for a
            day the dose was taken and 0 for a day it was missed.
    """
    # Reindex onto both outcomes so the chart always has a 'Missed' and a
    # 'Taken' bar. Without this, a month with only one outcome yields a
    # one-row count and assigning the two labels below raises a length error.
    adherence_summary = med_adherence_df['adherence'].value_counts().reindex([0, 1], fill_value=0)
    adherence_summary.index = ['Missed', 'Taken']

    plt.figure(figsize=(8, 5))
    sns.barplot(x=adherence_summary.index, y=adherence_summary.values, palette=['#FF6F61', '#4CAF50'])
    plt.title('Medication Adherence Summary - February 2024', fontsize=14)
    plt.ylabel('Days Count', fontsize=12)
    plt.xlabel('Medication Status', fontsize=12)
    plt.xticks(fontsize=11)
    plt.yticks(fontsize=11)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

# 2️⃣ Calendar Heatmap - Daily Adherence (Improved Version)
def plot_adherence_calendar(med_adherence_df):
    """Show a one-row heatmap with one cell per day, coloured by adherence.

    The caller's DataFrame is left untouched.

    Args:
        med_adherence_df: DataFrame with a datetime 'date' column and an
            'adherence' column holding 1 (taken) or 0 (missed) for each day.
    """
    # Work in date order on local values only: no helper columns are written
    # back to the caller's frame, and cells, annotations and tick labels all
    # come from the same ordered rows.
    ordered = med_adherence_df.sort_values('date')
    day_numbers = ordered['date'].dt.day.tolist()
    calendar_data = ordered['adherence'].to_numpy().reshape(1, -1)
    status_labels = np.where(calendar_data == 1, 'Taken', 'Missed')

    plt.figure(figsize=(12, 3))  # Wider for better readability

    sns.heatmap(calendar_data,
                annot=status_labels,
                fmt='',
                cmap=sns.color_palette(["#FF6F61", "#4CAF50"]),  # Red for missed, green for taken
                vmin=0, vmax=1,  # pin the scale so 0 is always red and 1 always green
                cbar=False,
                linewidths=1, linecolor='white', annot_kws={"size": 10, "rotation": 45, "ha": 'center'})

    plt.title('Medication Adherence Calendar - February 2024', fontsize=14, pad=15)
    plt.xlabel('day')
    # Tick count follows the data passed in, not a module-level variable.
    plt.xticks(ticks=np.arange(0.5, len(day_numbers)), labels=day_numbers, fontsize=10)
    plt.yticks([])  # No y-axis needed
    plt.show()

# Run the plots
plot_adherence_bar_chart(med_adherence_df)
plot_adherence_calendar(med_adherence_df)


## Disease Prediction Dashboard and Likelihood Prediction Dashboard

Disease prediction dashboard

In [None]:
import gradio as gr
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression

# Read the patient table and give 'family history' an identifier-friendly name.
data3 = pd.read_csv('filtered_chronic_disease_patients.csv').rename(
    columns={'family history': 'family_history'}
)

# Replace each categorical column with integer codes. The same encoder object
# is re-fitted per column, so it only remembers the last column it saw.
encoder = LabelEncoder()
for _column in ('gender', 'race', 'exercise', 'family_history'):
    data3[_column] = encoder.fit_transform(data3[_column])

# Model inputs: age plus four vital-sign readings.
X = data3[['anchor_age', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'blood_sugar']]

# Target: disease name as integer class ids. A fresh encoder is used so that
# `encoder` ends up holding the disease-name mapping.
encoder = LabelEncoder()
y = encoder.fit_transform(data3['disease_name'])

# Show how many rows fall into each class.
print("Class distribution in y:", np.unique(y, return_counts=True))

# Standardise the features; `scaler` is kept for transforming new inputs later.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit the classifier on the standardised features.
model = LogisticRegression()
model.fit(X_scaled, y)

# Prediction Function
def predict_disease(anchor_age, systolic_bp, diastolic_bp, heart_rate, blood_sugar):
    """Return the model's probability for every disease class as display text.

    The target has one class per disease name, so a single column of
    ``predict_proba`` is the probability of just one of those diseases, not of
    "disease" in general. All classes are therefore listed, most likely first.

    Args:
        anchor_age: Patient age.
        systolic_bp: Systolic blood pressure.
        diastolic_bp: Diastolic blood pressure.
        heart_rate: Heart rate.
        blood_sugar: Blood sugar.

    Returns:
        A multi-line string: a header followed by one "name: probability" line
        per class.
    """
    # Named columns match what the scaler was fitted on, which avoids
    # scikit-learn's missing-feature-names warning on every call.
    input_data = pd.DataFrame(
        [[anchor_age, systolic_bp, diastolic_bp, heart_rate, blood_sugar]],
        columns=['anchor_age', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'blood_sugar'],
    )
    input_scaled = scaler.transform(input_data)
    probabilities = model.predict_proba(input_scaled)[0]
    # predict_proba columns follow model.classes_; map those ids back to names.
    disease_names = encoder.inverse_transform(model.classes_)
    ranked = sorted(zip(disease_names, probabilities), key=lambda pair: pair[1], reverse=True)
    lines = [f"{name}: {probability:.2f}" for name, probability in ranked]
    return "Probability of Disease:\n" + "\n".join(lines)

# Gradio Interface: one slider per argument of predict_disease, in call order.
_probability_sliders = [
    gr.Slider(lowest, highest, value=start, label=caption)
    for lowest, highest, start, caption in (
        (0, 100, 40, "Age"),
        (50, 200, 120, "Systolic BP"),
        (30, 120, 80, "Diastolic BP"),
        (40, 150, 70, "Heart Rate"),
        (50, 300, 100, "Blood Sugar"),
    )
]

iface = gr.Interface(
    fn=predict_disease,
    inputs=_probability_sliders,
    outputs="text",
    live=True,
)

iface.launch()


Risk prediction Dashboard

In [None]:
import gradio as gr
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Read the patient table; rename 'family history' so it has no space in it.
data4 = pd.read_csv('filtered_chronic_disease_patients.csv')
data4 = data4.rename(columns={'family history': 'family_history'})

# Turn the categorical columns into integer codes, re-fitting one shared
# encoder for each column in turn.
encoder = LabelEncoder()
for _column in ['gender', 'race', 'exercise', 'family_history']:
    data4[_column] = encoder.fit_transform(data4[_column])

# Features used by the model: age and four vital-sign readings.
X = data4[['anchor_age', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'blood_sugar']]

# Multi-class target: one integer id per disease name. `encoder` is rebound to
# a new instance so it can translate predictions back to names later.
encoder = LabelEncoder()
y = encoder.fit_transform(data4['disease_name'])

# Standardise the features and keep the fitted scaler for new inputs.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit the multi-class classifier.
model = RandomForestClassifier()
model.fit(X_scaled, y)

# Prediction Function
def predict_disease(anchor_age, systolic_bp, diastolic_bp, heart_rate, blood_sugar):
    """Predict the single most likely disease name for one set of readings.

    Args:
        anchor_age: Patient age.
        systolic_bp: Systolic blood pressure.
        diastolic_bp: Diastolic blood pressure.
        heart_rate: Heart rate.
        blood_sugar: Blood sugar.

    Returns:
        The text "Predicted Disease: <name>".
    """
    # Named columns match what the scaler was fitted on, which avoids
    # scikit-learn's missing-feature-names warning on every call.
    input_data = pd.DataFrame(
        [[anchor_age, systolic_bp, diastolic_bp, heart_rate, blood_sugar]],
        columns=['anchor_age', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'blood_sugar'],
    )
    input_scaled = scaler.transform(input_data)
    prediction = model.predict(input_scaled)
    disease_name = encoder.inverse_transform(prediction)[0]  # Convert back to disease name
    return f"Predicted Disease: {disease_name}"

# Gradio Interface: sliders are listed in the order predict_disease takes its arguments.
_disease_sliders = [
    gr.Slider(lowest, highest, value=start, label=caption)
    for lowest, highest, start, caption in (
        (0, 100, 40, "Age"),
        (50, 200, 120, "Systolic BP"),
        (30, 120, 80, "Diastolic BP"),
        (40, 150, 70, "Heart Rate"),
        (50, 300, 100, "Blood Sugar"),
    )
]

iface = gr.Interface(
    fn=predict_disease,
    inputs=_disease_sliders,
    outputs="text",
    live=True,
)

iface.launch()

## Cost Estimation Dashboard

In [None]:
!pip install pandas scikit-learn gradio numpy

import pandas as pd
import numpy as np
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# ---- Load Dataset ----
file_path = "filtered_chronic_disease_patients.csv"  # Update with your actual path

try:
    df = pd.read_csv(file_path)
except Exception as e:
    # Report and stop. Carrying on would either fail on an undefined `df` or,
    # worse, silently train on a `df` left over from an earlier cell.
    print("Error loading dataset:", e)
    raise
else:
    print("Dataset loaded successfully.")

# ---- Cost Estimation Model ----
# The cost column is synthetic: it is drawn at random, independently of the
# features. Seeded like the other simulated data in this notebook so that
# reruns give the same numbers.
np.random.seed(42)
df['medical_cost'] = np.random.randint(500, 5000, size=len(df))  # Generating synthetic cost data

X = df[['systolic_bp', 'diastolic_bp', 'heart_rate', 'blood_sugar']]
y = df['medical_cost']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fixed random_state so the fitted forest, and therefore the reported MAE, is reproducible.
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)
print("Cost Estimation Model MAE:", mean_absolute_error(y_test, y_pred))

# ---- Gradio Interface ----
def predict_cost(systolic_bp, diastolic_bp, heart_rate, blood_sugar):
    """Estimate the medical cost for one set of readings and format it as text.

    Args:
        systolic_bp: Systolic blood pressure, or None if the field was left empty.
        diastolic_bp: Diastolic blood pressure, or None if left empty.
        heart_rate: Heart rate, or None if left empty.
        blood_sugar: Blood sugar, or None if left empty.

    Returns:
        "Estimated Medical Cost: $<amount>" with two decimals, or a message
        naming the inputs that are still missing.
    """
    readings = {
        'systolic_bp': systolic_bp,
        'diastolic_bp': diastolic_bp,
        'heart_rate': heart_rate,
        'blood_sugar': blood_sugar,
    }

    # An empty number field is passed in as None; ask for it instead of letting
    # the model fail on a non-numeric input.
    missing = [name for name, value in readings.items() if value is None]
    if missing:
        return "Please enter a value for: " + ", ".join(missing)

    # A one-row frame with the training column names, in the training order,
    # avoids scikit-learn's missing-feature-names warning.
    input_data = pd.DataFrame([readings])
    estimated_cost = reg.predict(input_data)[0]
    return f"Estimated Medical Cost: ${estimated_cost:.2f}"

# Create Gradio UI: one numeric field per argument of predict_cost, in call order.
_metric_fields = [
    gr.Number(label=caption)
    for caption in ("Systolic BP", "Diastolic BP", "Heart Rate", "Blood Sugar")
]

interface = gr.Interface(
    fn=predict_cost,
    inputs=_metric_fields,
    outputs="text",
    title="Medical Cost Estimator",
    description="Enter your health metrics to estimate medical costs.",
)

# Start serving the app.
interface.launch()