## Build the bot for ADA in Chicago 2025

In [None]:
import streamlit as st
import pandas as pd
import os
import pickle
import numpy as np
import openai
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import NotFittedError
from scipy.spatial.distance import cdist
from PIL import Image

# ✅ 1. Load Precomputed Data (Cluster Memberships and Features)
# The cluster label column is only read after a user submits the form, so its
# presence is verified here at startup instead of surfacing later as a KeyError.
REQUIRED_CLUSTER_COLUMN = "Cluster_LIME_Ordered"

try:
    df_clusters = pd.read_csv("cluster_mapping.csv", encoding="utf-8")

    # Load pre-trained Logistic Regression model and scaler.
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only deploy .pkl files that you produced yourself.
    with open("logistic_regression.pkl", "rb") as f:
        clf = pickle.load(f)
    with open("scaler.pkl", "rb") as f:
        scaler = pickle.load(f)

except Exception as e:
    # Top-level boundary: report any load failure in the UI and halt the script.
    st.error(f"❌ Error loading files: {e}")
    st.stop()

if REQUIRED_CLUSTER_COLUMN not in df_clusters.columns:
    st.error(f"❌ `cluster_mapping.csv` is missing the required column `{REQUIRED_CLUSTER_COLUMN}`.")
    st.stop()

# ✅ 2. Debugging: Ensure Model and Scaler Are Loaded Correctly
# A fitted linear classifier exposes `coef_`; its absence means the pickle holds
# an untrained estimator.
if not hasattr(clf, "coef_"):
    st.error("❌ The loaded classifier is not trained! Please re-train and save it.")
    st.stop()

if not isinstance(scaler, StandardScaler):
    st.error("❌ The loaded scaler is not a StandardScaler instance! Check `scaler.pkl`.")
    st.stop()

# An unfitted StandardScaler passes the isinstance check but would raise
# NotFittedError on the first `transform` call, so check for fitted state too.
if not hasattr(scaler, "scale_"):
    st.error("❌ The loaded scaler has not been fitted! Please re-fit and save `scaler.pkl`.")
    st.stop()

# ✅ 3. UI Styling
# Page-level CSS: a padded, rounded main container plus the `.header` and
# `.description` classes used by the title block further down.
_PAGE_CSS = """
    <style>
    .main {
        background-color: #f5f5f5;
        padding: 2rem;
        border-radius: 10px;
    }
    .header {
        font-size: 2.5rem;
        font-weight: bold;
        text-align: center;
        color: #2c3e50;
        margin-bottom: 1rem;
    }
    .description {
        font-size: 1.1rem;
        color: #555555;
        text-align: center;
        margin-bottom: 2rem;
    }
    </style>
    """

# unsafe_allow_html is required for Streamlit to emit the raw <style> tag.
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

# ✅ 4. Load and Display the Image
# The illustration is decorative, so a missing or unreadable file should not
# take the whole app down; fall back to rendering the header without it.
try:
    image = Image.open("predictive_clustering_with_diseases_20241226_ADA.jpg")
    image = image.resize((500, 500))
except OSError as e:
    # FileNotFoundError and PIL's UnidentifiedImageError are both OSError subclasses.
    image = None
    st.warning(f"⚠️ Could not load the header image: {e}")

# Layout with header and image (3:2 width ratio)
col1, col2 = st.columns([3, 2])
with col1:
    st.markdown('<div class="header">Understand Your Diabetes Risk</div>', unsafe_allow_html=True)
    st.markdown(
        """
        <div class="description">
        Enter your health details to assess your Type 2 Diabetes risk and get personalized health advice based on advanced machine learning analysis.
        </div>
        """, unsafe_allow_html=True
    )
with col2:
    if image is not None:
        st.image(image, use_container_width=True)

# ✅ 5. User Input Form
def _text_input_row(*fields):
    """Render one row of text inputs, one column per (label, default) pair.

    Returns the entered strings in the same left-to-right order as ``fields``.
    """
    entered = []
    for column, (label, default) in zip(st.columns(len(fields)), fields):
        with column:
            entered.append(st.text_input(label, default))
    return entered


# All values are collected as raw strings; numeric parsing happens on submit.
with st.form("user_input_form"):
    fbg_input, hba1c_input, systolic_input = _text_input_row(
        ("Fasting Blood Glucose (mg/dL)", "100"),
        ("HbA1c (%)", "5.4"),
        ("Systolic BP (mmHg)", "120"),
    )

    # 🔹 Height and weight replace a direct BMI field
    diastolic_input, height_input, weight_input = _text_input_row(
        ("Diastolic BP (mmHg)", "80"),
        ("Height (cm)", "170"),
        ("Weight (kg)", "70"),
    )

    triglycerides_input, hdl_input, ldl_input = _text_input_row(
        ("Triglycerides (mg/dL)", "130"),
        ("HDL Cholesterol (mg/dL)", "55"),
        ("LDL Cholesterol (mg/dL)", "100"),
    )

    ast_input, alt_input, gamma_input = _text_input_row(
        ("AST (GOT) (U/L)", "30"),
        ("ALT (GPT) (U/L)", "30"),
        ("Gamma-GTP (U/L)", "25"),
    )

    egfr_input, age_input = _text_input_row(
        ("eGFR (mL/min/1.73m²)", "90"),
        ("Age", "50"),
    )

    sex = st.selectbox("Sex", ["Male", "Female"])

    submitted = st.form_submit_button("Submit")

if submitted:
    # Column order the scaler and classifier were fitted with; also used to pull
    # the matching columns out of the cluster mapping below.
    expected_features = ["Systolic_BP", "Diastolic_BP", "BMI", "Triglycerides",
                         "HDL_Cholesterol", "LDL_Cholesterol", "AST(GOT)", "ALT(GPT)",
                         "Gamma_GTP", "eGFR", "Age", "Sex"]

    # ✅ 6. Convert User Input to DataFrame
    # Fasting glucose and HbA1c are collected by the form but are not among the
    # model features, so they are not parsed here.
    try:
        height_cm = float(height_input.strip())
        weight_kg = float(weight_input.strip())
        measurements = {
            "Systolic_BP": float(systolic_input.strip()),
            "Diastolic_BP": float(diastolic_input.strip()),
            "Triglycerides": float(triglycerides_input.strip()),
            "HDL_Cholesterol": float(hdl_input.strip()),
            "LDL_Cholesterol": float(ldl_input.strip()),
            "AST(GOT)": float(ast_input.strip()),
            "ALT(GPT)": float(alt_input.strip()),
            "Gamma_GTP": float(gamma_input.strip()),
            "eGFR": float(egfr_input.strip()),
        }
        # 🔹 Accept "50.0" as well as "50"; int() of inf raises OverflowError.
        age_years = int(float(age_input.strip()))
    except (ValueError, OverflowError):
        st.error("🚨 Please enter only numeric values in all input fields.")
        st.stop()

    # float() happily parses "nan" and "inf"; those would poison both the
    # prediction and the nearest-neighbour search, so reject them explicitly.
    if not np.all(np.isfinite([height_cm, weight_kg, *measurements.values()])):
        st.error("🚨 Please enter only numeric values in all input fields.")
        st.stop()

    # A zero height would raise ZeroDivisionError in the BMI formula, and
    # non-positive height/weight is physically meaningless.
    if height_cm <= 0 or weight_kg <= 0:
        st.error("🚨 Height and weight must be greater than zero.")
        st.stop()

    # 🔹 Compute BMI dynamically: kg / m^2 (height is entered in cm)
    bmi_value = weight_kg / ((height_cm / 100) ** 2)

    user_row = {
        **measurements,
        "BMI": bmi_value,
        "Age": age_years,
        "Sex": 1 if sex == "Male" else 0,
    }
    # `columns=` fixes the column order to the one the scaler expects.
    user_data = pd.DataFrame([user_row], columns=expected_features)

    # ✅ 7. Standardize User Data
    X_user_scaled = scaler.transform(user_data)

    # ✅ 8. Compute Risk Probability (probability of the positive class)
    try:
        risk_probability = clf.predict_proba(X_user_scaled)[:, 1][0]
    except NotFittedError:
        st.error("Error: The classifier has not been fitted. Please re-train and save the model before running the app.")
        st.stop()

    # ✅ 9. Find the Closest Match Using Euclidean Distance
    # Distances are measured in standardized feature space so that no single
    # unit (e.g. mg/dL vs. years) dominates.
    df_clusters_filtered = df_clusters[expected_features]
    X_scaled = scaler.transform(df_clusters_filtered)

    distances = cdist(X_user_scaled, X_scaled, metric='euclidean')
    # `distances` has shape (1, n_rows); index the single row explicitly.
    closest_idx = int(np.argmin(distances[0]))
    matched_individual = df_clusters.iloc[closest_idx]

    # ✅ 10. Assign Cluster Membership (inherit the nearest individual's label)
    user_cluster = matched_individual['Cluster_LIME_Ordered']

    # ✅ 11. Display Cluster and Risk Information
    st.write("### 🏥 Your Risk Level")
    st.markdown(f"**Your calculated BMI is:** `{bmi_value:.1f}`")

    # ✅ Display user cluster info with colors (already implemented)

In [25]:
import pandas as pd
from lime_clustering_analysis import LimeClusteringAnalysis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pickle
import numpy as np
import os

# 1. Load the dataset
data = pd.read_csv('/Users/wang/Library/CloudStorage/Dropbox/000wsd/research/clustering/data/JMDC_T2D.csv', encoding='utf-8')

# 2. Preprocess the target variable: map "T2D" to 1 and "Non-T2D" to 0
data['T2D'] = data['T2D'].map({'T2D': 1, 'Non-T2D': 0})
# `.map` turns any label outside the dictionary into NaN; fail here with a clear
# message instead of deep inside the estimator.
if data['T2D'].isna().any():
    raise ValueError("Column 'T2D' contains missing values or labels other than 'T2D' / 'Non-T2D'.")

# 3. Define the features used by the model.
# These feature names must match exactly with the columns in your dataset.
# 'Fasting_Blood_Glucose' and 'HbA1c' are deliberately excluded because they are
# highly correlated with the target variable.
features = [
    'Systolic_BP', 'Diastolic_BP', 'BMI',
    'Triglycerides', 'HDL_Cholesterol', 'LDL_Cholesterol', 'AST(GOT)', 'ALT(GPT)',
    'Gamma_GTP', 'eGFR', 'Age', 'Sex'
]
X = data[features]
y = data['T2D']

# 4. Split the dataset into training and testing sets BEFORE scaling, so the
# held-out rows cannot influence the scaler's mean/variance (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Standardize the features: fit on the training split only, then apply the
# same transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so that any later cell relying on a scaled copy of the full feature
# matrix still finds `X_scaled`.
X_scaled = scaler.transform(X)

# 6. Train a Logistic Regression classifier.
clf = LogisticRegression(random_state=42, max_iter=1000)  # Increased iterations for convergence
clf.fit(X_train, y_train)

# 7. Evaluate the model on the test set.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

# 8. Save the trained classifier and scaler to disk for production use.
with open('logistic_regression.pkl', 'wb') as f:
    pickle.dump(clf, f)

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("✅ Model and scaler saved successfully!")

Test Accuracy: 0.84
✅ Model and scaler saved successfully!


In [6]:
def get_individual_cluster_mapping(lca_instance):
    """
    Build a per-individual table of ID, ordered cluster label, and feature values.

    Parameters:
        - lca_instance (LimeClusteringAnalysis): analysis object exposing
          `X_train`, `X_test`, `ID_test`, `lime_importances_df`, and
          `compute_and_order_cluster_risks()`.

    Returns:
        - pd.DataFrame: a copy of `X_test` restricted to the columns
          '加入者id' (member ID), 'Cluster_LIME_Ordered', and every column of
          `X_train`, in that order. `X_test` itself is not modified.

    NOTE(review): both column assignments align on the DataFrame index when the
    right-hand side is a pandas object — confirm that `X_test`, `ID_test`, and
    `lime_importances_df` share the same index.
    """
    ordered_label = 'Cluster_Original_Ordered'

    # The ordered labels only exist once cluster risks have been computed.
    if ordered_label not in lca_instance.lime_importances_df.columns:
        print("Running `compute_and_order_cluster_risks()` to ensure LIME cluster assignments exist.")
        lca_instance.compute_and_order_cluster_risks()

    # Work on a copy so the analysis object's test set stays untouched.
    mapping = lca_instance.X_test.copy()
    mapping['加入者id'] = lca_instance.ID_test
    # Re-read the attribute here: the call above may have replaced the frame.
    mapping['Cluster_LIME_Ordered'] = lca_instance.lime_importances_df[ordered_label]

    # IDs and cluster labels first, followed by the training feature columns.
    output_columns = ['加入者id', 'Cluster_LIME_Ordered', *lca_instance.X_train.columns]
    return mapping[output_columns]

In [15]:
import os
# One flag drives both the constructor argument and the cache file name, so the
# analysis mode and the importances file it loads cannot drift apart.
explainall = True

# Initialize the LIME clustering analysis
lca = LimeClusteringAnalysis(data,
                             target_column="T2D",
                             explainall=explainall,
                             pca_components=8)

# Train logistic regression model (which initializes `scaler`)
lca.train_logistic_regression()

# Path to save the LIME importances (a separate cache file per `explainall` mode)
if explainall:
    lime_importances_path = '/Users/wang/Library/CloudStorage/Dropbox/000wsd/research/clustering/lime_importances_jmdc_sexage_explainall.pkl'
else:
    lime_importances_path = '/Users/wang/Library/CloudStorage/Dropbox/000wsd/research/clustering/lime_importances_jmdc_sexage.pkl'

# Reuse cached LIME importances when available; otherwise generate them and
# save them to the same path for the next run.
if os.path.exists(lime_importances_path):
    lca.load_lime_importances(lime_importances_path)
else:
    lca.generate_lime_importances(save_path=lime_importances_path)

Test accuracy of logistic model: 0.86
LIME importances loaded from /Users/wang/Library/CloudStorage/Dropbox/000wsd/research/clustering/lime_importances_jmdc_sexage_explainall.pkl


In [16]:
# Run the PCA step of the analysis; results are stored on `lca`
# (the method logs "PCA performed and stored.").
# NOTE(review): presumably uses the pca_components=8 passed to the constructor — confirm.
lca.perform_pca()

PCA performed and stored.


In [17]:
# Cluster into 7 groups. The disease-frequency table and the per-cluster advice
# in this notebook are keyed by cluster labels 0-6, so keep n_clusters in sync with them.
lca.perform_clustering(n_clusters=7)

Clustering performed with 7 clusters and stored.


In [18]:
# Relabel clusters by T2D risk and compute risks for centroids and individual
# data points (per the method's own log messages).
# NOTE(review): get_individual_cluster_mapping() expects a
# 'Cluster_Original_Ordered' column, which this step is presumed to add — confirm.
lca.compute_and_order_cluster_risks()


Cluster labels updated based on T2D risks.
T2D risks for centroids and individual data points computed and stored.


(       Fasting_Blood_Glucose     HbA1c  Systolic_BP  Diastolic_BP       BMI  \
 0                  -0.063598 -0.238005     1.520150      1.803728 -1.188497   
 1                  -0.494320 -0.238005     0.838954      1.169468 -0.224750   
 2                  -0.278959 -0.238005     1.043313     -0.552093 -0.358604   
 3                  -0.135385 -0.544486     0.089638     -0.280267 -0.653082   
 4                   0.438910  0.221716    -0.387199      0.263384  0.150040   
 ...                      ...       ...          ...           ...       ...   
 19948              -0.135385 -0.238005    -0.455319     -0.733310 -0.760165   
 19949               0.079976 -0.084765     1.792629      1.803728 -0.840477   
 19950              -0.566107 -0.238005    -2.226429     -1.911220 -1.161726   
 19951              -0.925042 -0.238005    -1.204635     -0.914527 -0.144438   
 19952              -0.637894  0.068476    -0.046601     -0.189659  0.016186   
 
        Triglycerides  HDL_Cholesterol

In [19]:
# Get the cluster mapping: member ID, ordered cluster label, and feature values
# for each row of the analysis object's test set.
clustered_df = get_individual_cluster_mapping(lca)

In [20]:
# Display the first few rows to sanity-check the ID, cluster label, and feature columns
print(clustered_df.head())

        加入者id  Cluster_LIME_Ordered  Fasting_Blood_Glucose  HbA1c  \
0  M000000502                     5                   99.0    5.5   
1  M000000688                     1                   93.0    5.5   
2  M000000827                     2                   96.0    5.5   
3  M000001396                     1                   98.0    5.3   
4  M000001463                     6                  106.0    5.8   

   Systolic_BP  Diastolic_BP   BMI  Triglycerides  HDL_Cholesterol  \
0        149.0          99.0  19.6          327.0             62.0   
1        139.0          92.0  23.2           71.0             67.0   
2        142.0          73.0  22.7           34.0             74.0   
3        128.0          76.0  21.6          114.0             79.0   
4        121.0          82.0  24.6          479.0             30.0   

   LDL_Cholesterol  AST(GOT)  ALT(GPT)  Gamma_GTP       eGFR        Age  Sex  
0            103.0     107.0      78.0       29.0  97.354570  57.083333    1  
1     

In [21]:
# Save the clustered data to a CSV file for reference in the app.
# index=False keeps the DataFrame's row index out of the file.
# NOTE(review): the file contains per-individual member IDs and health
# measurements — confirm it may be published before committing it to a public repository.
clustered_df.to_csv("cluster_mapping.csv", index=False)

AttributeError: 'LimeClusteringAnalysis' object has no attribute 'visualize_specified_diseases_stacked_over_clusters'

In [None]:
import pandas as pd

# Define the cluster labels and metabolic disease names
clusters = list(range(7))
metabolic_diseases = [
    'Angina pectoris', 'Cerebral infarction', 'Diabetic nephropathy', 'Diabetic retinopathy',
    'Hyperlipidemia', 'Hypertention', 'Hyperuricemia', 'Liver dysfunction',
    'Myocardial infarction', 'Type 2 diabetes'
]

# Define the relative frequencies for each disease in each cluster:
# one row per cluster (0-6), one value per disease in the order listed above.
# Deliberately NOT named `data`: that name holds the patient DataFrame used by
# other cells of this notebook, and rebinding it to a list would break them.
disease_frequencies = [
    [0.130283, 0.073699, 0.021656, 0.020957, 0.280126, 0.099546, 0.060077, 0.222145, 0.057981, 0.097800],
    [0.157262, 0.080173, 0.040395, 0.038853, 0.224175, 0.205365, 0.088807, 0.232192, 0.079556, 0.117175],
    [0.149978, 0.083953, 0.029952, 0.030826, 0.330127, 0.235899, 0.112156, 0.252514, 0.074115, 0.100568],
    [0.197802, 0.102157, 0.052503, 0.046398, 0.406593, 0.556777, 0.178673, 0.289784, 0.095645, 0.148148],
    [0.190981, 0.103006, 0.056145, 0.050840, 0.477896, 0.385942, 0.225464, 0.305482, 0.090186, 0.170203],
    [0.192355, 0.090423, 0.079737, 0.069051, 0.433621, 0.432799, 0.216605, 0.450062, 0.098233, 0.195643],
    [0.190948, 0.085337, 0.074493, 0.066007, 0.545969, 0.394625, 0.261198, 0.463932, 0.102782, 0.202735]
]

# Create the DataFrame (rows = clusters, columns = diseases)
df_clusters_diseases = pd.DataFrame(disease_frequencies, index=clusters, columns=metabolic_diseases)
df_clusters_diseases

### **Analysis of Health Risks for Each Cluster Based on Metabolic Disease Prevalence**

The table provides the relative frequencies of ten metabolic diseases across seven ordered clusters. Since the clusters represent increasing risk profiles (from *Healthy* to *Severe Obesity*), we analyze how the prevalence of these diseases changes across clusters.

#### **1. Cluster-wise Health Risk Analysis**
- **Cluster 0 (Healthy)**  
  - This group has the lowest prevalence of Type 2 Diabetes (9.78%), hypertension (9.95%), and diabetic complications.
  - Hyperlipidemia is relatively common (28.01%), suggesting potential early-stage metabolic concerns.

- **Cluster 1 (Early Imbalance)**  
  - Moderate increases in most disease frequencies, particularly hypertension (20.54%), suggesting early cardiovascular risk; hyperlipidemia is the exception, falling to 22.42%.
  - Type 2 Diabetes risk increases slightly (11.72%).

- **Cluster 2 (Overweight Mild)**  
  - Higher prevalence of metabolic issues: hyperlipidemia (33.01%) and hypertension (23.59%).
  - Rates of diabetic nephropathy (3.00%) and retinopathy (3.08%) are higher than in Cluster 0 but slightly lower than in Cluster 1.

- **Cluster 3 (Hypertensive Liver)**  
  - A **major jump** in hypertension (55.67%), indicating severe blood pressure issues.
  - Hyperlipidemia reaches 40.66%, and Type 2 Diabetes is now at 14.81%.
  - Liver dysfunction is significantly higher (28.97%), suggesting fatty liver disease.

- **Cluster 4 (Obese Metabolic)**  
  - Hypertension is markedly lower than in Cluster 3 (38.59% vs. 55.68%) but remains high.
  - Hyperlipidemia further increases (47.79%), showing worsening metabolic health.
  - Type 2 Diabetes prevalence reaches 17.02%, and diabetic complications become more frequent.

- **Cluster 5 (Severe Metabolic)**  
  - A further increase in **diabetic nephropathy (7.97%)** and **diabetic retinopathy (6.90%)**, suggesting more individuals with advanced diabetes.
  - Liver dysfunction surges (45.01%), indicating **high prevalence of fatty liver disease**.
  - Type 2 Diabetes reaches 19.56%, and hypertension remains high (43.27%).

- **Cluster 6 (Severe Obesity)**  
  - **The highest metabolic risk** group, with:
    - **54.60% hyperlipidemia**, **39.46% hypertension**, **20.27% Type 2 Diabetes**.
    - **26.12% hyperuricemia**, suggesting a high prevalence of gout and kidney-related metabolic issues.
    - **46.39% liver dysfunction**, possibly due to obesity-related liver disease.
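  - **Hyperlipidemia, hyperuricemia, liver dysfunction, myocardial infarction, and Type 2 Diabetes are all at their highest prevalence**, indicating a **severe metabolic syndrome profile** (hypertension and angina pectoris peak in Cluster 3).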

---

### **2. Modifications to Personalized Health Advice Based on Cluster Assignment**

The LLM-generated medical advice should reflect these findings by providing **cluster-specific recommendations**.

#### **Revised LLM Prompt**
Modify the **personalized advice** section to include specific risk factors based on cluster assignment:

```python
# Generate cluster-specific advice with the OpenAI chat API.
# Relies on `user_cluster` and `risk_probability` computed by the app's submit
# handler, and on `user_cluster_name`.
# NOTE(review): `user_cluster_name` is not defined in this snippet — confirm the
# app's cluster display section sets it before this code runs.
openai.api_key = os.getenv("OPENAI_API_KEY")
# `not` rather than `is None`: an empty-string key is just as unusable.
if not openai.api_key:
    st.error("OPENAI_API_KEY is not set in the environment!")
else:
    cluster_risks = {
        0: "Your metabolic health is currently in a good range, but maintaining a balanced diet and regular exercise will help sustain this condition.",
        1: "You have mild metabolic imbalances, especially in blood pressure and cholesterol. A focus on early lifestyle changes, such as improving diet quality and increasing physical activity, can prevent further risks.",
        2: "You show signs of metabolic stress, with elevated cholesterol and mild diabetic risk. Consider working on weight management and regular health monitoring to prevent progression.",
        3: "Hypertension and liver function issues are becoming significant. Reducing sodium intake, moderating alcohol consumption, and regular exercise are crucial for preventing cardiovascular complications.",
        4: "Obesity-related metabolic issues are evident, with increased risk of Type 2 Diabetes and heart disease. Prioritizing structured physical activity, fiber-rich diets, and weight management is necessary.",
        5: "Severe metabolic concerns, including liver dysfunction and diabetic complications, suggest a need for immediate intervention. Work closely with healthcare providers to manage blood sugar, liver health, and blood pressure.",
        6: "Your metabolic risk is at its highest, with very high chances of severe obesity-related complications. Intensive lifestyle changes and medical management are essential to prevent serious health outcomes."
    }

    # The cluster label comes from a CSV lookup, so it may be NaN (int() raises
    # ValueError) or outside 0-6 (KeyError); report that instead of crashing.
    try:
        user_risk_advice = cluster_risks[int(user_cluster)]
    except (KeyError, TypeError, ValueError):
        st.error(f"Unknown cluster assignment: {user_cluster!r}")
        st.stop()

    prompt = f"""
    You are a medical expert specializing in diabetes prevention. A user has an estimated Type 2 Diabetes risk probability of {risk_probability:.2f}.
    They belong to Cluster {user_cluster} - **{user_cluster_name}**, which represents individuals with similar health characteristics.

    **Cluster {user_cluster} - {user_cluster_name} Profile:**
    - {user_risk_advice}

    Provide clear, concise, and evidence-based medical advice tailored to their risk level.
    Prioritize **practical** recommendations for **diet, exercise, and medical follow-ups**.
    """

    # Network, authentication, and rate-limit failures all derive from
    # openai.OpenAIError; show them in the UI rather than as a traceback.
    try:
        response = openai.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )
    except openai.OpenAIError as e:
        st.error(f"Could not generate personalized advice: {e}")
    else:
        st.subheader("Personalized Health Advice")
        # `content` can be None; fall back to an empty string before stripping.
        st.write((response.choices[0].message.content or "").strip())
```

---

### **Key Changes in the Advice Generation**
1. **Each cluster has a predefined risk message**, dynamically inserted into the LLM prompt.
2. The AI provides **tailored, cluster-specific** recommendations (e.g., for Cluster 6, more urgent lifestyle changes).
3. The structure ensures **practical** suggestions focused on **diet, exercise, and medical monitoring**.

This update **incorporates the disease prevalence data directly** into the AI-generated recommendations, making the advice **more personalized and actionable**.

### **🚀 Step-by-Step Guide to Deploy Your Streamlit App to Streamlit Cloud**  

Since you've **successfully tested your app locally**, the next step is to **deploy it on Streamlit Cloud** so others can access it online.

---

## **🔹 Step 1: Prepare Your Project for Deployment**
Before deploying, make sure your **app structure** is clean and well-organized.

✅ **Your current project directory:**  

```bash
T2D_Risk_App/
│── __pycache__/
│── app.py  # ✅ Main Streamlit app file
│── app_20250202.py  # (Optional, not needed for deployment)
│── cluster_mapping.csv  # ✅ Precomputed cluster mapping dataset
│── lime_clustering_analysis.py  # ✅ LIME clustering analysis script
│── logistic_regression.pkl  # ✅ Pretrained logistic regression model
│── predictive_clustering_with_diseases_20241226_ADA.jpg  # ✅ Image for UI
│── requirements.txt  # ✅ Dependency list (needed for deployment)
│── rf_classifier.pkl  # (Optional, not needed for logistic regression)
│── scaler.pkl  # ✅ StandardScaler instance for data normalization
│── t2d_risk_app.ipynb  # (Optional, not needed for deployment)
```

---

## **🔹 Step 2: Upload Your Project to GitHub**
Streamlit Cloud requires your app to be in a **GitHub repository**.

### ✅ **Step 2.1: Create a GitHub Repository**
1. Go to [GitHub](https://github.com/) and **log in**.
2. Click the **"New Repository"** button.
3. Name your repository (e.g., **T2D_Risk_App**).
4. Set the repository to **public** (or private if you have a Streamlit Cloud subscription).
5. **Do NOT** initialize with a README, `.gitignore`, or license (we’ll do this later).

### ✅ **Step 2.2: Push Your Code to GitHub**
1. Open **Terminal** (Mac/Linux) or **Git Bash** (Windows) in your project folder.

```bash
cd /path/to/T2D_Risk_App
```

2. Initialize Git and link your repository:

```bash
git init
git add .
git commit -m "First commit - Streamlit T2D Risk App"
git branch -M main
git remote add origin https://github.com/YOUR_GITHUB_USERNAME/T2D_Risk_App.git
git push -u origin main
```

> **Replace** `YOUR_GITHUB_USERNAME` with your actual GitHub username.

---

## **🔹 Step 3: Deploy on Streamlit Cloud**
1. Go to **[Streamlit Cloud](https://share.streamlit.io/)** and **log in**.
2. Click **"New App"**.
3. **Select your GitHub repository** (`T2D_Risk_App`).
4. **Branch:** `main`
5. **App File Path:** `app.py`
6. Click **"Deploy"**.

---

## **🔹 Step 4: Define Dependencies**
Streamlit Cloud installs the packages listed in `requirements.txt`.

### ✅ **Check your `requirements.txt` file**  
Make sure it includes **all required libraries**. If missing, create one by running:

```bash
pip freeze > requirements.txt
```

Then, open `requirements.txt` and **ensure it includes**:

```
streamlit
pandas
numpy
scipy
scikit-learn
pillow
openai
```

> **⚠️ Important:**  
If `openai` is used, you need to **set an API key** in Streamlit secrets.

---

## **🔹 Step 5: Set Up Secrets for OpenAI API**
Streamlit Cloud doesn’t allow direct use of `.env` files for API keys. Instead:

1. **Go to** [Streamlit Cloud Secrets](https://share.streamlit.io/)
2. Click on your deployed app → **Settings** → **Secrets**
3. Add the following:

```
OPENAI_API_KEY = "your-openai-api-key"
```

---

## **🔹 Step 6: Restart & Test the App**
1. **After deployment,** Streamlit Cloud will automatically install dependencies and run your app.
2. If errors occur, **check logs** under "Manage App" → "Logs".
3. If needed, **restart the app** after fixing issues.

---

## **🔹 Step 7: Share Your App 🎉**
Once successfully deployed, you’ll get a **public URL** like:

```
https://your-username-t2d-risk-app.streamlit.app
```

Now you can **share the link** with others to test the app!

---

## **🔹 Common Deployment Issues & Fixes**
| **Issue** | **Fix** |
|-----------|--------|
| `ModuleNotFoundError` | Ensure `requirements.txt` has all dependencies. |
| `OSError: No file found` | Check if `cluster_mapping.csv` and `.pkl` files are in the repo. |
| `OpenAI API key missing` | Add it under Streamlit Cloud "Secrets". |
| `App crashes on load` | Check logs for missing files or library mismatches. |

---

## **🔥 Next Steps**
✅ **Monitor App Usage** on Streamlit Cloud.  
✅ **Enhance Performance** (optimize data loading, caching).  
✅ **Expand Functionality** (e.g., real-time user feedback).  

---

That’s it! 🎉 Your **T2D Risk App** is now online! 🚀

In [23]:
# Read back and check the cluster mapping written by the export step:
# preview the first rows and list the column names the app will see.
df = pd.read_csv("cluster_mapping.csv", encoding="utf-8")
print(df.head())
print(df.columns)

        加入者id  Cluster_LIME_Ordered  Fasting_Blood_Glucose  HbA1c  \
0  M000000502                     5                   99.0    5.5   
1  M000000688                     1                   93.0    5.5   
2  M000000827                     2                   96.0    5.5   
3  M000001396                     1                   98.0    5.3   
4  M000001463                     6                  106.0    5.8   

   Systolic_BP  Diastolic_BP   BMI  Triglycerides  HDL_Cholesterol  \
0        149.0          99.0  19.6          327.0             62.0   
1        139.0          92.0  23.2           71.0             67.0   
2        142.0          73.0  22.7           34.0             74.0   
3        128.0          76.0  21.6          114.0             79.0   
4        121.0          82.0  24.6          479.0             30.0   

   LDL_Cholesterol  AST(GOT)  ALT(GPT)  Gamma_GTP       eGFR        Age  Sex  
0            103.0     107.0      78.0       29.0  97.354570  57.083333    1  
1     

In [None]:
# Initialise the key once per session; reading a missing key from
# st.session_state raises KeyError, which would crash the very first run.
if 'language' not in st.session_state:
    st.session_state['language'] = None

# Sidebar language picker: each button stores the choice in session state so it
# survives Streamlit's script reruns.
with st.sidebar.expander("🌐 Language", expanded=True):
    if st.button("English"):
        st.session_state['language'] = 'English'
    if st.button("日本語"):
        st.session_state['language'] = 'Japanese'
    if st.button("中文"):
        st.session_state['language'] = 'Chinese'

# Stop execution until language is selected
if st.session_state['language'] is None:
    st.stop()

# Now safely assign lang
lang = st.session_state['language']