<a href="https://colab.research.google.com/github/LorraineWong/WQD7005-Data-Mining-S2152880/blob/main/WQD7005_Individual_Assignment_S2152880.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **📦 0. Setup and Imports**

In [5]:
# 0. Repository in Google Colab from GitHub
!git clone https://github.com/LorraineWong/WQD7005-Data-Mining-S2152880.git

Cloning into 'WQD7005-Data-Mining-S2152880'...
fatal: could not read Username for 'https://github.com': No such device or address


In [1]:
# 1. Install dependencies & configure
!pip install -q openai numpy pandas

In [2]:
# Imports & AzureOpenAI client configuration
import base64
import getpass
import io
import os
import random
import time
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from openai import AzureOpenAI

# Azure endpoint and deployment; the endpoint can be overridden through the
# AZURE_OPENAI_ENDPOINT environment variable.
endpoint = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://ai-s21528803592ai665634527840.openai.azure.com/",
)
model_name = "gpt-4o"
deployment = "gpt-4o"

# SECURITY: the API key must never be stored in the notebook. It is read from the
# AZURE_OPENAI_API_KEY environment variable, falling back to an interactive prompt
# whose input is not echoed or saved in the cell output.
# The key that was previously committed here should be treated as leaked and rotated.
subscription_key = os.environ.get("AZURE_OPENAI_API_KEY") or getpass.getpass(
    "Azure OpenAI API key: "
)
api_version = "2024-12-01-preview"

# Shared client used by model_prompt() for every request in this notebook.
client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)
def model_prompt(prompt, system_prompt="Act as a professional clinicians.", temperature=0.7, max_tokens=4096):
    """Send one system + user exchange to the configured deployment and return the reply.

    Args:
        prompt: Text sent as the user message.
        system_prompt: Text sent as the system message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.

    Returns:
        The message content of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    # Uses the module-level `client` and `deployment` configured above.
    completion = client.chat.completions.create(
        model=deployment,
        messages=conversation,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Prompt asking the model for one synthetic patient monitoring record, returned as
# a single JSON object with six vital-sign fields plus a short clinical_note.
data_prompt = """
Generate a single, realistic patient monitoring record for one randomly selected adult patient.

Provide the following fields:
- oxygen_saturation (in %)
- heart_rate (in bpm)
- temperature (in °C)
- blood_pressure (systolic/diastolic, e.g. "120/80")
- weight (in kg)
- blood_glucose (in mg/dL)

At the end, include a brief clinical_note (1–2 sentences, max 30 words) summarizing the patient status based on the values above. Use professional clinical tone with realistic variation (e.g. stable, recovering, mild concerns).

Output as a valid JSON object with keys:
oxygen_saturation, heart_rate, temperature, blood_pressure, weight, blood_glucose, clinical_note.

Constraints:
- Only output one JSON object.
- No markdown or explanation.
- Include realistic variation across different health conditions (e.g. fatigue, post-op, dietary changes, stress).
- Ensure all fields are complete, no missing values.
"""

# Smoke test: send the prompt once and print the raw model reply.
print(model_prompt(data_prompt))

{
  "oxygen_saturation": 94,
  "heart_rate": 88,
  "temperature": 37.5,
  "blood_pressure": "130/85",
  "weight": 78,
  "blood_glucose": 145,
  "clinical_note": "Patient shows mild hypertension and elevated blood glucose, potentially linked to recent stress or dietary changes; oxygen saturation and temperature remain within acceptable limits."
}


# **🧩 1. Dataset Simulation using GenAI**

In [None]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta
import random
import time
import re

# Simulation size: one record per patient per day (num_patients * days model calls).
num_patients = 500
days = 30

# Consecutive daily timestamps beginning at start_date.
start_date = datetime(2025, 1, 1)
dates = [start_date + timedelta(days=day_offset) for day_offset in range(days)]

# Accumulates one dict per generated record; turned into a DataFrame after the loop.
records = list()

def parse_json_from_prompt(raw_output):
    """Parse the JSON payload out of a raw model reply.

    Markdown code fences (with or without a "json" tag, any letter case) are
    removed first. If the remaining text is still not valid JSON - for example
    because the model added a sentence before or after the object - the span from
    the first "{" to the last "}" is parsed instead.

    Args:
        raw_output: Reply text returned by the model.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If no parseable JSON can be recovered.
    """
    clean_text = re.sub(r"```(?:json)?", "", raw_output, flags=re.IGNORECASE).strip()
    try:
        return json.loads(clean_text)
    except json.JSONDecodeError:
        # Fallback: the model wrapped the object in prose despite the instructions.
        start = clean_text.find("{")
        end = clean_text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(clean_text[start:end + 1])

def randomly_insert_missing(data_dict, missing_prob=0.05):
    """Blank out vital-sign fields at random to simulate missing readings.

    Each of the six vital-sign keys is independently replaced with NaN with
    probability ``missing_prob``. The dictionary is modified in place and the
    same object is returned; ``clinical_note`` is never touched.

    Args:
        data_dict: Record to modify.
        missing_prob: Per-field probability of being set to NaN.

    Returns:
        The same ``data_dict`` object.
    """
    vital_fields = (
        "oxygen_saturation",
        "heart_rate",
        "temperature",
        "blood_pressure",
        "weight",
        "blood_glucose",
    )
    # One random draw per field, in field order.
    dropped = [field for field in vital_fields if random.random() < missing_prob]
    data_dict.update(dict.fromkeys(dropped, np.nan))
    return data_dict

# GPT prompt templates
# base_prompt: used for a patient's first record. It asks for one complete JSON
# record (six vital-sign fields plus clinical_note) with no missing values.
base_prompt = """
Generate a single, realistic patient monitoring record for one randomly selected adult patient.

Provide the following fields:
- oxygen_saturation (in %)
- heart_rate (in bpm)
- temperature (in °C)
- blood_pressure (systolic/diastolic, e.g. "120/80")
- weight (in kg)
- blood_glucose (in mg/dL)

At the end, include a brief clinical_note (1–2 sentences, max 30 words) summarizing the patient status based on the values above. Use professional clinical tone with realistic variation (e.g. stable, recovering, mild concerns).

Output as a valid JSON object with keys:
oxygen_saturation, heart_rate, temperature, blood_pressure, weight, blood_glucose, clinical_note.

Constraints:
- Only output one JSON object.
- No markdown or explanation.
- Include realistic variation across different health conditions (e.g. fatigue, post-op, dietary changes, stress).
- Ensure all fields are complete, no missing values.
"""

def generate_variation_prompt(prev_data):
    """Build the follow-up prompt that asks for the next day's record.

    Args:
        prev_data: Dict with the previous day's readings, keyed by
            oxygen_saturation, heart_rate, temperature, blood_pressure, weight
            and blood_glucose. A key that is absent is rendered as "NA".

    Returns:
        The prompt text, listing the previous readings followed by the
        instructions for the new record.
    """
    vital_fields = (
        "oxygen_saturation",
        "heart_rate",
        "temperature",
        "blood_pressure",
        "weight",
        "blood_glucose",
    )
    # .get() instead of indexing: the previous record comes straight from the model,
    # and a single omitted field would otherwise raise KeyError and stop the run.
    previous_lines = "\n".join(
        f"- {field}: {prev_data.get(field, 'NA')}" for field in vital_fields
    )
    return f"""
Given the previous patient data:
{previous_lines}

Simulate a slightly varied monitoring record for the following day. Output the new record in JSON format, and include a brief clinical_note reflecting changes or stability.

JSON format keys: oxygen_saturation, heart_rate, temperature, blood_pressure, weight, blood_glucose, clinical_note.
Only return the JSON object, no markdown or formatting.
"""

# Loop and generate via GenAI
# For each patient, the first record comes from base_prompt and each later record
# is a variation of the most recent successfully parsed record.
for pid in range(1, num_patients + 1):
    patient_id = f"P{pid:04d}"
    prev_day_data = None  # most recent successfully parsed record for this patient

    for date in dates:
        # Use the base prompt whenever no previous record exists yet: on day 0, or
        # when every earlier day failed to parse. Calling
        # generate_variation_prompt(None) would raise TypeError and abort the run.
        if prev_day_data is None:
            prompt = base_prompt
        else:
            prompt = generate_variation_prompt(prev_day_data)

        raw_output = model_prompt(prompt)
        if raw_output:
            try:
                data = parse_json_from_prompt(raw_output)
                # Valid JSON that is not an object (e.g. a list) cannot be used as a
                # record; report it through the same path as unparseable output.
                if not isinstance(data, dict):
                    raise json.JSONDecodeError("expected a JSON object", raw_output, 0)

                # Keep the unmodified values for the next day's prompt, then inject
                # missing values only into the copy that is stored.
                prev_day_data = data.copy()
                data = randomly_insert_missing(data, missing_prob=0.05)

                records.append({
                    "patient_id": patient_id,
                    "timestamp": date.strftime("%Y-%m-%d"),
                    "oxygen_saturation": data.get("oxygen_saturation", np.nan),
                    "heart_rate": data.get("heart_rate", np.nan),
                    "temperature": data.get("temperature", np.nan),
                    "blood_pressure": data.get("blood_pressure", np.nan),
                    "weight": data.get("weight", np.nan),
                    "blood_glucose": data.get("blood_glucose", np.nan),
                    "clinical_note": data.get("clinical_note", "")
                })

            except json.JSONDecodeError as err:
                print(f"JSON parsing error for patient {patient_id} on {date}: {err}")
        # Pause between requests.
        time.sleep(0.22)

# Convert and save
# NOTE(review): assumes Google Drive is already mounted at /content/drive - confirm.
df = pd.DataFrame(records)
df.to_csv("/content/drive/MyDrive/UM Data Science Course Information/WQD7005/genai_patient_dataset.csv", index=False)
df.head()

Unnamed: 0,patient_id,timestamp,oxygen_saturation,heart_rate,temperature,blood_pressure,weight,blood_glucose,clinical_note
0,P0001,2024-01-01,95,88,37.2,130/85,72,110,Patient exhibits mild hypertension and elevate...
1,P0001,2024-01-02,96,90,37.1,128/84,72,112,Vital signs remain stable with minor variation...
2,P0002,2024-01-01,95,88,37.6,140/90,82,160,Patient shows elevated blood pressure and mode...
3,P0002,2024-01-02,94,90,37.5,138/88,82,158,The patient shows stable vital signs with mino...


In [None]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta
import re
import time

# Small trial run: one record per patient per day (num_patients * days model calls).
num_patients = 2
days = 2

# Consecutive daily timestamps beginning at start_date.
start_date = datetime(2024, 1, 1)
dates = [start_date + timedelta(days=day_offset) for day_offset in range(days)]

# Accumulates one dict per generated record; turned into a DataFrame after the loop.
records = list()

def parse_json_from_prompt(raw_output):
    """Parse the JSON payload out of a raw model reply.

    Markdown code fences (with or without a "json" tag, any letter case) are
    removed first. If the remaining text is still not valid JSON - for example
    because the model added a sentence before or after the object - the span from
    the first "{" to the last "}" is parsed instead.

    Args:
        raw_output: Reply text returned by the model.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If no parseable JSON can be recovered.
    """
    clean_text = re.sub(r"```(?:json)?", "", raw_output, flags=re.IGNORECASE).strip()
    try:
        return json.loads(clean_text)
    except json.JSONDecodeError:
        # Fallback: the model wrapped the object in prose despite the instructions.
        start = clean_text.find("{")
        end = clean_text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(clean_text[start:end + 1])

def clean_missing_values(data_dict):
    """Normalise every missing-value marker in a record to NaN.

    A value counts as missing when it is ``None`` (what JSON ``null`` decodes to)
    or a string that, after trimming and lower-casing, is one of "na", "n/a",
    "null", "none", "nan" or empty. The dictionary is modified in place and the
    same object is returned.

    Args:
        data_dict: Record parsed from the model reply.

    Returns:
        The same ``data_dict`` object with missing markers replaced by ``np.nan``.
    """
    missing_tokens = {"na", "n/a", "null", "none", "nan", ""}
    # Only existing keys are reassigned, so mutating during iteration is safe.
    for key, value in data_dict.items():
        is_missing_text = isinstance(value, str) and value.strip().lower() in missing_tokens
        # The prompt allows JSON null, which arrives here as None, not as a string.
        if value is None or is_missing_text:
            data_dict[key] = np.nan
    return data_dict

# GenAI prompt templates
# base_prompt: used for a patient's first record. Unlike a complete-record prompt,
# it lets the model leave up to two fields missing, marked as null or "NA".

base_prompt = """
Generate a single, realistic patient monitoring record for one randomly selected adult patient.

Provide the following fields (some may be missing due to sensor failure or unrecorded data):
- oxygen_saturation (in %)
- heart_rate (in bpm)
- temperature (in °C)
- blood_pressure (systolic/diastolic, e.g. "120/80")
- weight (in kg)
- blood_glucose (in mg/dL)

At the end, include a brief clinical_note (1–2 sentences, max 30 words) summarizing the patient status based on available values. Mention any missing fields if relevant.

Output as a JSON object with keys:
oxygen_saturation, heart_rate, temperature, blood_pressure, weight, blood_glucose, clinical_note.

Constraints:
- Allow up to 2 fields to be missing per record (use null or "NA" as value).
- Do not include markdown or explanations.
- Ensure values are medically realistic.
"""

def generate_variation_prompt(prev_data):
    """Build the follow-up prompt that asks for the next day's record.

    Args:
        prev_data: Dict with the previous day's readings, keyed by
            oxygen_saturation, heart_rate, temperature, blood_pressure, weight
            and blood_glucose. A value that is absent, ``None`` or NaN is
            rendered as "NA", the missing-value token the prompt itself defines.

    Returns:
        The prompt text, listing the previous readings followed by the
        instructions for the new record.
    """
    vital_fields = (
        "oxygen_saturation",
        "heart_rate",
        "temperature",
        "blood_pressure",
        "weight",
        "blood_glucose",
    )

    def _shown(field):
        # .get() avoids a KeyError when the model omitted a field; None/NaN would
        # otherwise appear in the prompt as "None"/"nan".
        value = prev_data.get(field)
        if value is None or (isinstance(value, float) and value != value):
            return "NA"
        return value

    previous_lines = "\n".join(f"- {field}: {_shown(field)}" for field in vital_fields)
    return f"""
Given the previous patient data:
{previous_lines}

Generate a slightly varied record for the next day. Some fields may be missing due to real-world issues (e.g. sensor error, not recorded). Return a JSON object with keys:
oxygen_saturation, heart_rate, temperature, blood_pressure, weight, blood_glucose, clinical_note.

Use null or "NA" for missing values. No markdown or extra explanation.
"""

# Main loop
# For each patient, the first record comes from base_prompt and each later record
# is a variation of the most recent successfully parsed record.
for pid in range(1, num_patients + 1):
    patient_id = f"P{pid:04d}"
    prev_day_data = None  # most recent successfully parsed record for this patient

    for date in dates:
        # Use the base prompt whenever no previous record exists yet: on day 0, or
        # when every earlier day failed to parse. Calling
        # generate_variation_prompt(None) would raise TypeError and abort the run.
        prompt = base_prompt if prev_day_data is None else generate_variation_prompt(prev_day_data)
        raw_output = model_prompt(prompt)

        if raw_output:
            try:
                data = parse_json_from_prompt(raw_output)
                # Valid JSON that is not an object (e.g. a list) cannot be used as a
                # record; report it through the same path as unparseable output.
                if not isinstance(data, dict):
                    raise json.JSONDecodeError("expected a JSON object", raw_output, 0)

                data = clean_missing_values(data)
                prev_day_data = data.copy()

                records.append({
                    "patient_id": patient_id,
                    "timestamp": date.strftime("%Y-%m-%d"),
                    "oxygen_saturation": data.get("oxygen_saturation", np.nan),
                    "heart_rate": data.get("heart_rate", np.nan),
                    "temperature": data.get("temperature", np.nan),
                    "blood_pressure": data.get("blood_pressure", np.nan),
                    "weight": data.get("weight", np.nan),
                    "blood_glucose": data.get("blood_glucose", np.nan),
                    "clinical_note": data.get("clinical_note", "")
                })
            except json.JSONDecodeError as err:
                print(f"JSON parsing error for patient {patient_id} on {date}: {err}")

        # Pause between requests.
        time.sleep(0.22)

# Save as DataFrame
df = pd.DataFrame(records)
df.to_csv("simulated_patient_dataset.csv", index=False)
df.head()

Unnamed: 0,patient_id,timestamp,oxygen_saturation,heart_rate,temperature,blood_pressure,weight,blood_glucose,clinical_note
0,P0001,2024-01-01,95,78,37.2,122/76,72.5,,Patient is stable with normal vitals. Blood gl...
1,P0001,2024-01-02,94,80,,120/74,72.6,,Slight variation in heart rate and oxygen satu...
2,P0002,2024-01-01,95,82,37.2,128/85,72.0,,Patient is stable with normal vitals. Blood gl...
3,P0002,2024-01-02,94,85,,130/87,72.5,,Slight increase in heart rate and blood pressu...


# **📊 2. EDA with LLM Insights**

In [None]:
# Plot function and encode for GPT
def plot_and_encode(fig):
    """Render a figure to PNG and return the image as a base64 string.

    The figure is saved into an in-memory buffer with tight bounding box, then
    closed through ``plt.close`` whether or not saving succeeded, so figures do
    not accumulate in memory when this is called repeatedly.

    Args:
        fig: Figure object exposing ``savefig``.

    Returns:
        The PNG bytes encoded as a UTF-8 base64 string.
    """
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format="png", bbox_inches='tight')
    finally:
        # Close even when rendering fails; otherwise the figure stays registered.
        plt.close(fig)
    # getvalue() returns the whole buffer regardless of the current position.
    return base64.b64encode(buf.getvalue()).decode("utf-8")



# **🧼 3. Advanced Data Preprocessing with SLMs / LLMs**

# **📝 4. AI-Assisted Summary Report and Visualization**