In [None]:
# !pip install numpy --upgrade
# !pip install torch --force-reinstall
# !pip install -U transformers

In [None]:
# Dataset Simulation and Feature Engineering

In [None]:
# Import all the necessary libraries
!pip install sdv
import numpy as np
import pandas as pd
import torch
import random
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

# Seed every random number generator used below so the synthetic data is reproducible
seed = 1
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)
# Ask torch to use deterministic algorithms only
torch.use_deterministic_algorithms(True)

# Then, load the small sample data that was previously generated on GPT
sampledata = pd.read_csv("synthetic_patient_data.csv")
# DataFrame.info is a method: it has to be called (it prints the summary itself).
# Printing the attribute only shows the bound-method repr.
sampledata.info()

Collecting sdv
  Downloading sdv-1.23.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.38.41-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.38.41-py3-none-any.whl.metadata (5.7 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.3-py3-none-any.whl.metadata (9.5 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.17.0 (from sdv)
  Downloading rdt-1.17.0-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.21.0-py3-none-any.whl.metadata (9.4 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.14.0,>=0.13.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s

In [None]:
# Auto-detect the single-table metadata from the sample dataframe
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(sampledata)

# Fit CTGAN on the sample, then draw the synthetic rows used by the rest of the notebook
n_synthetic_rows = 15000
model = CTGANSynthesizer(metadata)
model.fit(sampledata)
synthetic_data = model.sample(n_synthetic_rows)

print(synthetic_data.head())



   patient_id            timestamp  oxygen_saturation  heart_rate  \
0           3  2023-03-12T22:30:26               96.3        64.0   
1           4  2023-03-16T04:19:44                NaN        80.0   
2           5  2023-03-09T02:18:51               98.1        84.0   
3           1  2023-03-28T22:34:04                NaN        78.0   
4           2  2023-03-28T01:35:39              100.1        94.0   

   temperature  blood_pressure_systolic  blood_pressure_diastolic  weight  \
0         35.9                    159.0                      72.0    64.0   
1         36.5                    140.0                      85.0    80.0   
2         38.6                    132.0                      77.0    80.0   
3         36.4                    140.0                      72.0    77.0   
4         38.3                    147.0                      87.0    74.2   

   blood_glucose questionnaire_response  \
0           59.0                    NaN   
1          141.0  No symptoms report

In [None]:
# Apply LLMs to extract meaningful features from simulated textual data (clinical notes, health records)
# Firstly, install the needed libraries
!pip install -U sentence-transformers transformers accelerate

from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np
import pandas as pd
import json, tqdm

Collecting accelerate
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.8.1-py3-none-any.whl (365 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.7.0
    Uninstalling accelerate-1.7.0:
      Successfully uninstalled accelerate-1.7.0
Successfully installed accelerate-1.8.1


In [None]:
# Embed the text columns with a biomedical sentence encoder
emb_model = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO")

text_cols = ["clinical_note"]

# Join the text columns into one string per row (missing values become "")
combined_text = synthetic_data[text_cols].fillna("").agg(" ".join, axis=1).tolist()
embeddings = emb_model.encode(combined_text, batch_size=64, show_progress_bar=True)

# Attach every embedding dimension in ONE concat. Inserting the columns one at a
# time in a loop fragments the DataFrame and emits a warning per inserted column.
emb_arr = np.vstack(embeddings)
emb_cols = [f"text_emb_{i}" for i in range(emb_arr.shape[1])]
emb_frame = pd.DataFrame(emb_arr, columns=emb_cols, index=synthetic_data.index)

# Drop any embedding columns left by a previous run of this cell so re-running
# replaces them instead of duplicating them.
synthetic_data = pd.concat(
    [synthetic_data.drop(columns=emb_cols, errors="ignore"), emb_frame],
    axis=1,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/388 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/461k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/235 [00:00<?, ?it/s]

  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]
  synthetic_data[f"text_emb_{i}"] = emb_arr[:, i]


In [None]:
# Persist the table (original columns + text-embedding columns) for the modelling cells
features_path = "synthetic_patient_with_text_features.parquet"
synthetic_data.to_parquet(features_path, index=False)

final_shape = synthetic_data.shape
print("Final shape:", final_shape)


Final shape: (15000, 779)


In [None]:
# Predictive Model Development

In [None]:
# Firstly, the environment was set up
!pip install scikit-learn xgboost torch transformers sentence-transformers rtdl lightning --quiet

import pandas as pd
from sklearn.model_selection import train_test_split

import torch
# Show which torch version is active in this kernel after the installs above
print(torch.__version__)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.1/54.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.1/54.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.2/53.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.4/887.4 MB[0m [31m655.4 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Traditional Models
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

# Then, load the data with embedded text features
DATA_PATH = "synthetic_patient_with_text_features.parquet"

data = pd.read_parquet(DATA_PATH)
print(f"Loaded data shape: {data.shape}")

# Feature groups: text-embedding columns (selected by prefix) and the structured vitals
text_emb_cols = [c for c in data.columns if c.startswith("text_emb_")]
numeric_cols = ["oxygen_saturation", "heart_rate", "temperature", "blood_pressure_systolic", "blood_pressure_diastolic", "weight", "blood_glucose"]

# Binary classification label: 1 when the questionnaire text contains "reported", else 0
# (missing answers count as 0).
# NOTE(review): the substring test also matches "No symptoms reported." — confirm that
# such answers are really meant to be in the positive class.
label_col = "questionnaire_response"

data[label_col] = data[label_col].fillna("")

data[label_col] = data[label_col].apply(
    lambda x: "reported" if "reported" in x.lower() else "none"
)

y = data[label_col].map({"none": 0, "reported": 1}).values

# Split row positions FIRST so the imputation statistics below come from training rows
# only. Filling with the mean of the full table leaks test-set information into training.
# Splitting positions with these arguments selects the same rows as splitting X directly.
row_positions = np.arange(len(data))
train_idx, test_idx = train_test_split(
    row_positions, test_size=0.2, random_state=1, stratify=y
)

# Then, missing numeric values were filled with the training-set column means
train_means = data.iloc[train_idx][numeric_cols].mean()
data[numeric_cols] = data[numeric_cols].fillna(train_means)

X_struct = data[numeric_cols].values
X_text   = data[text_emb_cols].values

X = np.hstack((X_struct, X_text))

# Then, training and testing data was split
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Fit the three baseline classifiers on the same training split.
# Hyper-parameters are collected in dicts so they can be read at a glance.

# Random Forest
rf_params = {"n_estimators": 200, "random_state": 42, "class_weight": "balanced"}
print("\nTraining Random Forest …")
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# XGBoost
xgb_params = {
    "n_estimators": 400,
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
    "random_state": 42,
}
print("Training XGBoost …")
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Neural Network
mlp_params = {"hidden_layer_sizes": (256, 128), "max_iter": 400, "random_state": 42}
print("Training Neural Net (MLP) …")
mlp = MLPClassifier(**mlp_params)
mlp.fit(X_train, y_train)

# Evaluation: print a per-class report for each fitted baseline on the held-out split.
# The loop variable is deliberately NOT called `model`, so this cell no longer rebinds
# the notebook-level `model` defined in other cells.
print("\n================  Evaluation  ================")
fitted_baselines = {"Random Forest": rf, "XGBoost": xgb, "Neural Net": mlp}
for name, clf in fitted_baselines.items():
    print(f"\n{name} results:")
    preds = clf.predict(X_test)
    # zero_division=0 states explicitly how precision is reported when a classifier
    # never predicts a class, instead of relying on the warn-and-zero default.
    print(classification_report(y_test, preds, zero_division=0))

Loaded data shape: (15000, 779)

Training Random Forest …
Training XGBoost …
Training Neural Net (MLP) …


Random Forest results:
              precision    recall  f1-score   support

           0       0.83      1.00      0.90      2476
           1       0.00      0.00      0.00       524

    accuracy                           0.82      3000
   macro avg       0.41      0.50      0.45      3000
weighted avg       0.68      0.82      0.75      3000


XGBoost results:
              precision    recall  f1-score   support

           0       0.83      0.98      0.90      2476
           1       0.28      0.03      0.05       524

    accuracy                           0.82      3000
   macro avg       0.56      0.51      0.48      3000
weighted avg       0.73      0.82      0.75      3000


Neural Net results:
              precision    recall  f1-score   support

           0       0.83      1.00      0.90      2476
           1       0.00      0.00      0.00       524

    accuracy 

In [None]:
# Advanced Transformer-based model: DistilBERT
# Firstly, import the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# The dataset was then loaded, and the label / text columns used for fine-tuning derived
df = pd.read_parquet("synthetic_patient_with_text_features.parquet")
df["label"] = df["questionnaire_response"].fillna("").apply(
    lambda x: 1 if "reported" in x.lower() else 0
)
df["text"] = df["clinical_note"].fillna("")

# Use the same test_size / stratification / random_state (1) as the traditional-model
# split so DistilBERT is evaluated on the same test rows as the baselines it is later
# compared with. The previous seed (42) produced a different test set.
train_df, test_df = train_test_split(
    df[["text", "label"]], test_size=0.2, stratify=df["label"], random_state=1
)
train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
test_ds  = Dataset.from_pandas(test_df.reset_index(drop=True))

# Load the tokenizer that matches the DistilBERT checkpoint
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_fn(example):
    """Run the tokenizer over the "text" field of a batch with truncation enabled."""
    batch_texts = example["text"]
    return tokenizer(batch_texts, truncation=True)

# Tokenize both splits in batched mode
train_ds, test_ds = (
    split.map(tokenize_fn, batched=True) for split in (train_ds, test_ds)
)

# Sequence-classification model with a two-class output
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Training configuration, gathered in one mapping and expanded into TrainingArguments
training_config = {
    "output_dir": "./distilbert_results",
    "num_train_epochs": 2,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "save_strategy": "no",
    "logging_steps": 10,
}
training_args = TrainingArguments(**training_config)

# Collator built from the tokenizer; it batches the tokenized examples
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Return accuracy and weighted-average F1 for a (logits, labels) pair.

    The predicted class is the argmax of the logits along axis 1; both numbers are
    read from sklearn's classification_report in dict form.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    summary = classification_report(labels, predicted, output_dict=True)
    weighted = summary["weighted avg"]
    return {"accuracy": summary["accuracy"], "f1": weighted["f1-score"]}

# Build the Trainer and fine-tune the model.
# The `tokenizer=` argument is intentionally not passed: it is deprecated in recent
# transformers releases in favour of `processing_class`, and this notebook installs
# transformers unpinned. It is also redundant here because the padding collator is
# supplied explicitly and no checkpoints are saved (save_strategy="no").
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

# Final evaluation: predict on the held-out split and print the per-class report
preds = trainer.predict(test_ds)
y_true, y_pred = preds.label_ids, np.argmax(preds.predictions, axis=1)

final_report = classification_report(y_true, y_pred)
print("\n=== Final Evaluation Report ===")
print(final_report)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.5887
20,0.4914
30,0.5698
40,0.4571
50,0.4666
60,0.4256
70,0.4882
80,0.549
90,0.4148
100,0.5298



=== Final Evaluation Report ===
              precision    recall  f1-score   support

           0       0.83      1.00      0.90      2476
           1       0.00      0.00      0.00       524

    accuracy                           0.83      3000
   macro avg       0.41      0.50      0.45      3000
weighted avg       0.68      0.83      0.75      3000



In [None]:
# Sentiment analysis using DistilBERT
from transformers import pipeline

# Firstly, prepare the text column (missing answers become "")
texts = df["questionnaire_response"].fillna("").tolist()

# Then, load the model
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Score each distinct, non-blank answer once and map the result back to the rows.
# Many rows share the same answer text, so this avoids re-running the model on duplicates.
unique_texts = sorted({t for t in texts if t.strip()})
unique_results = (
    sentiment_pipeline(unique_texts, truncation=True, batch_size=32)
    if unique_texts
    else []
)
result_by_text = dict(zip(unique_texts, unique_results))

# Missing / blank answers get no sentiment. Previously the empty string was sent to the
# model, which returned a label and score for rows with no answer at all.
no_sentiment = {"label": None, "score": float("nan")}
sentiment_results = [result_by_text.get(t, no_sentiment) for t in texts]

# Append the results into the dataframe
df["sentiment_label"] = [res["label"] for res in sentiment_results]
df["sentiment_score"] = [res["score"] for res in sentiment_results]

# Results
print(df[["questionnaire_response", "sentiment_label", "sentiment_score"]].head())

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


  questionnaire_response sentiment_label  sentiment_score
0                   None        POSITIVE         0.748121
1  No symptoms reported.        NEGATIVE         0.982345
2  No symptoms reported.        NEGATIVE         0.982345
3    Feeling okay today.        POSITIVE         0.999835
4  No symptoms reported.        NEGATIVE         0.982345


In [None]:
# Model Evaluation and Interpretation

In [None]:
# Before the outputs can be interpreted, the metrics need to be captured first
!pip install google-generativeai --quiet
import google.generativeai as genai
from sklearn.metrics import classification_report

# Read the API key from the environment instead of embedding it in the notebook, and
# fall back to an interactive prompt so the cell still works in a fresh session.
# SECURITY: the key that was previously hardcoded on this line has been exposed to
# anyone with access to the notebook and should be revoked and replaced.
import os
from getpass import getpass

api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Google Generative AI API key: ")
genai.configure(api_key=api_key)

def extract_key_metrics(report_str, keywords=("accuracy", "weighted avg")):
    """Keep only the lines of a text report that mention one of ``keywords``.

    Args:
        report_str: Multi-line report text (e.g. the string returned by
            ``classification_report``).
        keywords: Substrings to look for, matched case-insensitively. The default
            keeps the accuracy and weighted-average rows. Pass other row names
            (e.g. ``"macro avg"``) to keep those rows instead.

    Returns:
        The matching lines joined with newlines, in their original order; an empty
        string when nothing matches.
    """
    wanted = tuple(keyword.lower() for keyword in keywords)
    kept = []
    # The report is stripped as a whole, so interior lines keep their indentation
    for line in report_str.strip().split('\n'):
        lowered = line.lower()
        if any(keyword in lowered for keyword in wanted):
            kept.append(line)
    return '\n'.join(kept)

# Trimmed report for each baseline, computed on the shared held-out split
rf_rep, xgb_rep, mlp_rep = (
    extract_key_metrics(classification_report(y_test, clf.predict(X_test)))
    for clf in (rf, xgb, mlp)
)
# DistilBERT predictions were computed earlier (y_true / y_pred)
distilbert_full_report = classification_report(y_true, y_pred)
distilbert_rep = extract_key_metrics(distilbert_full_report)

# Then, a prompt was designed: it embeds the four trimmed reports (rf_rep, xgb_rep,
# mlp_rep, distilbert_rep) and asks the LLM to rank the models, comment on each one
# and suggest two improvements.
prompt = f"""
You are a medical ML expert. Evaluate and rank the following models based on their accuracy and F1-score performance.
Comment on each model's strength and weaknesses, especially in detecting underrepresented classes.
Suggest 2 specific improvements to boost performance, particularly for class imbalance.

### Model 1: Random Forest
{rf_rep}

### Model 2: XGBoost
{xgb_rep}

### Model 3: Neural Net (MLP)
{mlp_rep}

### Model 4: DistilBERT
{distilbert_rep}

Respond with a ranked summary, with reasoning for each choice.
"""

# Then, the Gemini model ("models/gemini-2.5-flash", via google.generativeai) was loaded
# and asked to answer the prompt. No FLAN-T5 model is involved in this cell.
# NOTE: this rebinds `model`, which the DistilBERT cell used for its classifier.
model = genai.GenerativeModel(model_name="models/gemini-2.5-flash")
response = model.generate_content(prompt)

# Print the text of the model's reply
print("\n🧠 Final Interpretation:\n")
print(response.text)


🧠 Final Interpretation:

As a medical ML expert, the provided performance metrics (accuracy and weighted average F1-score) immediately highlight a critical concern: **significant class imbalance**. All models exhibit an accuracy much higher than their weighted average F1-score (0.82-0.83 vs. 0.75). This discrepancy strongly suggests that the models are performing well on the majority class(es) (driving up accuracy) but poorly on the minority, or "underrepresented," class(es) (driving down the weighted average F1-score, which is more sensitive to performance on all classes).

In medical contexts, underrepresented classes often correspond to rare but critical conditions, specific disease subtypes, or adverse events. Failing to detect these can have severe clinical consequences. Therefore, while overall accuracy is important, performance on minority classes (reflected in F1-score and especially recall/precision for those classes, which are not provided here but are the components of F1) 

In [None]:
# Print the name of every model returned by genai.list_models()
available_models = genai.list_models()
for model_info in available_models:
    print(model_info.name)

models/embedding-gecko-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-2.5-pro-exp-03-25
models/gemini-2.5-pro-preview-03-25
models/gemini-2.5-flash-preview-04-17
models/gemini-2.5-flash-preview-05-20
models/gemini-2.5-flash
models/gemini-2.5-flash-preview-04-17-thinking
models/gemini-2.5-flash-lite-preview-06-17
models/gemini-2.5-pro-preview-05-06
models/gemini-2.5-pro-preview-06-05
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-preview-image-generation
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-