In [2]:
!pip -q install "sentence-transformers>=3.0" "scikit-learn>=1.3" "pandas" "numpy"

# Import dataset

In [4]:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH = "/content/Symptom2Disease.csv"


import numpy as np, pandas as pd, torch, random, gc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, top_k_accuracy_score, log_loss, roc_auc_score

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

df = pd.read_csv(DATA_PATH).dropna(subset=["text","label"])[["text","label"]].reset_index(drop=True)
labels = sorted(df["label"].unique().tolist())
label2id = {y:i for i,y in enumerate(labels)}
id2label = {i:y for y,i in label2id.items()}
y_all = df["label"].map(label2id).values
X_text = df["text"].astype(str).tolist()
print(f"Rows={len(df)} | Classes={len(labels)} | Example:\n", df.head(2))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Rows=1200 | Classes=24 | Example:
                                                 text      label
0  I have been experiencing a skin rash on my arm...  Psoriasis
1  My skin has been peeling, especially on my kne...  Psoriasis


In [5]:
def macro_metrics(y_true, y_prob, k=3):
    y_pred = y_prob.argmax(1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "macro_f1": f1_score(y_true, y_pred, average='macro'),
        "micro_f1": f1_score(y_true, y_pred, average='micro'),
        f"top{k}_acc": top_k_accuracy_score(y_true, y_prob, k=k, labels=list(range(len(labels)))),
        "nll": log_loss(y_true, y_prob, labels=list(range(len(labels)))),
    }


In [None]:
from sentence_transformers import SentenceTransformer
SBERT_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # fast & strong general-purpose SBERT
sbert = SentenceTransformer(SBERT_NAME)
X_emb = sbert.encode(X_text, batch_size=128, convert_to_numpy=True, normalize_embeddings=False, show_progress_bar=True)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
fold_rows = []
oof_prob = np.zeros((len(df), len(labels)), dtype=np.float32)

for fold, (tr, va) in enumerate(skf.split(X_emb, y_all)):
    Xtr, Xva = X_emb[tr], X_emb[va]
    ytr, yva = y_all[tr], y_all[va]

    knn = KNeighborsClassifier(n_neighbors=5, metric="cosine")
    knn.fit(Xtr, ytr)
    p_knn = knn.predict_proba(Xva)

    scaler = StandardScaler()   
    Xtr_s = scaler.fit_transform(Xtr)
    Xva_s = scaler.transform(Xva)

    base_lr = LogisticRegression(
        multi_class="multinomial", solver="lbfgs", C=1.0, max_iter=3000, n_jobs=-1, random_state=SEED
    )
    lr = CalibratedClassifierCV(base_lr, method="sigmoid", cv=3)
    lr.fit(Xtr_s, ytr)
    p_lr = lr.predict_proba(Xva_s)

    
    p = 0.5 * p_knn + 0.5 * p_lr

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/10 [00:00<?, ?it/s]



In [None]:
    oof_prob[va] = p
    m = macro_metrics(yva, p, k=3)
    m.update({"fold": fold})
    fold_rows.append(m)
    print(f"[Fold {fold}] acc={m['accuracy']:.3f}  macroF1={m['macro_f1']:.3f}  microF1={m['micro_f1']:.3f}  top3={m['top3_acc']:.3f}  NLL={m['nll']:.3f}")

    
    del Xtr_s, Xva_s, base_lr, lr, knn; gc.collect()

cv_df = pd.DataFrame(fold_rows)
print("\nCV means:\n", cv_df.mean(numeric_only=True).to_frame("mean"))

[Fold 4] acc=0.967  macroF1=0.967  microF1=0.967  top3=0.996  NLL=0.227

CV means:
               mean
accuracy  0.966667
macro_f1  0.966613
micro_f1  0.966667
top3_acc  0.995833
nll       0.226691
fold      4.000000


In [8]:
y_true = y_all
summary = macro_metrics(y_true, oof_prob, k=3)
try:
    summary["auc_ovo"] = roc_auc_score(y_true, oof_prob, multi_class="ovo")
except Exception as e:
    summary["auc_ovo"] = np.nan
summary_df = pd.DataFrame([summary])
summary_df



Unnamed: 0,accuracy,macro_f1,micro_f1,top3_acc,nll,auc_ovo
0,0.226667,0.312232,0.226667,0.299167,12.799246,


In [None]:

HIGH_RISK = ["Pneumonia","Dengue","Typhoid","Hypertension","diabetes"]  
id_map = {y:i for i,y in enumerate(labels)}
hr_ids = [id_map[x] for x in HIGH_RISK if x in id_map]
y_pred = oof_prob.argmax(1)
y_true_hr = np.isin(y_true, hr_ids).astype(int)
y_pred_hr = np.isin(y_pred, hr_ids).astype(int)

from sklearn.metrics import f1_score, precision_score, recall_score
hr_metrics = {
    "F1_highrisk": f1_score(y_true_hr, y_pred_hr) if len(np.unique(y_true_hr))==2 else np.nan,
    "Recall_highrisk": recall_score(y_true_hr, y_pred_hr) if len(np.unique(y_true_hr))==2 else np.nan,
    "Precision_highrisk": precision_score(y_true_hr, y_pred_hr) if len(np.unique(y_true_hr))==2 else np.nan,
}
pd.DataFrame([hr_metrics])


Unnamed: 0,F1_highrisk,Recall_highrisk,Precision_highrisk
0,0.32107,0.192,0.979592


In [None]:

rng = np.random.default_rng(SEED)
sample_ids = rng.choice(len(df), size=5, replace=False)

from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(X_emb[sample_ids], X_emb)  
for i, idx in enumerate(sample_ids, start=1):
    topk = np.argsort(-sim[i-1])[:6]  
    print(f"\nCase {i} | label={df.loc[idx,'label']} | text={df.loc[idx,'text'][:120]}...")
    for j, nb in enumerate(topk):
        print(f"  #{j}: id={nb:4d}  sim={sim[i-1, nb]:.3f}  label={df.loc[nb,'label']}")



Case 1 | label=urinary tract infection | text=I have to use the restroom frequently to relieve myself, but I can't seem to get my bladder empty. I occasionally get se...
  #0: id= 926  sim=1.000  label=urinary tract infection
  #1: id= 938  sim=0.913  label=urinary tract infection
  #2: id= 912  sim=0.878  label=urinary tract infection
  #3: id= 928  sim=0.750  label=urinary tract infection
  #4: id= 914  sim=0.725  label=urinary tract infection
  #5: id= 940  sim=0.697  label=urinary tract infection

Case 2 | label=Arthritis | text=I've been experiencing stiffness and weakness in my neck muscles recently. Since my joints have grown, it's hard for me ...
  #0: id= 526  sim=1.000  label=Arthritis
  #1: id= 524  sim=1.000  label=Arthritis
  #2: id= 512  sim=0.958  label=Arthritis
  #3: id= 511  sim=0.954  label=Arthritis
  #4: id= 542  sim=0.933  label=Arthritis
  #5: id= 516  sim=0.932  label=Arthritis

Case 3 | label=Cervical spondylosis | text=I have been struggling with intense back

In [None]:
import ipywidgets as w
from IPython.display import display, HTML
from sklearn.metrics.pairwise import cosine_similarity


def predict_all(text, k=5):
    # Convert input text to embedding using the trained SentenceTransformer model
    text_emb = sbert.encode([text], convert_to_numpy=True, normalize_embeddings=False)

    sim = cosine_similarity(text_emb, X_emb)[0]

    topk_indices = np.argsort(-sim)[:k]
    topk_labels = [df.loc[i, 'label'] for i in topk_indices]
    topk_sims = [sim[i] for i in topk_indices]

    label_probs = {}
    for label, similarity in zip(topk_labels, topk_sims):
        label_probs[label] = label_probs.get(label, 0) + similarity

    total_sim = sum(label_probs.values())
    if total_sim > 0:
        label_probs = {label: prob / total_sim for label, prob in label_probs.items()}

    ranked_labels = sorted(label_probs.items(), key=lambda item: item[1], reverse=True)

    return ranked_labels

symptoms = w.Textarea(
    value="",
    placeholder="e.g., high fever, severe headache, muscle/joint pain, rash",
    description="Symptoms:",
    layout=w.Layout(width="100%", height="110px")
)
btn_predict = w.Button(description="Predict", button_style="primary")
btn_clear = w.Button(description="Clear")
out = w.Output()

def on_predict(_):
    with out:
        out.clear_output()
        text = (symptoms.value or "").strip()
        if not text:
            display(HTML("<b>Please type symptoms.</b>"))
            return
        ranked = predict_all(text)            
        top_label, top_prob = ranked[0]
        display(HTML(
            f"<h3>Predicted disease: {top_label}</h3>"
            f"<p><b>Probability:</b> {top_prob:.2%}</p>"
            "<p style='color:#666'>Research prototype — not medical advice.</p>"
        ))
        if len(ranked) > 1:
            display(HTML("<h4>Other potential diagnoses:</h4>"))
            for label, prob in ranked[1:]:
                display(HTML(f"<p>{label}: {prob:.2%}</p>"))


def on_clear(_):
    with out:
        out.clear_output()
    symptoms.value = ""

btn_predict.on_click(on_predict)
btn_clear.on_click(on_clear)

display(w.VBox([
    w.HTML("<h2>🩺 Symptom → Disease</h2>"),
    symptoms,
    w.HBox([btn_predict, btn_clear]),
    out
]))

VBox(children=(HTML(value='<h2>🩺 Symptom → Disease</h2>'), Textarea(value='', description='Symptoms:', layout=…

**Note:** The output of the prediction will appear below the "Predict" and "Clear" buttons after you enter symptoms and click "Predict".