In [1]:
# 📌 Install Required Libraries (if not already installed)
!pip install --quiet transformers datasets scikit-learn

# 📌 Check GPU availability
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print("✅ Using device:", device)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# Load Symptom2Disease CSV into a pandas DataFrame
import pandas as pd
data_path = "/content/Symptom2Disease.csv"
df = pd.read_csv(data_path)
print(df.shape)
print(df.head(2))


(1200, 3)
   Unnamed: 0      label                                               text
0           0  Psoriasis  I have been experiencing a skin rash on my arm...
1           1  Psoriasis  My skin has been peeling, especially on my kne...


In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encode disease labels as integers
le = LabelEncoder()
df['label_enc'] = le.fit_transform(df['label'])

# Split into train/validation (80%/20%)
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label_enc'], random_state=42)
print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}")


Train size: 960, Validation size: 240


In [5]:
from datasets import Dataset

# Create HF Datasets (with 'text' and 'label' fields)
train_dataset = Dataset.from_pandas(train_df[['text','label_enc']].rename(columns={'label_enc':'label'}))
val_dataset   = Dataset.from_pandas(val_df[['text','label_enc']].rename(columns={'label_enc':'label'}))

print(train_dataset[0])


{'text': "I'm feeling really nauseous and uneasy. I'm not sure what it might be. I've seen rashes on my arms and legs. I have lost my appetite and feel exhausted every day.", 'label': 4, '__index_level_0__': 189}


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ✅ Load BioBERT model and tokenizer
model_name = "dmis-lab/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(le.classes_)).to(device)

# ✅ Tokenize function
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

# ✅ Tokenize datasets
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

# ✅ Remove raw text (not needed anymore)
train_dataset = train_dataset.remove_columns(["text"])
val_dataset = val_dataset.remove_columns(["text"])

# ✅ Format datasets for PyTorch
train_dataset.set_format("torch")
val_dataset.set_format("torch")


pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [9]:
import torch
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW  # <- FIXED
from transformers import get_scheduler
from tqdm.auto import tqdm

# ✅ Prepare Dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# ✅ Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ✅ Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
num_training_steps = len(train_loader) * 3  # 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)

# ✅ Loss function
loss_fn = CrossEntropyLoss()



In [12]:
from datasets import Dataset

# 🔁 Recreate datasets with correct label field
train_dataset = Dataset.from_pandas(train_df[['text', 'label_enc']].rename(columns={'label_enc': 'labels'}))
val_dataset = Dataset.from_pandas(val_df[['text', 'label_enc']].rename(columns={'label_enc': 'labels'}))


In [17]:
print(train_dataset.column_names)
# Should output only: ['labels', 'input_ids', 'attention_mask']


['labels', 'input_ids', 'token_type_ids', 'attention_mask']


In [15]:
from datasets import Dataset

# ✅ Make a copy of the DataFrames and drop index before converting
train_df_clean = train_df[['text', 'label_enc']].rename(columns={'label_enc': 'labels'}).reset_index(drop=True)
val_df_clean   = val_df[['text', 'label_enc']].rename(columns={'label_enc': 'labels'}).reset_index(drop=True)

# ✅ Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df_clean)
val_dataset   = Dataset.from_pandas(val_df_clean)


In [16]:
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

train_dataset = train_dataset.remove_columns(["text"])
val_dataset = val_dataset.remove_columns(["text"])

train_dataset.set_format("torch")
val_dataset.set_format("torch")


Map:   0%|          | 0/960 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [19]:
from torch.optim import AdamW
from transformers import get_scheduler
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from tqdm.auto import tqdm

# Loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler("linear", optimizer=optimizer,
                             num_warmup_steps=0, num_training_steps=num_training_steps)

# Training Loop
EPOCHS = 15
for epoch in range(EPOCHS):
    print(f"\n🟢 Epoch {epoch+1}/{EPOCHS}")
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc="Training"):
        batch = {k: v.to(device) for k, v in batch.items()}  # ✅ No rename needed now
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"✅ Average Loss: {avg_loss:.4f}")



🟢 Epoch 1/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.7633

🟢 Epoch 2/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.3124

🟢 Epoch 3/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.1100

🟢 Epoch 4/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.0720

🟢 Epoch 5/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.0680

🟢 Epoch 6/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.0712

🟢 Epoch 7/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.0717

🟢 Epoch 8/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.0759

🟢 Epoch 9/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.0724

🟢 Epoch 10/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.0641

🟢 Epoch 11/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.0718

🟢 Epoch 12/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.0682

🟢 Epoch 13/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.0725

🟢 Epoch 14/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.0652

🟢 Epoch 15/15


Training:   0%|          | 0/60 [00:00<?, ?it/s]

✅ Average Loss: 1.0727


In [20]:
from sklearn.metrics import classification_report

def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())

    print(classification_report(all_labels, all_preds, target_names=le.classes_))

# 🔍 Evaluate
evaluate_model(model, val_loader)


                                 precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00        10
                      Arthritis       1.00      1.00      1.00        10
               Bronchial Asthma       1.00      1.00      1.00        10
           Cervical spondylosis       1.00      1.00      1.00        10
                    Chicken pox       0.86      0.60      0.71        10
                    Common Cold       0.91      1.00      0.95        10
                         Dengue       1.00      0.60      0.75        10
          Dimorphic Hemorrhoids       1.00      1.00      1.00        10
               Fungal infection       0.91      1.00      0.95        10
                   Hypertension       1.00      1.00      1.00        10
                       Impetigo       0.83      1.00      0.91        10
                       Jaundice       1.00      1.00      1.00        10
                        Malaria       1.00      1.

In [22]:
# Peek at one example
print(meddialog["train"][0])


{'description': 'throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.', 'utterances': {'speaker': [0, 1], 'utterance': ['throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.', "during this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold or influenza or other virus, or from some other cause such as allergies or irritants. usually, a person sees the doctor (call first) if the sore throat is bothersome, recurrent, or doesn't go away quickly. covid-19 infections tend to have cough, whereas strep throat usually lacks cough but has more throat pain. (3/21/20)"]}}


In [23]:
from datasets import load_dataset

# Load with trust_remote_code=True to bypass prompt
meddialog = load_dataset("bigbio/meddialog", "meddialog_en_source", trust_remote_code=True)

# Extract Q&A pairs
qa_pairs = []
for split in ["train", "validation", "test"]:
    for convo in meddialog[split]:
        utterances = convo["utterances"]
        for i in range(len(utterances) - 1):
            if (utterances[i]["speaker"].lower() == "patient" and
                utterances[i + 1]["speaker"].lower() == "doctor"):
                question = utterances[i]["utterance"]
                answer = utterances[i + 1]["utterance"]
                qa_pairs.append({"question": question, "answer": answer})

print(f"✅ Extracted {len(qa_pairs)} Q&A pairs.")



KeyError: 0

In [25]:
# 📌 Step 12: Load MedDialog Dataset (Fixed Version)
from datasets import load_dataset

# Load English doctor-patient conversations
print("\n🔍 Loading MedDialog dataset...")
meddialog = load_dataset("bigbio/meddialog", "meddialog_en_source")

# Extract doctor answers following patient questions
print("⚙️ Extracting Q&A pairs...")
doctor_answers = []

for split in ["train", "validation", "test"]:
    for convo in meddialog[split]:
        # Check different possible utterance formats
        try:
            # Modern format: utterances are dictionaries with 'speaker' and 'utterance' fields
            utterances = convo["utterances"]
            for i in range(len(utterances)-1):
                current = utterances[i]
                next_utterance = utterances[i+1]

                # Check if format is {speaker: ..., utterance: ...}
                if (current.get("speaker") == 0 and
                    next_utterance.get("speaker") == 1):
                    doctor_answers.append(next_utterance["utterance"])

        except KeyError:
            # Fallback for alternative format
            try:
                # Some versions use list of [speaker_id, utterance] pairs
                utterances = convo["utterances"]
                for i in range(len(utterances)-1):
                    current = utterances[i]
                    next_utt = utterances[i+1]
                    if current[0] == 0 and next_utt[0] == 1:
                        doctor_answers.append(next_utt[1])
            except:
                continue

print(f"✅ Total doctor answers collected: {len(doctor_answers)}")

# Verify sample output
print("\nSample doctor answer:", doctor_answers[0][:200] + "...")


🔍 Loading MedDialog dataset...
⚙️ Extracting Q&A pairs...
✅ Total doctor answers collected: 0


IndexError: list index out of range

In [26]:
# 📌 Inspect dataset structure
sample_convo = meddialog["train"][0]
print("\n📄 Sample conversation keys:", sample_convo.keys())
print("📄 First utterance structure:", sample_convo["utterances"][0])


📄 Sample conversation keys: dict_keys(['description', 'utterances'])


KeyError: 0

In [27]:
# 📌 Step 12: Robust MedDialog Processing (Fixed)
from datasets import load_dataset

print("\n🔍 Loading MedDialog dataset...")
meddialog = load_dataset("bigbio/meddialog", "meddialog_en_source")

def extract_answers(convo):
    """Handle different utterance formats safely"""
    answers = []
    utterances = convo["utterances"]

    # Check if utterances are in dictionary format
    if isinstance(utterances, list) and len(utterances) > 0:
        try:
            # Try modern dictionary format first
            for i in range(len(utterances)-1):
                current = utterances[i]
                next_utt = utterances[i+1]

                # Verify dictionary structure
                if isinstance(current, dict) and isinstance(next_utt, dict):
                    if current.get("speaker", -1) == 0 and next_utt.get("speaker", -1) == 1:
                        answers.append(next_utt.get("utterance", ""))
        except:
            # Fallback to list format
            try:
                for i in range(len(utterances)-1):
                    if (isinstance(utterances[i], (list, tuple)) and
                        isinstance(utterances[i+1], (list, tuple))):
                        if utterances[i][0] == 0 and utterances[i+1][0] == 1:
                            answers.append(utterances[i+1][1])
            except:
                pass
    return answers

print("⚙️ Extracting Q&A pairs...")
doctor_answers = []
for split in ["train", "validation", "test"]:
    for convo in meddialog[split]:
        doctor_answers.extend(extract_answers(convo))

print(f"✅ Total doctor answers collected: {len(doctor_answers)}")

# Verify output
if len(doctor_answers) > 0:
    print("\nSample doctor answer:", doctor_answers[0][:200] + "...")
else:
    print("\n❌ No answers extracted - checking dataset structure...")
    sample_convo = meddialog["train"][0]
    print("📄 Conversation keys:", sample_convo.keys())
    print("📄 Utterances type:", type(sample_convo["utterances"]))
    if isinstance(sample_convo["utterances"], list):
        print("📄 First utterance:", sample_convo["utterances"][0])


🔍 Loading MedDialog dataset...
⚙️ Extracting Q&A pairs...
✅ Total doctor answers collected: 0

❌ No answers extracted - checking dataset structure...
📄 Conversation keys: dict_keys(['description', 'utterances'])
📄 Utterances type: <class 'dict'>


In [29]:
# 📌 FINAL CORRECTED Step 12: Proper MedDialog Processing
from datasets import load_dataset

print("\n🔍 Loading MedDialog dataset...")
meddialog = load_dataset("bigbio/meddialog", "meddialog_en_source")

def extract_answers(convo):
    """Process the ACTUAL structure we're seeing:
    Each convo['utterances'] is a dict with:
    {'speaker': [list_of_speaker_ids], 'utterance': [list_of_texts]}
    """
    answers = []
    try:
        # Get parallel lists of speakers and utterances
        speakers = convo["utterances"]["speaker"]
        utterances = convo["utterances"]["utterance"]

        # Iterate through consecutive pairs
        for i in range(len(speakers)-1):
            if speakers[i] == 0 and speakers[i+1] == 1:
                answers.append(utterances[i+1])
    except KeyError as e:
        print(f"⚠️ Missing key in conversation: {e}")
    return answers

print("⚙️ Extracting Q&A pairs...")
doctor_answers = []
for split in ["train", "validation", "test"]:
    for convo in meddialog[split]:
        doctor_answers.extend(extract_answers(convo))

print(f"✅ Total doctor answers collected: {len(doctor_answers)}")

# Verify with first 5 answers
print("\nSample answers:")
for i, ans in enumerate(doctor_answers[:5]):
    print(f"{i+1}. {ans[:100]}...")


🔍 Loading MedDialog dataset...
⚙️ Extracting Q&A pairs...
✅ Total doctor answers collected: 614

Sample answers:
1. during this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold ...
2. yes. protection. it is not enough symptoms to say that you are a suspect case of covid19; but, indep...
3. possible. top symptoms include fever, dry cough and sob. an obvious possibility. if so, your best st...
4. in brief: symptoms if you are infected, symptoms will emerge: tiredness, dry cough, fever worsening ...
5. thanks for your question on healthcare magic.i can understand your concern. pneumonia with pregnancy...


In [30]:
# Inspect actual structure
sample_convo = meddialog["train"][0]
print("\n📄 Actual utterance structure:")
print("Speakers:", sample_convo["utterances"]["speaker"][:5])
print("Utterances:", sample_convo["utterances"]["utterance"][:1])


📄 Actual utterance structure:
Speakers: [0, 1]
Utterances: ['throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.']


In [33]:
# Step 13: Setup Semantic Search with Sentence-BERT
from sentence_transformers import SentenceTransformer, util
import torch

print("\nLoading Biomedical Sentence-BERT model...")
# Use smaller model if needed: 'all-MiniLM-L6-v2'
embedder = SentenceTransformer('pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb')
embedder = embedder.to(device)  # Explicitly move to GPU if available
print("Model loaded on:", embedder.device)


Loading Biomedical Sentence-BERT model...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded on: cuda:0


In [34]:
# Step 14: Encode Doctor Answers
print("\nEncoding answers...")
# Filter answers for quality
answer_texts = [ans.strip() for ans in doctor_answers if 20 < len(ans) < 500]
corpus_embeddings = embedder.encode(
    answer_texts,
    convert_to_tensor=True,
    show_progress_bar=True,
    batch_size=64  # Reduce if OOM errors
)
print(f"Encoded {len(answer_texts)} answers")


Encoding answers...


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Encoded 536 answers


In [35]:
# Step 15: Treatment Retrieval Function
def get_treatment_recommendations(disease_query, top_k=5):
    """Retrieve most relevant medical answers using semantic search"""
    # Encode the disease query
    query_embedding = embedder.encode(disease_query, convert_to_tensor=True)

    # Find top matches using cosine similarity
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

    # Return sorted results with scores
    return [(answer_texts[hit['corpus_id']], hit['score']) for hit in hits]

# Test retrieval
test_disease = "viral pharyngitis"  # From your dataset
print(f"\nTesting retrieval for: {test_disease}")
for i, (ans, score) in enumerate(get_treatment_recommendations(test_disease, 3)):
    print(f"[Result {i+1} | Relevance: {score:.3f}] {ans[:120]}...")

# =====================================
# 📝 Answer Summarization Setup
# =====================================


Testing retrieval for: viral pharyngitis
[Result 1 | Relevance: 0.499] cough,phlegm. at this time your symptoms are characteristic of a viral upper respiratory tract infection, and if your sp...
[Result 2 | Relevance: 0.491] virus. these symptoms are characteristic of a viral upper respiratory tract infection and treatment is resting, drinking...
[Result 3 | Relevance: 0.453] during this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold or influenza or othe...


In [36]:
# 📝 Answer Summarization Setup
# =====================================

# Step 16: Load Text Generation Model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

print("\nLoading text generation model...")
gen_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
gen_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(device)
print(f"Generation model loaded on {gen_model.device}")


Loading text generation model...


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Generation model loaded on cuda:0


In [37]:
# Step 17: Summary Generation Function
def generate_clinical_summary(disease_name, retrieved_answers):
    """Generate coherent treatment summary from multiple answers"""
    # Combine answers into context
    context = "\n".join([f"- {ans[0]}" for ans in retrieved_answers])

    # Create medical summary prompt
    prompt = f"""As a medical professional, synthesize treatment recommendations for {disease_name} using these insights:
{context}

Clinical Summary:"""

    # Generate summary
    inputs = gen_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
    outputs = gen_model.generate(
        inputs.input_ids,
        max_length=256,
        num_beams=3,
        temperature=0.7,
        early_stopping=True
    )

    return gen_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test generation
print("\nTesting summary generation...")
test_answers = get_treatment_recommendations(test_disease, 3)
summary = generate_clinical_summary(test_disease, test_answers)
print(f"\nGenerated Summary:\n{summary}")



Testing summary generation...





Generated Summary:
See your doctor if your symptoms are characteristic of a viral upper respiratory tract infection.


In [38]:
# =====================================
# 🖥️ Interactive Symptom Analysis
# =====================================

# Step 18: Integrated Pipeline
def symptom_to_treatment_pipeline():
    print("\n⚕️ Symptom Analysis System - Type 'exit' to quit")

    while True:
        user_input = input("\nPatient Symptoms: ")
        if user_input.lower() in ['exit', 'quit']:
            break

        # 1. Disease Classification
        inputs = tokenizer(user_input, return_tensors="pt", truncation=True, max_length=128).to(device)
        with torch.no_grad():
            outputs = model(**inputs)

        pred_label = torch.argmax(outputs.logits).item()
        disease = le.inverse_transform([pred_label])[0]
        print(f"\nDiagnosis: {disease}")

        # 2. Retrieve Treatment Answers
        treatments = get_treatment_recommendations(disease)

        # 3. Generate Summary
        if treatments:
            print("\nRecommended Treatment Approach:")
            print(generate_clinical_summary(disease, treatments[:3]))

            # Show sources option
            if input("\nShow detailed sources? (y/n): ").lower() == 'y':
                for i, (ans, score) in enumerate(treatments[:3]):
                    print(f"\nSource {i+1} [Relevance: {score:.2f}]:")
                    print(ans[:300] + "...")
        else:
            print("\nNo treatment recommendations found for this condition")

# Start the interactive system
symptom_to_treatment_pipeline()


⚕️ Symptom Analysis System - Type 'exit' to quit

Patient Symptoms: caught and cold

Diagnosis: drug reaction

Recommended Treatment Approach:




Synthesize treatment recommendations for drug reaction.

Show detailed sources? (y/n): y

Source 1 [Relevance: 0.38]:
tingling in extremities may be a sign of peripheral neuropathy, wich is most of the cases caused by inflammatory injury to the nerves. anti-tnf drugs, such as humira, have been reported to be associated to this kind of injury, that can be potentialized when associated to steroids. you should visit y...

Source 2 [Relevance: 0.34]:
in brief: fever antibiotics are supposed to prevent fevers, not cause them. if, however, you are having an allergic reaction to the medication it can cause a fever. but so can viral diseases (common cold, flu, covid-19, etc. ) that are not treatable with antibiotics that are designed to treat bacter...

Source 3 [Relevance: 0.34]:
in brief: being cautious best. any medication that can increase the potential of viral infections should be avoided during a coronavirus outbreak such as we are seeing now. would you like to video or text chat with m