<a href="https://colab.research.google.com/github/NehaNemali/Arogya_ai/blob/main/arogya_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load your dataset
df = pd.read_csv("Symptom2Disease.csv")
df = df[['text', 'label']].dropna()

# Encode target labels
label_encoder = LabelEncoder()
df['label_enc'] = label_encoder.fit_transform(df['label'])

# Train-validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label_enc'].tolist(), test_size=0.2, random_state=42
)

num_labels = len(label_encoder.classes_)


In [19]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class SymptomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encodings = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encodings['input_ids'].squeeze(),
            'attention_mask': encodings['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Datasets and Dataloaders
train_dataset = SymptomDataset(train_texts, train_labels, tokenizer)
val_dataset = SymptomDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


In [20]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);



In [21]:
from torch.optim import AdamW
from transformers import get_scheduler


optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs = 4
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

from torch.nn import CrossEntropyLoss
from tqdm import tqdm


In [22]:
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    loop = tqdm(train_loader, leave=True)
    total_loss = 0

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())


Epoch 1/4


Epoch 1: 100%|██████████| 60/60 [09:58<00:00,  9.97s/it, loss=2.8]


Epoch 2/4


Epoch 2: 100%|██████████| 60/60 [10:00<00:00, 10.02s/it, loss=2.16]


Epoch 3/4


Epoch 3: 100%|██████████| 60/60 [10:04<00:00, 10.08s/it, loss=1.7]


Epoch 4/4


Epoch 4: 100%|██████████| 60/60 [10:24<00:00, 10.40s/it, loss=1.6]


In [23]:
from sklearn.metrics import classification_report

model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Evaluation Report
print(classification_report(true_labels, predictions, target_names=label_encoder.classes_, zero_division=0))



                                 precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00         7
                      Arthritis       0.91      1.00      0.95        10
               Bronchial Asthma       0.92      1.00      0.96        11
           Cervical spondylosis       1.00      1.00      1.00         7
                    Chicken pox       0.78      0.58      0.67        12
                    Common Cold       0.92      0.92      0.92        12
                         Dengue       0.86      0.50      0.63        12
          Dimorphic Hemorrhoids       1.00      1.00      1.00         7
               Fungal infection       0.87      1.00      0.93        13
                   Hypertension       0.77      1.00      0.87        10
                       Impetigo       0.79      1.00      0.88        11
                       Jaundice       1.00      1.00      1.00        11
                        Malaria       1.00      1.