In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
df = pd.read_csv("mild_illness_prescription_dataset.csv")
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["symptoms"])
le = LabelEncoder()
y = le.fit_transform(df["condition_label"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("📊 Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))
print("🧩 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
import joblib
joblib.dump(model, "illness_model.pkl")
joblib.dump(vectorizer, "symptom_vectorizer.pkl")
joblib.dump(le, "label_encoder.pkl")

📊 Classification Report:
                   precision    recall  f1-score   support

          acidity       1.00      1.00      1.00         9
allergic_rhinitis       1.00      1.00      1.00        10
        body_pain       1.00      1.00      1.00         5
       cold_sores       1.00      1.00      1.00         4
      common_cold       1.00      1.00      1.00        16
     constipation       1.00      1.00      1.00         7
      dehydration       1.00      1.00      1.00         7
         diarrhea       1.00      1.00      1.00         8
        dry_cough       1.00      1.00      1.00         4
       eye_strain       1.00      1.00      1.00        11
         headache       1.00      1.00      1.00        10
      indigestion       1.00      1.00      1.00         7
      insect_bite       1.00      0.86      0.92         7
 menstrual_cramps       1.00      1.00      1.00         8
       mild_fever       1.00      1.00      1.00         8
mild_skin_allergy       1.00  

['label_encoder.pkl']

In [2]:
pip install transformers datasets scikit-learn pandas torch





In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm
df = pd.read_csv("mild_illness_prescription_dataset.csv")
le = LabelEncoder()
df["label"] = le.fit_transform(df["condition_label"])
import joblib
joblib.dump(le, "label_encoder_distilbert.pkl")
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
class SymptomDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(df["symptoms"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42)

train_dataset = SymptomDataset(train_texts, train_labels)
val_dataset = SymptomDataset(val_texts, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(le.classes_))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 4
for epoch in range(epochs):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        loop.set_postfix(loss=loss.item())

print("✅ Training complete!")
model.save_pretrained("distilbert-illness-2")
tokenizer.save_pretrained("distilbert-illness-2")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:37<00:00,  1.33it/s, loss=1.37]
Epoch 2: 100%|█████████████████████████████████████████████████████████████| 50/50 [00:38<00:00,  1.31it/s, loss=0.409]
Epoch 3: 100%|█████████████████████████████████████████████████████████████| 50/50 [00:37<00:00,  1.34it/s, loss=0.117]
Epoch 4: 100%|████████████████████████████████████████████████████████████| 50/50 [00:38<00:00,  1.31it/s, loss=0.0908]


✅ Training complete!


('distilbert-illness-2\\tokenizer_config.json',
 'distilbert-illness-2\\special_tokens_map.json',
 'distilbert-illness-2\\vocab.txt',
 'distilbert-illness-2\\added_tokens.json',
 'distilbert-illness-2\\tokenizer.json')

In [None]:
import torch
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import joblib
model = DistilBertForSequenceClassification.from_pretrained("distilbert-illness")
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-illness")
label_encoder = joblib.load("label_encoder_distilbert.pkl")
df = pd.read_csv("mild_illness_prescription_dataset.csv")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
while True:
    symptom_text = input("\n📝 Enter symptoms (or type 'exit' to quit): ").strip()
    if symptom_text.lower() == 'exit':
        print("Exiting prediction loop.")
        break
    inputs = tokenizer(symptom_text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        pred_class = torch.argmax(probs, dim=1).item()
        confidence = torch.max(probs).item()

    predicted_label = label_encoder.inverse_transform([pred_class])[0]
    prescription = df[df["condition_label"] == predicted_label]["medicine_name"].values
    prescription = prescription[0] if len(prescription) > 0 else "No prescription found."
    print(f"💡 Predicted illness: {predicted_label} (Confidence: {confidence:.2%})")
    print(f"💊 Recommended prescription: {prescription}")


📝 Enter symptoms (or type 'exit' to quit):  lip blisters


💡 Predicted illness: cold_sores (Confidence: 95.35%)
💊 Recommended prescription: Abzorb
