In [2]:
# Run this cell only if packages are missing
!pip install -q scikit-learn pandas numpy nltk tqdm


In [3]:
!pip install openpyxl



In [4]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score
import nltk
nltk.download('punkt')  # for tokenization in cleaning (optional)


[nltk_data] Downloading package punkt to C:\Users\Karan
[nltk_data]     Purkait\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
#Load dataset and inspect
import pandas as pd

# The raw string path you fixed previously
path = r"C:\Users\Karan Purkait\Desktop\MEDPRED\cleaned_medical_dataset.xlsx"

# This line will now execute without the ImportError
df = pd.read_excel(path)

In [6]:
#Basic cleaning and exploration
# Inspect relevant columns
print("Columns:", df.columns.tolist())
# Replace empty strings with NaN and strip whitespace
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
# Show null counts
print(df.isnull().sum())

# Focus rows that have Symptoms (we cannot predict without Symptoms)
print("Rows with missing Symptoms:", df['Symptoms'].isnull().sum())
df = df[ df['Symptoms'].notna() ]  # drop rows without symptoms


Columns: ['PatientID', 'Age', 'Gender', 'Symptoms', 'Causes', 'Disease', 'Medicine', 'Allergies', 'Severity', 'PrescriptionCost', 'FollowUpRequired']
PatientID              0
Age                  767
Gender                 0
Symptoms               0
Causes                 0
Disease                0
Medicine               0
Allergies           1152
Severity               0
PrescriptionCost       0
FollowUpRequired       0
dtype: int64
Rows with missing Symptoms: 0


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [7]:
#Normalize and clean text (Symptoms and Medicine)
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    # remove punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    # collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Clean Symptoms
df['Symptoms_clean'] = df['Symptoms'].astype(str).apply(clean_text)

# Normalize Medicine column (split comma-separated)
def split_medicines(med_str):
    if pd.isnull(med_str): 
        return []
    # some records might use commas or semicolons
    parts = re.split(r"[;,/|]+", str(med_str))
    parts = [p.strip().lower() for p in parts if p.strip() and p.strip().lower() not in ['unknown', 'none', 'n/a']]
    return list(dict.fromkeys(parts))  # remove duplicates while preserving order

df['Medicine_list'] = df['Medicine'].apply(split_medicines)
# Clean Disease too
df['Disease_clean'] = df['Disease'].astype(str).apply(lambda x: x.strip() if pd.notna(x) else "unknown")
df['Disease_clean'] = df['Disease_clean'].replace({'unknown': np.nan})


In [8]:
# Check distributions: How many unique medicines
ml_counts = df['Medicine_list'].apply(len).value_counts().sort_index()
print("Medicine items per row distribution:\n", ml_counts)
# Show top medicines
from collections import Counter
all_meds = Counter([m for sub in df['Medicine_list'] for m in sub])
print("Top 20 medicines in dataset:")
print(all_meds.most_common(20))


Medicine items per row distribution:
 Medicine_list
0     747
1    2022
2    2231
Name: count, dtype: int64
Top 20 medicines in dataset:
[('rest', 1465), ('fluids', 504), ('pain relievers', 457), ('nsaids', 363), ('lifestyle', 316), ('antibiotics', 212), ('oral rehydration', 189), ('eye exercises', 173), ('dimenhydrinate', 171), ('antihistamine', 165), ('stretching', 164), ('isolation', 163), ('bland diet', 157), ('pt', 151), ('therapy', 144), ('medication', 144), ('antacids', 139), ('iron supplements', 109), ('levothyroxine', 105), ('cognitive therapy', 99)]


In [9]:
# Create dataset for disease prediction(Symptoms → Disease) — single-label classification
df_disease = df.dropna(subset=['Disease_clean']).copy()
print("Disease dataset size:", df_disease.shape)

# Create dataset for medicine prediction (only rows with at least 1 medicine)(Symptoms → Medicines) — multi-label classification
df_med = df[ df['Medicine_list'].apply(len) > 0 ].copy()
print("Medicine dataset size:", df_med.shape)


Disease dataset size: (5000, 14)
Medicine dataset size: (4253, 14)


In [10]:
##Feature extraction: TF-IDF on Symptoms
# Choose tf-idf settings (you can tune max_features)
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=5000, stop_words='english')

# Fit on whole corpus of symptoms (or only training split later)
tfidf.fit(df['Symptoms_clean'].values)
# Transform
X_all = tfidf.transform(df['Symptoms_clean'].values)


In [11]:
#Disease model: train/test split and model training
# Encode disease labels
le_disease = LabelEncoder()
y_disease = le_disease.fit_transform(df_disease['Disease_clean'])

# Split
X_train, X_test, y_train, y_test = train_test_split(
    tfidf.transform(df_disease['Symptoms_clean'].values), y_disease,
    test_size=0.2, random_state=42, stratify=y_disease
)

# Use a simple classifier (One-vs-Rest with LogisticRegression works well)
disease_clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
disease_clf.fit(X_train, y_train)

# Predictions & evaluation
y_pred = disease_clf.predict(X_test)
print("Disease Accuracy:", accuracy_score(y_test, y_pred))
print("Classification report (Disease):")
print(classification_report(y_test, y_pred, target_names=le_disease.classes_))


Disease Accuracy: 0.822
Classification report (Disease):
                               precision    recall  f1-score   support

                                    0.00      0.00      0.00         3
              Allergic Reacti       0.00      0.00      0.00         9
            Allergic Reaction       0.72      1.00      0.84        33
         Allergic Reaction An       0.00      0.00      0.00         4
             Anxiety Disorder       1.00      1.00      1.00        29
                    Arthritis       0.85      1.00      0.92        57
                       Asthma       0.00      0.00      0.00         7
                     COVID-19       1.00      1.00      1.00        33
           Chronic Bronchitis       0.00      0.00      0.00         3
              Chronic Fatigue       0.00      0.00      0.00        19
     Chronic Fatigue Syndrome       0.52      1.00      0.68        44
                  Common Cold       1.00      1.00      1.00        11
              Coron

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [12]:
#Medicine model: Multi-label preparation
# Prepare X and multi-label y
X = tfidf.transform(df_med['Symptoms_clean'].values)
mlb = MultiLabelBinarizer(sparse_output=False)
Y = mlb.fit_transform(df_med['Medicine_list'].values)

print("Number of distinct medicines (labels):", len(mlb.classes_))
# Optionally, drop extremely rare medicines (threshold)
label_counts = Y.sum(axis=0)
rare_threshold = 5  # drop medicines present in fewer than 5 records (tune as needed)
keep_mask = label_counts >= rare_threshold
print("Keeping labels:", keep_mask.sum(), " / ", len(mlb.classes_))

# Filter labels if desired:
if keep_mask.sum() < len(mlb.classes_):
    kept_labels = np.array(mlb.classes_)[keep_mask]
    # recreate smaller multi-label binarizer
    mlb2 = MultiLabelBinarizer(classes=list(kept_labels))
    Y = mlb2.fit_transform(df_med['Medicine_list'].values)
    mlb = mlb2  # replace
    print("Reduced number of labels to:", len(mlb.classes_))


Number of distinct medicines (labels): 47
Keeping labels: 47  /  47


In [13]:
#Train/test split and train medicine recommender
X_train_m, X_test_m, Y_train_m, Y_test_m = train_test_split(X, Y, test_size=0.2, random_state=42)

# Use One-vs-Rest logistic regression (fast & interpretable)
med_clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
med_clf.fit(X_train_m, Y_train_m)

# Predict and evaluate: for multi-label, use examples of metrics
Y_pred_m = med_clf.predict(X_test_m)

# Example metric: micro/macro f1 score
print("Medicine micro F1:", f1_score(Y_test_m, Y_pred_m, average='micro'))
print("Medicine macro F1:", f1_score(Y_test_m, Y_pred_m, average='macro'))


Medicine micro F1: 0.8956780923994039
Medicine macro F1: 0.6487784927329718


In [14]:
# Print top labels and metrics for a few important medicines
from sklearn.metrics import classification_report
print(classification_report(Y_test_m, Y_pred_m, target_names=mlb.classes_, zero_division=0))


                    precision    recall  f1-score   support

     acetaminophen       1.00      1.00      1.00         2
 albuterol inhaler       0.00      0.00      0.00         5
        amlodipine       1.00      1.00      1.00         9
          antacids       0.88      0.79      0.83        28
       antibiotics       1.00      1.00      1.00        49
     antihistamine       0.80      1.00      0.89        32
        antivirals       0.88      1.00      0.93        28
           aspirin       1.00      0.62      0.77        16
             ation       0.00      0.00      0.00         3
           axation       0.00      0.00      0.00         3
     beta-blockers       0.00      0.00      0.00         5
        bland diet       0.87      1.00      0.93        39
              cids       0.00      0.00      0.00         3
 cognitive therapy       1.00      1.00      1.00        24
      cpap machine       0.65      1.00      0.79        15
    dimenhydrinate       1.00      1.00

In [None]:
#Create an interactive prediction function
def predict_from_symptoms(symptom_text, top_k_meds=5):
    """
    Input: symptom_text (str)
    Output: predicted disease (label + probability), list of medicines with probabilities
    """
    s = clean_text(symptom_text)
    Xv = tfidf.transform([s])
    # disease
    try:
        disease_proba = disease_clf.decision_function(Xv)
        # For OneVsRest + logistic, decision_function returns array of shape (n_classes,)
        # Convert to probabilities via softmax-like scaling for single-label
        # We'll use argmax on decision function as predicted label:
        best_idx = int(disease_clf.predict(Xv)[0])
        pred_disease = le_disease.inverse_transform([best_idx])[0]
    except Exception as e:
        pred_disease = None

    # medicines prediction: get probability-like scores
    try:
        # For logistic regression, we can use predict_proba for each estimator, if available
        med_probs = []
        if hasattr(med_clf, "estimators_"):
            # compute probability for each label (OneVsRest estimators)
            for est in med_clf.estimators_:
                if hasattr(est, "predict_proba"):
                    prob = est.predict_proba(Xv)[0][1]
                else:
                    # fallback to decision_function and transform via sigmoid
                    val = est.decision_function(Xv)[0]
                    prob = 1.0 / (1.0 + np.exp(-val))
                med_probs.append(prob)
            med_probs = np.array(med_probs)
            top_indices = np.argsort(-med_probs)[:top_k_meds]
            meds = [(mlb.classes_[i], float(med_probs[i])) for i in top_indices if med_probs[i] > 0.05]  # threshold
        else:
            meds = []
    except Exception as e:
        meds = []

    return {
        "input_symptoms": symptom_text,
        "predicted_disease": pred_disease,
        "recommended_medicines": meds
    }

# Example usage
example = "stomach pain with liquide shit"
print(predict_from_symptoms(example, top_k_meds=7))


{'input_symptoms': 'stomach pain with liquide shit', 'predicted_disease': 'Gastroenteritis', 'recommended_medicines': [('fluids', 0.44950373894399276), ('bland diet', 0.3812715993417521), ('rest', 0.11109411214304554)]}


In [16]:
y_pred = disease_clf.predict(X_test)


In [None]:
#check accuracy
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Disease Model Accuracy:", round(accuracy * 100, 2), "%")

# Detailed per-class performance
print("\nClassification Report (Disease Model):")
print(classification_report(y_test, y_pred, target_names=le_disease.classes_))

# # Import the plotting libraries

# import matplotlib.pyplot as plt
# import seaborn as sns

# # Confusion matrix
# from sklearn.metrics import confusion_matrix

# cm = confusion_matrix(y_test, y_pred)
# plt.figure(figsize=(10, 8))
# sns.heatmap(cm, cmap='Blues',
#             xticklabels=le_disease.classes_,
#             yticklabels=le_disease.classes_,
#             fmt='d')
# plt.title('Disease Prediction Confusion Matrix')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.show()



Disease Model Accuracy: 82.2 %

Classification Report (Disease Model):
                               precision    recall  f1-score   support

                                    0.00      0.00      0.00         3
              Allergic Reacti       0.00      0.00      0.00         9
            Allergic Reaction       0.72      1.00      0.84        33
         Allergic Reaction An       0.00      0.00      0.00         4
             Anxiety Disorder       1.00      1.00      1.00        29
                    Arthritis       0.85      1.00      0.92        57
                       Asthma       0.00      0.00      0.00         7
                     COVID-19       1.00      1.00      1.00        33
           Chronic Bronchitis       0.00      0.00      0.00         3
              Chronic Fatigue       0.00      0.00      0.00        19
     Chronic Fatigue Syndrome       0.52      1.00      0.68        44
                  Common Cold       1.00      1.00      1.00        11
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [30]:
Y_pred_m = med_clf.predict(X_test_m)


In [None]:
#Medicine Recommendation Model Accuracy (Multi-label),check metrics
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report

# Example overall metrics
print("Micro F1 Score:", f1_score(Y_test_m, Y_pred_m, average='micro'))
print("Macro F1 Score:", f1_score(Y_test_m, Y_pred_m, average='macro'))
print("Micro Precision:", precision_score(Y_test_m, Y_pred_m, average='micro'))
print("Micro Recall:", recall_score(Y_test_m, Y_pred_m, average='micro'))

# Optional overall subset accuracy (exact match accuracy)
subset_acc = accuracy_score(Y_test_m, Y_pred_m)
print("Subset Accuracy (exact medicine match):", subset_acc)

# Detailed report per medicine
print("\nClassification Report (Medicine Model):")
print(classification_report(Y_test_m, Y_pred_m, target_names=mlb.classes_))


Micro F1 Score: 0.8956780923994039
Macro F1 Score: 0.6487784927329718
Micro Precision: 0.8844738778513613
Micro Recall: 0.9071698113207547
Subset Accuracy (exact medicine match): 0.8719153936545241

Classification Report (Medicine Model):
                    precision    recall  f1-score   support

     acetaminophen       1.00      1.00      1.00         2
 albuterol inhaler       0.00      0.00      0.00         5
        amlodipine       1.00      1.00      1.00         9
          antacids       0.88      0.79      0.83        28
       antibiotics       1.00      1.00      1.00        49
     antihistamine       0.80      1.00      0.89        32
        antivirals       0.88      1.00      0.93        28
           aspirin       1.00      0.62      0.77        16
             ation       0.00      0.00      0.00         3
           axation       0.00      0.00      0.00         3
     beta-blockers       0.00      0.00      0.00         5
        bland diet       0.87      1.00 

In [43]:
X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PatientID         5000 non-null   int64  
 1   Age               4233 non-null   float64
 2   Gender            5000 non-null   object 
 3   Symptoms          5000 non-null   object 
 4   Causes            5000 non-null   object 
 5   Medicine          5000 non-null   object 
 6   Allergies         3848 non-null   object 
 7   Severity          5000 non-null   object 
 8   PrescriptionCost  5000 non-null   int64  
 9   FollowUpRequired  5000 non-null   object 
dtypes: float64(1), int64(2), object(7)
memory usage: 390.8+ KB


In [19]:
#Saving trained model
import joblib

# ✅ Retrain your RandomForest model
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# ✅ Save model and label encoder again (overwrite old ones)
joblib.dump(rf, "disease_prediction_model.pkl")
joblib.dump(le_disease, "label_encoder.pkl")
print("✅ Model and encoder saved successfully!")



✅ Model and encoder saved successfully!


In [21]:
import pandas as pd

df = pd.read_excel("cleaned_medical_dataset.xlsx")
print(df.columns.tolist())
df.head()


['PatientID', 'Age', 'Gender', 'Symptoms', 'Causes', 'Disease', 'Medicine', 'Allergies', 'Severity', 'PrescriptionCost', 'FollowUpRequired']


Unnamed: 0,PatientID,Age,Gender,Symptoms,Causes,Disease,Medicine,Allergies,Severity,PrescriptionCost,FollowUpRequired
0,1,45.0,Male,"Fever, Cough",Viral Infection,Common Cold,"Ibuprofen, Rest",Dust,Mild,800,No
1,2,33.0,Female,"Headache, Fatigue",Stress,Migraine,Sumatriptan,Pollen,Severe,637,No
2,3,50.0,Male,Shortness of breath,Pollution,Asthma,Albuterol Inhaler,Peanuts,Severe,1082,Yes
3,4,37.0,Female,"Nausea, Vomiting",Food Poisoning,Gastroenteritis,Oral Rehydration,Pollen,Mild,1147,Yes
4,5,24.0,Male,Sore Throat,Bacterial Infection,Strep Throat,Penicillin,Pollen,Mild,1003,No


In [23]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_excel("cleaned_medical_dataset.xlsx")

# Target column
target_column = "Disease"

# Drop non-feature columns
X = df.drop(["PatientID", "Disease", "Medicine", "PrescriptionCost"], axis=1, errors="ignore")
y = df[target_column]

# Fill missing values or 'Unknown' text with 'None' to handle safely
X = X.fillna("None")
X = X.replace("Unknown", "None")

# Label encode categorical columns automatically
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))

# Encode target
le_disease = LabelEncoder()
y_encoded = le_disease.fit_transform(y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Save model and label encoder
joblib.dump(model, "disease_prediction_model.pkl")
joblib.dump(le_disease, "label_encoder.pkl")

print("✅ Model retrained successfully and saved as 'disease_prediction_model.pkl'")


✅ Model retrained successfully and saved as 'disease_prediction_model.pkl'


In [25]:
import pandas as pd

df = pd.read_excel("cleaned_medical_dataset.xlsx")
print(df.columns.tolist())


['PatientID', 'Age', 'Gender', 'Symptoms', 'Causes', 'Disease', 'Medicine', 'Allergies', 'Severity', 'PrescriptionCost', 'FollowUpRequired']
