In [1]:
# Cell 1: Install libraries and Load Dataset
!pip install datasets scikit-learn joblib tqdm

from datasets import load_dataset
import pandas as pd
import re
import numpy as np
import joblib
from scipy import sparse
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from tqdm import tqdm

print("Libraries installed and imported.")

# Load the dataset (using the specified split)
print("Loading dataset...")
try:
    dataset = load_dataset("kamruzzaman-asif/Diseases_Dataset", split="dhivyeshrk")
    print("Dataset loaded successfully!")
    print("\nFirst entry example:")
    print(dataset[0])
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Handle error appropriately, maybe stop execution or use a fallback

Libraries installed and imported.
Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/QuyenAnh-00000-of-00001.parquet:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

data/ventis-00000-of-00001.parquet:   0%|          | 0.00/135k [00:00<?, ?B/s]

data/celikmus-00000-of-00001.parquet:   0%|          | 0.00/620k [00:00<?, ?B/s]

data/duxTecblic-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

data/dhivyeshrk-00000-of-00001.parquet:   0%|          | 0.00/3.23M [00:00<?, ?B/s]

data/IndianServers-00000-of-00001.parque(…):   0%|          | 0.00/59.6k [00:00<?, ?B/s]

data/itachi9604-00000-of-00001.parquet:   0%|          | 0.00/34.4k [00:00<?, ?B/s]

data/symptom2disease-00000-of-00001.parq(…):   0%|          | 0.00/76.7k [00:00<?, ?B/s]

Generating QuyenAnh split:   0%|          | 0/400 [00:00<?, ? examples/s]

Generating ventis split:   0%|          | 0/6661 [00:00<?, ? examples/s]

Generating celikmus split:   0%|          | 0/1058 [00:00<?, ? examples/s]

Generating duxTecblic split:   0%|          | 0/5634 [00:00<?, ? examples/s]

Generating dhivyeshrk split:   0%|          | 0/246945 [00:00<?, ? examples/s]

Generating IndianServers split:   0%|          | 0/796 [00:00<?, ? examples/s]

Generating itachi9604 split:   0%|          | 0/4920 [00:00<?, ? examples/s]

Generating symptom2disease split:   0%|          | 0/1200 [00:00<?, ? examples/s]

Dataset loaded successfully!

First entry example:
{'Disease': 'panic disorder', 'Symptoms': 'anxiety and nervousness, shortness of breath, depressive or psychotic symptoms, chest tightness, palpitations, irregular heartbeat, breathing fast', 'Treatments': ''}


In [2]:
# Cell 2: Clean and Prepare Data
if 'dataset' in locals(): # Check if dataset loaded successfully
    print("Converting to DataFrame and cleaning data...")
    df = pd.DataFrame(dataset)

    # Rename columns for easier access
    df = df.rename(columns={"Disease": "disease", "Symptoms": "symptoms", "Treatments": "treatments"})

    # Keep only necessary columns
    df = df[["disease", "symptoms", "treatments"]]

    # --- Data Cleaning ---
    # 1. Drop rows where 'symptoms' is missing or empty
    df = df.dropna(subset=['symptoms'])
    df = df[df["symptoms"].str.strip() != ""]

    # 2. Fill missing 'treatments' with a standard placeholder
    df['treatments'] = df['treatments'].fillna("No treatment information available.")
    df['treatments'] = df['treatments'].replace("", "No treatment information available.")
    df['treatments'] = df['treatments'].str.strip() # Remove leading/trailing spaces

    # 3. Define text cleaning function
    def clean_text(text):
        if not isinstance(text, str): # Ensure input is a string
            return ""
        text = text.lower()
        # Remove characters except letters, numbers, spaces, and commas
        text = re.sub(r'[^a-z0-9 ,]', ' ', text)
        # Replace multiple spaces/commas with a single space
        text = re.sub(r'\s+|,+', ' ', text).strip()
        return text

    # Apply cleaning
    df["symptoms_clean"] = df["symptoms"].apply(clean_text)
    df["treatments_clean"] = df["treatments"].apply(clean_text)

    # Remove rows where cleaning resulted in empty symptoms
    df = df[df["symptoms_clean"].str.strip() != ""]

    print(f"Data cleaned. Shape: {df.shape}")
    print("\nCleaned DataFrame head:")
    print(df.head())
else:
    print("Dataset not loaded in Cell 1. Cannot proceed.")

Converting to DataFrame and cleaning data...
Data cleaned. Shape: (246945, 5)

Cleaned DataFrame head:
          disease                                           symptoms  \
0  panic disorder  anxiety and nervousness, shortness of breath, ...   
1  panic disorder  shortness of breath, depressive or psychotic s...   
2  panic disorder  anxiety and nervousness, depression, shortness...   
3  panic disorder  anxiety and nervousness, depressive or psychot...   
4  panic disorder  anxiety and nervousness, depression, insomnia,...   

                            treatments  \
0  No treatment information available.   
1  No treatment information available.   
2  No treatment information available.   
3  No treatment information available.   
4  No treatment information available.   

                                      symptoms_clean  \
0  anxiety and nervousness  shortness of breath  ...   
1  shortness of breath  depressive or psychotic s...   
2  anxiety and nervousness  depression  sho

In [3]:
# Cell 3: Group Data by Disease
if 'df' in locals(): # Check if df exists
    print("Grouping symptoms and treatments by unique disease...")

    # 1. Group symptoms: Concatenate all cleaned symptoms for each disease
    disease_symptoms_docs = df.groupby("disease")["symptoms_clean"].apply(lambda texts: " ".join(texts)).reset_index()
    print(f"Grouped symptoms for {len(disease_symptoms_docs)} unique diseases.")

    # 2. Group treatments: Find the first *meaningful* treatment listed for each disease
    def get_first_valid_treatment(treatments):
        for t in treatments:
            cleaned_t = t.strip()
            if cleaned_t and cleaned_t != "no treatment information available":
                return cleaned_t # Return the first non-empty, non-placeholder treatment
        return "No specific treatment information found in dataset." # Fallback

    disease_treatments_docs = df.groupby("disease")["treatments_clean"].apply(get_first_valid_treatment).reset_index()
    print("Found representative treatment for each disease.")

    # 3. Merge grouped symptoms and treatments
    disease_docs = pd.merge(disease_symptoms_docs, disease_treatments_docs, on="disease")

    # Remove diseases where the combined symptoms are still empty after grouping (unlikely but safe)
    disease_docs = disease_docs[disease_docs["symptoms_clean"].str.strip() != ""]

    print(f"Final master documents created for {len(disease_docs)} diseases.")
    print("\nMaster documents DataFrame head:")
    print(disease_docs.head())
else:
    print("DataFrame 'df' not created in Cell 2. Cannot proceed.")

Grouping symptoms and treatments by unique disease...
Grouped symptoms for 773 unique diseases.
Found representative treatment for each disease.
Final master documents created for 773 diseases.

Master documents DataFrame head:
                     disease  \
0  abdominal aortic aneurysm   
1           abdominal hernia   
2            abscess of nose   
3        abscess of the lung   
4     abscess of the pharynx   

                                      symptoms_clean  \
0  shortness of breath  palpitations  back pain s...   
1  groin mass  symptoms of the scrotum and testes...   
2  sore throat  vomiting  fever  sinus congestion...   
3  depressive or psychotic symptoms  itchy eyelid...   
4  sharp chest pain  difficulty in swallowing  fe...   

                                    treatments_clean  
0  No specific treatment information found in dat...  
1  No specific treatment information found in dat...  
2  No specific treatment information found in dat...  
3  No specific treatme

In [4]:
# Cell 4: Vectorize Symptoms and Save Artifacts
if 'disease_docs' in locals(): # Check if disease_docs exists
    print("Creating TF-IDF matrix...")

    # Initialize TF-IDF Vectorizer
    # Parameters:
    # - ngram_range=(1,2): Use both single words (unigrams) and pairs of words (bigrams)
    # - min_df=3: Ignore terms that appear in less than 3 documents (diseases)
    # - stop_words='english': Remove common English words (like 'the', 'is', 'in')
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, stop_words='english')

    # Fit the vectorizer and transform the symptom documents
    X = vectorizer.fit_transform(disease_docs["symptoms_clean"])

    print(f"TF-IDF Matrix created. Shape: {X.shape}") # Shape should be (num_diseases, num_features)

    # --- Prepare artifacts for saving ---
    # 1. List of diseases (must match the order of rows in X)
    diseases_list = disease_docs["disease"].tolist()

    # 2. Dictionary mapping disease name to treatment info
    treatments_dict = pd.Series(disease_docs.treatments_clean.values, index=disease_docs.disease).to_dict()

    # --- Save the 4 essential files ---
    print("Saving artifacts...")
    try:
        joblib.dump(vectorizer, "vectorizer.pkl")
        joblib.dump(diseases_list, "diseases_list.pkl")
        joblib.dump(treatments_dict, "treatments.pkl")
        sparse.save_npz("disease_tfidf.npz", X)
        print("Successfully saved: vectorizer.pkl, diseases_list.pkl, treatments.pkl, disease_tfidf.npz")
    except Exception as e:
        print(f"Error saving files: {e}")

else:
    print("DataFrame 'disease_docs' not created in Cell 3. Cannot proceed.")

Creating TF-IDF matrix...
TF-IDF Matrix created. Shape: (773, 2872)
Saving artifacts...
Successfully saved: vectorizer.pkl, diseases_list.pkl, treatments.pkl, disease_tfidf.npz


In [6]:
# Cell 5: Evaluate Model Accuracy (Corrected for Stratify Error)
if 'df' in locals() and 'X' in locals(): # Check prerequisites
    print("Preparing data for evaluation...")

    # --- FIX STARTS HERE ---
    # Calculate how many times each disease appears
    disease_counts = df['disease'].value_counts()

    # Identify diseases that appear at least twice (needed for stratify)
    diseases_to_keep = disease_counts[disease_counts >= 2].index.tolist()

    # Filter the original DataFrame to keep only diseases with >= 2 samples
    df_eval = df[df['disease'].isin(diseases_to_keep)]
    print(f"Removed {len(df) - len(df_eval)} rows belonging to single-occurrence diseases.")
    print(f"Using {len(df_eval)} rows for evaluation split.")
    # --- FIX ENDS HERE ---

    # Check if we still have data left after filtering
    if len(df_eval) == 0:
        print("No data remaining after filtering for stratification. Cannot evaluate.")
    else:
        print("Performing Train/Test split for evaluation...")
        # Split the *filtered* data
        train_df, test_df = train_test_split(
            df_eval, # Use the filtered DataFrame
            test_size=0.2,
            random_state=42,
            stratify=df_eval['disease'] # Stratify on the filtered DataFrame
        )
        print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

        # --- Build the model using ONLY the training data ---
        print("Creating model based *only* on training data...")
        # Group training symptoms
        train_disease_symptoms = train_df.groupby("disease")["symptoms_clean"].apply(lambda x: " ".join(x)).reset_index()

        # Get list of diseases present in the training set
        train_diseases_list = train_disease_symptoms["disease"].tolist()

        # Filter test set to only include diseases seen during training (redundant now but safe)
        test_df_filtered = test_df[test_df['disease'].isin(train_diseases_list)]
        print(f"Evaluating on {len(test_df_filtered)} test samples.")

        # Fit a *new* vectorizer ONLY on the training disease documents
        eval_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, stop_words='english')
        X_train = eval_vectorizer.fit_transform(train_disease_symptoms["symptoms_clean"])
        print(f"Evaluation TF-IDF matrix shape: {X_train.shape}")

        # --- Evaluate on the test set ---
        print("Calculating Top-1 and Top-3 accuracy...")
        correct_top1 = 0
        correct_top3 = 0
        total_evaluated = len(test_df_filtered)

        if total_evaluated == 0:
            print("No test samples to evaluate after filtering. Check data splitting.")
        else:
            # Use tqdm for progress bar
            for _, row in tqdm(test_df_filtered.iterrows(), total=total_evaluated, desc="Evaluating"):
                true_disease = row["disease"]
                symptoms = row["symptoms_clean"]

                # Transform symptom using the evaluation vectorizer
                q_eval = eval_vectorizer.transform([symptoms])

                # Calculate similarity against the training matrix
                sims_eval = cosine_similarity(q_eval, X_train).flatten()

                # Get top 3 predictions based on training diseases
                top_3_indices = np.argsort(sims_eval)[::-1][:3]
                # Map indices back to disease names from the training list
                top_3_diseases = [train_diseases_list[i] for i in top_3_indices]

                # Check accuracy
                if top_3_diseases: # Ensure predictions were made
                    if true_disease == top_3_diseases[0]:
                        correct_top1 += 1
                    if true_disease in top_3_diseases:
                        correct_top3 += 1

            # --- Display Results ---
            print("\n--- Evaluation Results ---")
            print(f"Top-1 Accuracy: {correct_top1 / total_evaluated * 100:.2f}%")
            print(f"Top-3 Accuracy: {correct_top3 / total_evaluated * 100:.2f}%")
            print("--------------------------")

else:
      print("Required variables ('df', 'X') not available. Skipping evaluation.")

Preparing data for evaluation...
Removed 19 rows belonging to single-occurrence diseases.
Using 246926 rows for evaluation split.
Performing Train/Test split for evaluation...
Train size: 197540, Test size: 49386
Creating model based *only* on training data...
Evaluating on 49386 test samples.
Evaluation TF-IDF matrix shape: (754, 2817)
Calculating Top-1 and Top-3 accuracy...


Evaluating: 100%|██████████| 49386/49386 [01:21<00:00, 604.84it/s]


--- Evaluation Results ---
Top-1 Accuracy: 78.31%
Top-3 Accuracy: 92.63%
--------------------------





In [7]:
# Cell 6: Download Artifacts
print("Attempting to download the 4 necessary files...")
try:
    files.download("vectorizer.pkl")
    files.download("diseases_list.pkl")
    files.download("treatments.pkl")
    files.download("disease_tfidf.npz")
    print("Downloads initiated. Check your browser.")
except NameError:
    print("Looks like 'files' module is not available (not running in Colab?). Cannot download.")
except Exception as e:
    print(f"An error occurred during download: {e}")

Attempting to download the 4 necessary files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloads initiated. Check your browser.
