In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import re
from google.colab import drive


In [26]:

# Attach Google Drive to the Colab filesystem at the mount point below.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# CSV location on the mounted Drive.
path = "/content/drive/MyDrive/veterinary_clinical_data.csv"

# Read the whole file into a DataFrame, report (rows, columns),
# then preview the first rows as the cell's displayed value.
df = pd.read_csv(filepath_or_buffer=path)
print(df.shape)
df.head()

(10000, 10)


Unnamed: 0,AnimalName,Breed,Age,Weight_kg,MedicalHistory,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5
0,Dog,Rottweiler,6.0,32.1,Chronic illness,Anorexia,Hydrophobia,drooping ears,Diarrhea,Shyness or aggression
1,Dog,Bulldog,9.9,18.5,Vaccinated,Lethargy,Weakness,Horny growth,Fever,Coughing
2,Dog,Beagle,13.9,18.9,Parasite history,Pain,Weight loss,Weight loss,Sneezing,Drop on egg production
3,cat,Scottish Fold,5.8,6.3,Recent surgery,Vomiting,Pain,Edema in lower jaw,Pain,Weakness
4,cat,Persian,11.2,4.9,Recent surgery,Severe,Weight loss,Egg production stops,Weakness,Poor Body condition


In [27]:
# Collect every column whose name starts with "symptom" (case-insensitive).
symptom_cols = [col for col in df.columns if col.lower().startswith("symptom")]

# Missing symptoms become empty strings so that each row can be joined into a
# single space-separated text field.
symptoms_as_text = df[symptom_cols].fillna("").astype(str)
df["combined_symptoms"] = symptoms_as_text.agg(" ".join, axis=1)

# Text normaliser shared by training data and interactive input
def clean_text(s):
    """Normalise free text to lowercase ASCII letters/digits separated by single spaces.

    Args:
        s: The string to normalise.

    Returns:
        The lowercased string in which every character other than ``a-z``,
        ``0-9`` or whitespace has been replaced by a space, runs of whitespace
        have been collapsed to one space, and leading/trailing whitespace has
        been removed.
    """
    lowered = s.lower()
    # Punctuation and any other non-alphanumeric character act as separators.
    separated = re.sub(r"[^a-z0-9\s]", " ", lowered)
    # Collapse the whitespace runs produced above, then trim both ends.
    collapsed = re.sub(r"\s+", " ", separated)
    return collapsed.strip()

# Normalise the combined symptom text element by element.
df["combined_symptoms"] = df["combined_symptoms"].map(clean_text)

# Features: cleaned symptom text plus breed.
# Label: the MedicalHistory column cast to str (printed below as "Diseases").
feature_columns = ["combined_symptoms", "Breed"]
X = df[feature_columns]
y = df["MedicalHistory"].astype(str)

# Quick look at the label space: number of distinct labels and the first ten.
print("Unique Diseases:", y.nunique())
print("Sample Diseases:", y.unique()[:10])



Unique Diseases: 11
Sample Diseases: ['Chronic illness' 'Vaccinated' 'Parasite history' 'Recent surgery'
 'Dental issues' 'Previous heart condition' 'Previous kidney disease'
 'Allergies' 'Skin conditions history' 'No previous conditions']


In [28]:
# Hold out 20% of the rows for testing, stratified on the label, with a fixed
# seed so the split is repeatable.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [29]:
# Per-column encoders: TF-IDF over unigrams and bigrams (capped at 2000 terms)
# for the symptom text, one-hot for breed with unseen breeds ignored.
symptom_vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
breed_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        # A bare column name hands the vectorizer a 1-D column of strings.
        ("text", symptom_vectorizer, "combined_symptoms"),
        # A list of names hands the encoder a 2-D selection.
        ("breed", breed_encoder, ["Breed"]),
    ]
)

# Preprocessing followed by a seeded 200-tree random forest.
forest = RandomForestClassifier(n_estimators=200, random_state=42)
model = Pipeline(steps=[("pre", preprocessor), ("clf", forest)])

# Fit preprocessing and classifier together on the training split.
model.fit(X_train, y_train)

# Score the held-out split: overall accuracy plus the per-class report.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)

print("🌳 Random Forest Results:")
print("Accuracy:", test_accuracy)
print(per_class_report)



🌳 Random Forest Results:
Accuracy: 0.345
                          precision    recall  f1-score   support

               Allergies       0.31      0.30      0.31       179
         Chronic illness       0.33      0.36      0.35       188
           Dental issues       0.40      0.38      0.39       178
  No previous conditions       0.34      0.36      0.35       199
          Not vaccinated       0.38      0.42      0.40       184
        Parasite history       0.28      0.28      0.28       178
Previous heart condition       0.33      0.31      0.32       160
 Previous kidney disease       0.31      0.26      0.28       174
          Recent surgery       0.40      0.40      0.40       191
 Skin conditions history       0.37      0.39      0.38       203
              Vaccinated       0.31      0.31      0.31       166

                accuracy                           0.34      2000
               macro avg       0.34      0.34      0.34      2000
            weighted avg       0.

In [35]:
# Doctor directory: maps each MedicalHistory label the model can predict to a
# placeholder doctor name and specialty.
#
# Every label must appear exactly once. A dict literal silently keeps only the
# LAST value for a repeated key, and the earlier version listed
# "Parasite history" three times, which left "Not vaccinated" and
# "No previous conditions" without any entry.
# NOTE(review): the two formerly duplicated entries were re-keyed to the two
# missing labels; which doctor goes with which label is an assumption — confirm.
doctor_directory = {
    "Chronic illness": {"Doctor": "Dr. Nikhil Sinha", "Specialist": "Infectious Disease Specialist"},
    "Vaccinated": {"Doctor": "Dr. Riya Sharma", "Specialist": "Dermatologist"},
    "Parasite history": {"Doctor": "Dr. Vivek Joshi", "Specialist": "ENT Specialist"},
    "Recent surgery": {"Doctor": "Dr. Karan Mehta", "Specialist": "Internal Medicine Specialist"},
    "Dental issues": {"Doctor": "Dr. Priya Das", "Specialist": "Parasitologist"},
    "Previous heart condition": {"Doctor": "Dr. Ananya Menon", "Specialist": "Gastroenterologist"},
    "Previous kidney disease": {"Doctor": "Dr. Sneha Patel", "Specialist": "Orthopedic Specialist"},
    "Allergies": {"Doctor": "Dr. Shalini Iyer", "Specialist": "Infectious Disease Expert"},
    "Not vaccinated": {"Doctor": "Dr. Rohan Kapoor", "Specialist": "Dental Surgeon"},
    "Skin conditions history": {"Doctor": "Dr. Kavita Rao", "Specialist": "Ophthalmologist"},
    "No previous conditions": {"Doctor": "Dr. Aditi Singh", "Specialist": "General Physician"},
}

# === Interactive prediction and doctor lookup ===
# Read the raw inputs, then normalise the symptom text with the same cleaner
# that was applied to the training column.
raw_symptoms = input("Enter symptoms (space or comma separated): ").strip()
breed = input("Enter breed: ").strip()
symptoms = clean_text(raw_symptoms)

# One-row frame carrying the two columns the pipeline was fitted on.
new_data = pd.DataFrame([{"combined_symptoms": symptoms, "Breed": breed}])

predicted_disease = model.predict(new_data)[0]
print(f"\n🐶 Predicted Disease: {predicted_disease}")

# Doctor allocation: fall back to a generic message when the label has no entry.
doc_info = doctor_directory.get(predicted_disease)
if doc_info is None:
    print("⚕️ No specific doctor found — assign to General Vet Specialist.")
else:
    print(f"👨‍⚕️ Assigned Doctor: {doc_info['Doctor']}")
    print(f"🔬 Specialist In: {doc_info['Specialist']}")




Enter symptoms (space or comma separated): Diarrhea Weight loss Lethargy Succumb Pain 
Enter breed: Boxer

🐶 Predicted Disease: Dental issues
👨‍⚕️ Assigned Doctor: Dr. Priya Das
🔬 Specialist In: Parasitologist
