In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pickle

# Load dataset
df = pd.read_csv("data/disease_dataset.csv")

# Combine all symptom columns into a list
symptom_cols = [col for col in df.columns if col.startswith("Symptom")]
df[symptom_cols] = df[symptom_cols].fillna("")

# Merge symptoms into one column
df['all_symptoms'] = df[symptom_cols].values.tolist()
df['all_symptoms'] = df['all_symptoms'].apply(lambda x: [i for i in x if i])  # remove empty

# Use MultiLabelBinarizer to create binary features for each symptom
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['all_symptoms'])

# Target
y = df['Disease']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Save model and symptom labels
with open("model/model.pkl", "wb") as f:
    pickle.dump((model, mlb.classes_), f)

print("✅ Model trained and saved.")


✅ Model trained and saved.
