In [1]:
import pandas as pd

# Load the training table from the CSV stored next to the notebook.
df = pd.read_csv("./dataset/Training.csv")

# info() prints its summary to stdout, so it can run mid-cell; head() only
# renders when it is the cell's final expression, so it goes last.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Columns: 134 entries, itching to Unnamed: 133
dtypes: float64(1), int64(132), object(1)
memory usage: 5.0+ MB


In [2]:
# Target: the disease name recorded for each row.
y = df['disease']

# Features: everything except the target and 'Unnamed: 133' (the column this
# notebook treats as empty), i.e. the symptom indicator columns.
X = df.drop(labels=['Unnamed: 133', 'disease'], axis="columns")


In [3]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the disease names, then map every name to its integer
# code. The fitted encoder is kept so codes can be turned back into names.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)            # integer class codes (0-40 here)



In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; stratifying on the encoded labels
# keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.20,
    stratify=y_enc,
    random_state=42,
)


In [5]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 400 trees with no depth limit; n_jobs=-1 asks scikit-learn
# to parallelise over all processors, and the seed is fixed for repeatability.
model = RandomForestClassifier(
    n_estimators=400, max_depth=None, n_jobs=-1, random_state=0
)
model.fit(X_train, y_train)


In [6]:
import numpy as np

# Per-class probability estimates for the held-out rows: one row per sample,
# one column per class.
probs = model.predict_proba(X_test)
print(np.shape(probs))


(984, 41)


In [7]:
# argsort is ascending, so the last three columns of each row are the three
# most probable classes; flip them so each row reads best -> third-best.
top3_idx = np.fliplr(np.argsort(probs, axis=1)[:, -3:])


In [8]:
# Flatten the index matrix into one long vector of codes, decode it back to
# disease names, then restore the original (rows x ranks) layout.
top3_labels = le.inverse_transform(top3_idx.reshape(-1)).reshape(top3_idx.shape)


In [9]:
# For every row, pick out the probabilities at that row's top-3 column
# positions, in the same best-first order as top3_idx.
top3_probs = probs[np.arange(probs.shape[0])[:, None], top3_idx]


In [10]:
# One row per held-out sample: its three predicted disease names, best first,
# plus the row's index label taken from X_test.
out = pd.DataFrame(top3_labels, columns=['rank1', 'rank2', 'rank3'])
out['sample_id'] = X_test.index
out.head()


Unnamed: 0,rank1,rank2,rank3,sample_id
0,Hypertension,Varicose veins,Urinary tract infection,101
1,Drug Reaction,Varicose veins,Urinary tract infection,3120
2,Dimorphic hemmorhoids(piles),Typhoid,AIDS,694
3,Hyperthyroidism,Malaria,Arthritis,2374
4,Osteoarthristis,Arthritis,Psoriasis,1163


In [11]:
# A sample is a hit when its true encoded label appears among its three
# highest-probability class indices.
hits = [true_label in ranked for true_label, ranked in zip(y_test, top3_idx)]
print("Top-3 accuracy:", np.mean(hits))


Top-3 accuracy: 1.0


In [12]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Five shuffled, class-stratified folds with a fixed seed so the fold
# assignment is the same on every run.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

def top3_acc(estimator, X, y):
    """Top-3 accuracy scorer with the ``scorer(estimator, X, y)`` signature.

    A sample counts as correct when its true label is one of the three
    classes to which the estimator assigns the highest probability.

    Parameters
    ----------
    estimator : fitted classifier
        Must provide ``predict_proba``. When it also exposes ``classes_``,
        probability columns are translated to class labels through it;
        otherwise the column positions themselves are used as labels.
    X : array-like
        Samples to score; passed unchanged to ``predict_proba``.
    y : array-like of shape (n_samples,)
        True labels. May be a NumPy array, a pandas Series or a plain list.

    Returns
    -------
    float
        Fraction of samples whose true label is among the top three.
    """
    import numpy as np  # local import keeps the scorer self-contained

    probs = np.asarray(estimator.predict_proba(X))
    # argsort is ascending, so the last three columns are the most probable.
    top3 = probs.argsort(axis=1)[:, -3:]

    # Column j of predict_proba belongs to classes_[j], which is only equal
    # to j when the training labels happen to be exactly 0..n_classes-1.
    classes = getattr(estimator, "classes_", None)
    if classes is not None:
        top3 = np.asarray(classes)[top3]

    # Coerce to ndarray first: `y[:, None]` is not supported on a pandas
    # Series, and a plain list has no multi-dimensional indexing at all.
    y_true = np.asarray(y)
    hits = (y_true[:, None] == top3).any(axis=1)
    return hits.mean()

# Score the classifier on each of the stratified folds with the custom
# top-3 scorer, then report the mean and spread across folds.
# NOTE(review): a perfect score with zero variance is worth double-checking,
# e.g. for duplicated rows shared between training and validation folds.
scores = cross_val_score(
    model,
    X,
    y_enc,
    scoring=top3_acc,
    cv=skf,
)
print("CV top-3 accuracy:", scores.mean(), "±", scores.std())


CV top-3 accuracy: 1.0 ± 0.0


In [None]:
import joblib, json

# Persist the three artifacts needed to reproduce predictions elsewhere.
joblib.dump(model, "rf_disease_model.joblib")   # the fitted classifier
joblib.dump(le, "label_encoder.joblib")         # class index <-> disease name

# Feature order the model was trained on. The context manager guarantees the
# file is flushed and closed; a bare open() passed to json.dump leaves the
# handle open until it is garbage-collected.
with open("symptom_columns.json", "w") as fh:
    json.dump(X.columns.tolist(), fh)


: 