In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the training table; it holds the symptom columns and the 'prognosis' label.
df = pd.read_csv('Training_clean.csv')

In [None]:
# Split the frame into the label column and the remaining feature columns.
y_raw = df['prognosis']
X = df.drop(columns='prognosis')

In [None]:
# Drop exact duplicate rows (features + label) before splitting, so identical
# samples cannot land on both sides of the train/test split.
# `df` already carries the 'prognosis' column, so the label does not need to
# be written back onto the frame first; `df` itself is left untouched.
unique_df = df.drop_duplicates()

print("Unique rows after dropping duplicates:", len(unique_df))


Unique rows after dropping duplicates: 304


In [None]:
from sklearn.model_selection import train_test_split

# Features / label taken from the de-duplicated frame only.
y_unique_raw = unique_df['prognosis']
X_unique = unique_df.drop(columns='prognosis')

# 80/20 split, stratified on the label so each disease keeps its share in
# both partitions; the fixed seed keeps the partition stable between runs.
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    X_unique,
    y_unique_raw,
    test_size=0.2,
    stratify=y_unique_raw,
    random_state=42,
)


In [None]:
# Summarise the full frame (before de-duplication): index range, column
# count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Columns: 133 entries, itching to prognosis
dtypes: int64(132), object(1)
memory usage: 5.0+ MB


In [None]:
# Leakage check: with no `on=` given, the inner join matches on every column
# the two frames share, so the result contains only feature rows present in
# both the training and the test partition.
overlap = pd.merge(X_train, X_test, how='inner')
print(len(overlap))


0


In [None]:
# MultiLabelBinarizer expects an iterable of label collections, so each single
# prognosis is wrapped in a one-element list before encoding.
mlb = MultiLabelBinarizer()
train_label_sets = [[disease] for disease in y_train_raw]
test_label_sets = [[disease] for disease in y_test_raw]

# Learn the label -> column mapping on the training labels only, then reuse
# that mapping for the test labels.
y_train = mlb.fit_transform(train_label_sets)
y_test = mlb.transform(test_label_sets)


In [None]:
# MultiOutputClassifier fits one copy of the base estimator per column of
# y_train, i.e. one random forest per binarised disease label.
# A fixed random_state makes the fitted forests, and every metric derived
# from them, reproducible across kernel restarts; 42 matches the seed used
# for the train/test split.
base_clf = RandomForestClassifier(random_state=42)
multi_label_clf = MultiOutputClassifier(base_clf)
multi_label_clf.fit(X_train, y_train)


In [None]:
import joblib

# Persist the fitted classifier together with its label binarizer; the
# binarizer holds the column -> disease-name mapping (mlb.classes_) needed to
# interpret the classifier's outputs.
model_bundle = (multi_label_clf, mlb)
joblib.dump(model_bundle, 'dup_model.pkl')

['dup_model.pkl']

In [None]:
# Predict the binarised label matrix for the held-out test split.
y_pred = multi_label_clf.predict(X_test)


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, hamming_loss

# Scores on the held-out split, computed on the binarised label matrices.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Hamming Loss:", hamming_loss(y_test, y_pred))

# The remaining scores are all micro-averaged over the label columns.
for metric_caption, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_caption, metric_fn(y_test, y_pred, average='micro'))

Accuracy: 0.9672131147540983
Hamming Loss: 0.0007996801279488205
Precision: 1.0
Recall: 0.9672131147540983
F1 Score: 0.9833333333333333


In [None]:
# Symptoms reported by the user; each is expected to match a feature column.
user_symptoms = ['sweating']

# Minimum positive-class probability for a disease to be reported.
PROBA_THRESHOLD = 0.1

# Surface symptoms that match no feature column instead of dropping them
# silently: a misspelt name would otherwise just produce an all-zero input.
unknown_symptoms = [symptom for symptom in user_symptoms if symptom not in X.columns]
if unknown_symptoms:
    print("Ignoring unknown symptoms:", unknown_symptoms)

# One-hot encode the known symptoms in training column order.
input_vector = [0] * len(X.columns)
for symptom in user_symptoms:
    if symptom in X.columns:
        input_vector[X.columns.get_loc(symptom)] = 1

# The model was fitted on a DataFrame, so predict with a one-row DataFrame
# carrying the same column names; a bare list makes scikit-learn warn about
# missing feature names once per underlying estimator.
input_frame = pd.DataFrame([input_vector], columns=X.columns)
probas = multi_label_clf.predict_proba(input_frame)

# probas is a list of arrays (one per disease), ordered like mlb.classes_.
predicted_diseases = []

for i, disease in enumerate(mlb.classes_):
    # Row 0 is the single input sample; column 1 is taken as the probability
    # of the positive class (assumes each per-disease estimator saw both 0
    # and 1 during training).
    p = probas[i][0][1]
    if p >= PROBA_THRESHOLD:
        predicted_diseases.append((disease, p))



In [None]:
# Show the (disease, probability) pairs that passed the probability threshold.
print(predicted_diseases)

[('Heart attack', np.float64(0.32))]


In [None]:
# Sanity check against the raw data: list every prognosis that occurs in at
# least one row where the 'sweating' symptom is set.
has_sweating = df['sweating'] == 1
possible_diseases = df.loc[has_sweating, 'prognosis'].unique()
print(possible_diseases)

['Malaria' 'Tuberculosis' 'Pneumonia' 'Heart attack' 'Hyperthyroidism'
 'Hypoglycemia']


# **Testing**

In [None]:
# Load the separate evaluation table from 'Testing.csv'.
test_df = pd.read_csv('Testing.csv')


In [None]:
# Separate the label column from the feature columns of the external set.
y_test_raw_external = test_df['prognosis']
X_test_external = test_df.drop(columns='prognosis')


In [None]:
# Encode the external labels with the already-fitted binarizer (transform
# only), wrapping each single prognosis in a one-element list as it expects.
external_label_sets = [[disease] for disease in y_test_raw_external]
y_test_external = mlb.transform(external_label_sets)


In [None]:
# Predict the binarised label matrix for the external evaluation set.
y_pred_external = multi_label_clf.predict(X_test_external)


In [None]:
from sklearn.metrics import accuracy_score, hamming_loss, f1_score

# Subset accuracy: fraction of rows whose entire label vector is predicted
# exactly (every column must match).
rows_fully_correct = np.all(y_pred_external == y_test_external, axis=1)
subset_acc = rows_fully_correct.mean()
print("External Subset Accuracy:", subset_acc)

# Hamming loss
print("External Hamming Loss:", hamming_loss(y_test_external, y_pred_external))

# Micro/Macro F1
for caption, averaging in (
    ("External Micro F1:", 'micro'),
    ("External Macro F1:", 'macro'),
):
    print(caption, f1_score(y_test_external, y_pred_external, average=averaging))


External Subset Accuracy: 0.9761904761904762
External Hamming Loss: 0.0005807200929152149
External Micro F1: 0.9879518072289156
External Macro F1: 0.991869918699187
