In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Load the data into a Pandas dataframe
data = pd.read_csv("Allopathic Dataset.csv")
symptoms_list = [symptom.lower().replace(' ', '') for symptom in data.columns.tolist() if symptom != 'Medicines']

# Split the data into input features (symptoms) and target variable (Medicines)
X = data.drop("Medicines", axis=1)
y = data["Medicines"]

FileNotFoundError: [Errno 2] No such file or directory: 'Allopathic Dataset.csv'

In [None]:
# Encode the categorical target variable `Medicines` into numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [None]:
label_map = {label: i for i, label in enumerate(label_encoder.classes_)}
print('Label Map:', label_map)

# print a table of labels and their corresponding encoded values
print('Label\tEncoded Value')
for label, value in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)):
    print('{}\t{}'.format(label, value))

Label Map: {'Aspirin': 0, 'Cetirizine': 1, 'Diphenhydramine': 2, 'Esomeprazole': 3, 'Fluticasone': 4, 'Ibuprofen': 5, 'Montelukast': 6, 'Naproxen': 7, 'Omeprazole': 8, 'Paracetamol': 9, 'Ranitidine': 10}
Label	Encoded Value
Aspirin	0
Cetirizine	1
Diphenhydramine	2
Esomeprazole	3
Fluticasone	4
Ibuprofen	5
Montelukast	6
Naproxen	7
Omeprazole	8
Paracetamol	9
Ranitidine	10


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate a DecisionTreeClassifier algorithm
model_DT = DecisionTreeClassifier(random_state=42)
model_RF = RandomForestClassifier(random_state=42)
model_KNN = KNeighborsClassifier(n_neighbors=3)

# Train the model on the training set
model_DT.fit(X_train, y_train.ravel())
model_RF.fit(X_train, y_train.ravel())
model_KNN.fit(X_train, y_train.ravel())

# Test the model on the testing set
y_pred_DT = model_DT.predict(X_test)
y_pred_RF = model_RF.predict(X_test)
y_pred_KNN = model_KNN.predict(X_test)

In [None]:
# pd.DataFrame(y_train).hist(bins = 40)
# plt.show()

In [None]:
# pd.DataFrame(y_test).hist(bins = 40)
# plt.show()

In [None]:
models = [model_DT, model_RF, model_KNN]
model_data = {model_DT : y_pred_DT, model_RF : y_pred_RF, model_KNN : y_pred_KNN}

for model in models:
    print("\nTrain Accuracy of Model {} = {}".format(model, sklearn.metrics.accuracy_score(y_train, model.predict(X_train))))
    print("Test Accuracy of Model {} = {}".format(model, sklearn.metrics.accuracy_score(y_test, model_data[model])))


Train Accuracy of Model DecisionTreeClassifier(random_state=42) = 0.5833333333333334
Test Accuracy of Model DecisionTreeClassifier(random_state=42) = 0.0

Train Accuracy of Model RandomForestClassifier(random_state=42) = 0.5833333333333334
Test Accuracy of Model RandomForestClassifier(random_state=42) = 0.10526315789473684

Train Accuracy of Model KNeighborsClassifier(n_neighbors=3) = 0.3194444444444444
Test Accuracy of Model KNeighborsClassifier(n_neighbors=3) = 0.05263157894736842


In [None]:
# model_DT.score(X_test, y_test)

0.1

In [None]:
# model.score(X_train, y_train)

0.3194444444444444

In [None]:
# sklearn.metrics.accuracy_score(y_train, model.predict(X_train))

0.5555555555555556

In [None]:
# Use the trained model to make predictions on new input symptoms
new = input("Enter symptoms separated by comma: ")
new = new.lower().replace(" ", "")
new_symptoms = [1 if symptom.strip() in new.split(',') else 0 for symptom in symptoms_list]

print(new_symptoms)
# print(len(new_symptoms))

# Predict the probabilities of each class using the `predict_proba()` method
predicted_probabilities_DT = model_DT.predict_proba([new_symptoms])[0]
predicted_probabilities_RF = model_RF.predict_proba([new_symptoms])[0]
predicted_probabilities_KNN = model_KNN.predict_proba([new_symptoms])[0]

# Sort the probabilities in descending order and get the top 3 predictions
top_3_predictions_DT = label_encoder.inverse_transform(predicted_probabilities_DT.argsort()[::-1][:3])
top_3_predictions_RF = label_encoder.inverse_transform(predicted_probabilities_RF.argsort()[::-1][:3])
top_3_predictions_KNN = label_encoder.inverse_transform(predicted_probabilities_KNN.argsort()[::-1][:3])


print("Top 3 predicted medicines - Decision Tree:", top_3_predictions_DT)
print("Top 3 predicted medicines - Random Forest:", top_3_predictions_RF)
print("Top 3 predicted medicines - KNN:", top_3_predictions_KNN)



Enter symptoms separated by comma: Sore Throat,Muscle Aches,Runny Nose,Sneezing,Nausea
[0, 0, 0, 1, 0, 1, 1, 1, 1, 0]
Top 3 predicted medicines - Decision Tree: ['Cetirizine' 'Ranitidine' 'Paracetamol']
Top 3 predicted medicines - Random Forest: ['Montelukast' 'Fluticasone' 'Ibuprofen']
Top 3 predicted medicines - KNN: ['Montelukast' 'Ibuprofen' 'Fluticasone']




In [None]:
model.predict()

In [None]:
print(label_encoder.classes_)

['Aspirin' 'Cetirizine' 'Diphenhydramine' 'Esomeprazole' 'Fluticasone'
 'Ibuprofen' 'Montelukast' 'Naproxen' 'Omeprazole' 'Paracetamol'
 'Ranitidine']


In [None]:
X_train

Unnamed: 0,Headache,Fever,Cough,Sore Throat,Fatigue,Muscle Aches,Runny Nose,Sneezing,Nausea,Vomiting
64,1,0,0,1,1,0,0,0,0,1
15,1,0,0,1,0,1,1,0,1,0
67,0,0,1,0,1,0,1,0,1,0
77,0,1,0,0,1,1,0,0,0,0
30,1,1,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
20,1,1,1,0,0,0,1,0,0,0
60,1,1,0,0,0,1,1,0,0,1
71,0,1,1,1,0,0,0,1,1,0
14,0,0,0,1,1,0,0,1,1,1


In [None]:
X_test

Unnamed: 0,Headache,Fever,Cough,Sore Throat,Fatigue,Muscle Aches,Runny Nose,Sneezing,Nausea,Vomiting
40,0,0,0,1,0,1,1,1,1,0
22,0,1,1,1,1,0,0,1,0,0
55,0,1,1,0,0,1,1,1,0,0
88,0,0,0,1,1,1,1,0,0,0
0,1,1,0,0,1,1,0,0,0,0
26,0,0,1,1,0,1,1,0,0,1
39,1,1,0,1,1,1,0,0,1,0
66,1,0,1,0,0,1,1,1,0,0
10,1,1,1,0,0,0,1,0,0,0
44,0,1,1,1,1,0,1,0,0,0


In [None]:
model.predict(X_test.iloc[0,:].values.reshape(1, -1))



array([1])

In [None]:
y_test[0]

7

In [None]:
y_train

array([ 4,  5,  4,  8,  2,  2,  5,  4,  4,  2,  8,  9,  4,  9,  7,  1,  5,
        1,  2,  9,  8,  0,  0,  8,  0,  6,  5,  0,  9,  5,  7,  9,  8,  9,
        3,  2,  8,  6,  3,  6,  1, 10,  1,  3,  6,  3,  8,  7,  6,  1,  8,
        6,  2,  3,  6,  4,  8,  7,  2,  9, 10,  0,  9,  0,  3,  8,  3,  0,
        6,  4,  5, 10])