# There're some twists in this notebook, I tried to make it as beginner-friendly as possible. 
# If you find yourself feeling lost at any point, don't worry. Take a break, revisit the previous lines, and continue when you're ready.

In [1]:
import regex as re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MultiLabelBinarizer
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

2024-03-08 05:58:35.522363: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 05:58:35.522510: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 05:58:35.651269: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## First we load and view the data

In [2]:
main_data = pd.read_csv("/kaggle/input/disease-symptom-description-dataset/dataset.csv")

In [3]:
main_data.head(10)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
5,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
6,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
7,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
8,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
9,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,


In [4]:
main_data.sample(5)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
3294,Jaundice,itching,vomiting,fatigue,weight_loss,high_fever,yellowish_skin,dark_urine,abdominal_pain,,,,,,,,,
2611,Dimorphic hemmorhoids(piles),constipation,pain_during_bowel_movements,pain_in_anal_region,bloody_stool,irritation_in_anus,,,,,,,,,,,,
891,Diabetes,fatigue,weight_loss,restlessness,lethargy,irregular_sugar_level,blurred_and_distorted_vision,obesity,excessive_hunger,increased_appetite,polyuria,,,,,,,
1232,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
1412,Typhoid,chills,vomiting,fatigue,high_fever,headache,nausea,abdominal_pain,diarrhoea,toxic_look_(typhos),belly_pain,,,,,,,


In [5]:
main_data.shape

(4920, 18)

## Let's check if the data is balanced or not

In [6]:
main_data.Disease.value_counts()

Disease
Fungal infection                           120
Allergy                                    120
GERD                                       120
Chronic cholestasis                        120
Drug Reaction                              120
Peptic ulcer diseae                        120
AIDS                                       120
Diabetes                                   120
Gastroenteritis                            120
Bronchial Asthma                           120
Hypertension                               120
Migraine                                   120
Cervical spondylosis                       120
Paralysis (brain hemorrhage)               120
Jaundice                                   120
Malaria                                    120
Chicken pox                                120
Dengue                                     120
Typhoid                                    120
hepatitis A                                120
Hepatitis B                                120
Hepat

## There's two approaches to handle this type of data
###  Label_Encoding
###  One-Hot-Encoding-Style
## and we are going to discuss both

# Label_Encoding

#### We are going to label encode the Disease column first, then the rest

In [7]:
df = main_data.copy() # We take a copy of the original data incase we needed the original data later
df.dropna(axis=1, how='all', inplace=True) # Dropping rows which are all NaN
df.fillna(0, inplace=True)                 # Replacing the NaN with 0

# Creating a custom label encoder so we can specify which number the encoding starts from
class CustomLabelEncoder(LabelEncoder):
    def __init__(self, start=0):
        self.start = start
        super().__init__()

    def fit_transform(self, y):
        encoded = super().fit_transform(y)
        encoded += self.start
        return encoded

# Flatten the 'Disease' column into a single Series
flattened_series = df['Disease'].astype(str)

# Create and fit label encoder for the 'Disease' column
encoder = CustomLabelEncoder(start=200) # Here we tell the label encoder to start encoding from 200

*Why?* you might ask
Because if we just imported and fitted the usual label encoder, it will start indexing from 0.
*So?*
In the next step, we will label encoding the **rest** of the data, and that encoder will start from 0 to 131.
So we are trying to prevent different values from getting encoding the same way.

*BUT WHY ARE WE DOING THEM SEPARATLY?!* you might ask.
When I first wrote the code I thought this way would be easier than just encoding
the entire dataset, then separate the features from the targets in the label_mapping dictionary.

If you find this was complicated or impractical, that's okay, just label_encode the entire data then seperate the features from the labels. The end result will be the same: converting string into int

In [8]:
encoded_values = encoder.fit_transform(flattened_series)
df['Disease'] = encoded_values

mapping_data = {'label_encoder': encoder}

# Saving the mapping of the label column "Disease" to use later
label_mapping = {k: v for k, v in zip(mapping_data['label_encoder'].classes_, range(200, 200+len(mapping_data['label_encoder'].classes_)))}

df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,215,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,215,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,215,itching,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,215,itching,skin_rash,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,215,itching,skin_rash,nodal_skin_eruptions,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
label_mapping

{'(vertigo) Paroymsal  Positional Vertigo': 200,
 'AIDS': 201,
 'Acne': 202,
 'Alcoholic hepatitis': 203,
 'Allergy': 204,
 'Arthritis': 205,
 'Bronchial Asthma': 206,
 'Cervical spondylosis': 207,
 'Chicken pox': 208,
 'Chronic cholestasis': 209,
 'Common Cold': 210,
 'Dengue': 211,
 'Diabetes ': 212,
 'Dimorphic hemmorhoids(piles)': 213,
 'Drug Reaction': 214,
 'Fungal infection': 215,
 'GERD': 216,
 'Gastroenteritis': 217,
 'Heart attack': 218,
 'Hepatitis B': 219,
 'Hepatitis C': 220,
 'Hepatitis D': 221,
 'Hepatitis E': 222,
 'Hypertension ': 223,
 'Hyperthyroidism': 224,
 'Hypoglycemia': 225,
 'Hypothyroidism': 226,
 'Impetigo': 227,
 'Jaundice': 228,
 'Malaria': 229,
 'Migraine': 230,
 'Osteoarthristis': 231,
 'Paralysis (brain hemorrhage)': 232,
 'Peptic ulcer diseae': 233,
 'Pneumonia': 234,
 'Psoriasis': 235,
 'Tuberculosis': 236,
 'Typhoid': 237,
 'Urinary tract infection': 238,
 'Varicose veins': 239,
 'hepatitis A': 240}

#### Now we are going to use the label encoder to encode the rest of the data

In [10]:
# Stack the entire data into a single Series.
# We are stacking the entire data because there're similar values in different columns. **REMEMBER THIS**
encode_df = df.copy() # Again, taking a copy because we might need the original later.
encode_df = encode_df.drop(["Disease"], axis = 1)
flattened_series = encode_df.stack().astype(str)

# Create and fit label encoder.
encoder = LabelEncoder()
encoded_values = encoder.fit_transform(flattened_series)

# Reshape the encoded values back to the original DataFrame shape.
F_encoded_df = pd.DataFrame(encoded_values.reshape(encode_df.shape), columns=encode_df.columns,
                            index=encode_df.index)

# Store the mapping data for future use
Fmapping_data = {'label_encoder': encoder}
feature_mapping = {k: v for k, v in zip(Fmapping_data['label_encoder'].classes_, 
                                        Fmapping_data['label_encoder'].\
                                        transform(Fmapping_data['label_encoder'].classes_))}
F_encoded_df.head(3)

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,131,99,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130
1,99,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130,130
2,131,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130,130


In [11]:
feature_mapping

{' abdominal_pain': 0,
 ' abnormal_menstruation': 1,
 ' acidity': 2,
 ' acute_liver_failure': 3,
 ' altered_sensorium': 4,
 ' anxiety': 5,
 ' back_pain': 6,
 ' belly_pain': 7,
 ' blackheads': 8,
 ' bladder_discomfort': 9,
 ' blister': 10,
 ' blood_in_sputum': 11,
 ' bloody_stool': 12,
 ' blurred_and_distorted_vision': 13,
 ' breathlessness': 14,
 ' brittle_nails': 15,
 ' bruising': 16,
 ' burning_micturition': 17,
 ' chest_pain': 18,
 ' chills': 19,
 ' cold_hands_and_feets': 20,
 ' coma': 21,
 ' congestion': 22,
 ' constipation': 23,
 ' continuous_feel_of_urine': 24,
 ' continuous_sneezing': 25,
 ' cough': 26,
 ' cramps': 27,
 ' dark_urine': 28,
 ' dehydration': 29,
 ' depression': 30,
 ' diarrhoea': 31,
 ' dischromic _patches': 32,
 ' distention_of_abdomen': 33,
 ' dizziness': 34,
 ' drying_and_tingling_lips': 35,
 ' enlarged_thyroid': 36,
 ' excessive_hunger': 37,
 ' extra_marital_contacts': 38,
 ' family_history': 39,
 ' fast_heart_rate': 40,
 ' fatigue': 41,
 ' fluid_overload': 42,

In [12]:
label_encoded_df = pd.concat([df['Disease'], F_encoded_df], axis = 1)
label_encoded_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,215,131,99,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130
1,215,99,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130,130
2,215,131,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130,130
3,215,131,99,32,130,130,130,130,130,130,130,130,130,130,130,130,130,130
4,215,131,99,72,130,130,130,130,130,130,130,130,130,130,130,130,130,130


#### So now we have a dataset called **label_encoded_df** that has the same data as **main_data** dataset but label-encoded.
#### And we saved the mapping of the target column in a dict called *label_mapping*, and the mapping of the features in a dict called *feature_mapping*.

### Let's create and compile the model

In [13]:
# Creating X and y
model_features = label_encoded_df.columns.tolist()
model_features.remove("Disease")
X = label_encoded_df[model_features]
y = label_encoded_df["Disease"]

In [14]:
# One_hot_encoding the y column to use it as a multicalss in the model output layer
y_encoded = pd.get_dummies(y)
y_encoded.shape

(4920, 41)

In [15]:
# The column names are the mapping of the target column. **REMEMBER THIS**
y_encoded.head()

Unnamed: 0,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


#### We can't use the StandardScaler in the same manner like we usually do because this dataset has reccurenting, similar values in different columns, and StandardScaler apply the scaling column-wise.
#### So as we did earlier with the label_encoder when encoding the features, we are going to scale the entire X all at once.

In [16]:
# Reshape the data
X_reshaped = X.values.reshape(-1, 1)
scaler = StandardScaler().fit(X_reshaped)
X_scaled_reshaped = scaler.transform(X_reshaped)
# Reshape back to original shape
X_scaled = X_scaled_reshaped.reshape(X.shape)
X_df = pd.DataFrame(X_scaled)
X_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.720141,-0.051554,-0.702672,-1.667292,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026
1,-0.051554,-0.702672,-1.667292,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026
2,0.720141,-0.702672,-1.667292,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026
3,0.720141,-0.051554,-1.667292,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026
4,0.720141,-0.051554,-0.702672,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026


#### As you can see the NaN values that were encoded to 0 are now ALL scaled to 0.696026. If we applied the StandardScaler as we normally do, these 0 values in different columns would have different values after scaling.

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_df, y_encoded, test_size = 0.25, random_state=42)
X_eval, X_test, y_eval, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [18]:
X_train_tensor = tf.convert_to_tensor(X_train.values, dtype=tf.float32)
X_test_tensor = tf.convert_to_tensor(X_test.values, dtype=tf.float32)
X_eval_tensor = tf.convert_to_tensor(X_eval.values, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float64)
y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float64)
y_eval_tensor = tf.convert_to_tensor(y_eval, dtype=tf.float64)

In [19]:
X_train_tensor

<tf.Tensor: shape=(3690, 17), dtype=float32, numpy=
array([[ 0.67191035, -0.7509033 , -1.0161736 , ...,  0.69602585,
         0.69602585,  0.69602585],
       [ 0.72014135, -1.4502524 , -1.0402892 , ...,  0.69602585,
         0.69602585,  0.69602585],
       [ 0.454871  ,  0.14136969, -1.7396382 , ...,  0.69602585,
         0.69602585,  0.69602585],
       ...,
       [-0.0515542 , -1.980793  , -1.1126356 , ...,  0.69602585,
         0.69602585,  0.69602585],
       [ 0.72014135, -0.0515542 , -0.7026723 , ...,  0.69602585,
         0.69602585,  0.69602585],
       [ 0.72014135, -0.0515542 ,  0.1172542 , ...,  0.69602585,
         0.69602585,  0.69602585]], dtype=float32)>

In [20]:
y_train_tensor

<tf.Tensor: shape=(3690, 41), dtype=float64, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])>

In [21]:
with tf.device('/GPU:0'):
    model_1 = keras.Sequential([
        layers.Input(shape=(X_train_tensor.shape[1],)),
        layers.Dense(32, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(128, activation='tanh'),
        layers.BatchNormalization(),
        layers.Dense(128, activation='tanh'),
        layers.Dropout(0.1),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(y_train_tensor.shape[1], activation='softmax')])
    
    model_1.compile(optimizer= 'adam', loss='categorical_crossentropy', metrics=['accuracy'])
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=4, mode='max')
    history = model_1.fit(X_train_tensor, y_train_tensor, epochs=500, callbacks=[early_stopping],
                batch_size=16, validation_data=(X_eval_tensor, y_eval_tensor))

Epoch 1/500


I0000 00:00:1709877533.123715      69 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500


In [22]:
model_1.evaluate(X_test_tensor, y_test_tensor)



[0.0255840215831995, 0.9804878234863281]

### Looks great so far...99% accuracy
## LET'S TEST IT MANUALLY

In [23]:
def encode_user_input(user_input, mapping=feature_mapping):
    '''
    This function takes user input and transform it to the same encoding 
    the original data, which the model was trained on, has.

    Args:
        user_input (str): The user input.
        mapping (dict): The mapping the label_encoder used earlier.

    Returns:
        str: encoded user input.
    '''
    encoded_input = []
    for symptom in user_input:
        for key in mapping.keys():
            if symptom.strip().lower() == key.strip().lower():
                encoded_input.append(mapping[key])
                break  # Break out of inner loop if a match is found
    return encoded_input

In [24]:
# let's take a random row from the original data.
user_input = ['itching','skin_rash','nodal_skin_eruptions','dischromic _patches']
# This row should result in "Fungal infection".
encoded_input = encode_user_input(user_input)
encoded_input

[131, 99, 72, 32]

In [25]:
# Transforming the encoded user input to a tensor.
input_tensor = tf.cast(encoded_input, tf.float32)
input_tensor

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([131.,  99.,  72.,  32.], dtype=float32)>

In [26]:
# Checking the number of dimensions.
input_tensor.ndim == X_train_tensor[1].ndim

True

### Let's check if the data is encoded in the same way the original data was

In [27]:
label_encoded_df.iloc[0][1:5]

Symptom_1    131
Symptom_2     99
Symptom_3     72
Symptom_4     32
Name: 0, dtype: int64

### Great, here is the entire row

In [28]:
label_encoded_df.head(1)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,215,131,99,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130


### itching, skin_rash,..., then all NaN, or 130 after the label_encoding. So we need to 'pad' the user input to match the original data

In [29]:
padding_value = tf.constant(130, dtype=tf.float32)
desired_length = X_train_tensor[1].shape[0]
padding_length = desired_length - tf.shape(input_tensor)[0]
padding_tensor = tf.fill((padding_length,), padding_value)
final_input = tf.concat([input_tensor, padding_tensor], axis=0)
final_input

<tf.Tensor: shape=(17,), dtype=float32, numpy=
array([131.,  99.,  72.,  32., 130., 130., 130., 130., 130., 130., 130.,
       130., 130., 130., 130., 130., 130.], dtype=float32)>

In [30]:
target_index = y_encoded.columns.tolist() # If you remember, the column names after the one-hot-encoding ARE the mapping of the target values.

### Scaling the user input:

In [31]:
final_array = final_input.numpy()
final_reshaped = final_array.reshape(-1, 1)
X_scaled = scaler.transform(final_reshaped)
final_tensor = tf.convert_to_tensor(X_scaled)
final_tensor = tf.squeeze(final_tensor)
final_tensor

<tf.Tensor: shape=(17,), dtype=float32, numpy=
array([ 0.72014135, -0.05155421, -0.7026723 , -1.6672916 ,  0.69602585,
        0.69602585,  0.69602585,  0.69602585,  0.69602585,  0.69602585,
        0.69602585,  0.69602585,  0.69602585,  0.69602585,  0.69602585,
        0.69602585,  0.69602585], dtype=float32)>

In [32]:
X_df.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.720141,-0.051554,-0.702672,-1.667292,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026,0.696026


### And finally using the trained model to predict the user input

In [33]:
import numpy as np
predict_proba = model_1.predict(tf.expand_dims(final_input, axis = 0)) # Expanding dims to get (1,17)
predicted_class_index = np.argmax(predict_proba) # Getting the 'index' of our prediction
prediction_encode = target_index[predicted_class_index] # Getting to mapping of that 'index' using y column names
inverse_label_encoding = {v: k for k, v in label_mapping.items()} # Inverse the label encoding
prediction = inverse_label_encoding[prediction_encode]
prediction



'Paralysis (brain hemorrhage)'

### This should've been 'Fungal infection'.
### Although getting 99% accuracy, looks like our model behaves poorly...

## Let's try another approach

# One-Hot-Encoding-style

## As you remember, this is how our original data looks like

In [34]:
main_data.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


## Let's try to "one-hot-encode" this

In [35]:
df = main_data.copy() # As usual, taking a copy from that data incase we needed the original later
# Combine all symptom columns into a single column
df['All Symptoms'] = df.apply(lambda row: ','.join(row.dropna()), axis=1)
# Drop duplicate symptoms within each cell
df['All Symptoms'] = df['All Symptoms'].apply(lambda x: ','.join(sorted(set(x.split(','))) if x else ''))
stay_cols= ['Disease', 'All Symptoms']
df = df[stay_cols]
df.head()

Unnamed: 0,Disease,All Symptoms
0,Fungal infection,"dischromic _patches, nodal_skin_eruptions, sk..."
1,Fungal infection,"dischromic _patches, nodal_skin_eruptions, sk..."
2,Fungal infection,"dischromic _patches, nodal_skin_eruptions,Fun..."
3,Fungal infection,"dischromic _patches, skin_rash,Fungal infecti..."
4,Fungal infection,"nodal_skin_eruptions, skin_rash,Fungal infect..."


In [36]:
df['All Symptoms'][0]

' dischromic _patches, nodal_skin_eruptions, skin_rash,Fungal infection,itching'

### Great, let's also remove the '_'
#### if you notice, there's a "Fungal infection" in that row. We will fix that later

In [37]:
def strip_to_basic_tokens(text):
    # Remove doble spaces and underscores
    text = re.sub(r'[_\s]+', ' ', text)
    # Split by commas and lowercase the tokens
    tokens = [token.strip().lower() for token in text.split(',')]
    return tokens

# Apply the function to 'All Symptoms' column
df['Basic Tokens'] = df['All Symptoms'].apply(strip_to_basic_tokens)
df['Basic Tokens'] = df['Basic Tokens'].apply(lambda x: ', '.join(x))
df = df.drop(['All Symptoms'], axis = 1)
df.head()

Unnamed: 0,Disease,Basic Tokens
0,Fungal infection,"dischromic patches, nodal skin eruptions, skin..."
1,Fungal infection,"dischromic patches, nodal skin eruptions, skin..."
2,Fungal infection,"dischromic patches, nodal skin eruptions, fung..."
3,Fungal infection,"dischromic patches, skin rash, fungal infectio..."
4,Fungal infection,"nodal skin eruptions, skin rash, fungal infect..."


In [38]:
df['Basic Tokens'][0]

'dischromic patches, nodal skin eruptions, skin rash, fungal infection, itching'

### Looking good, now let's "one-hot-encode" it using Multi-Label Binarizer

In [39]:
dfE = df.copy() # Taking a copy because we never know what might happen
dfE['Basic Tokens'] = dfE['Basic Tokens'].apply(lambda x: x.split(', '))

mlb = MultiLabelBinarizer()
# Fit and transform the 'Basic Tokens' column
one_hot_encoded = pd.DataFrame(mlb.fit_transform(dfE['Basic Tokens']), columns=mlb.classes_, index=df.index)

# Concatenate the one-hot encoded DataFrame with the original DataFrame
df_encoded = pd.concat([dfE, one_hot_encoded], axis=1)

# Drop the 'Basic Tokens' column
df_encoded = df_encoded.drop(columns=['Basic Tokens'])
df_encoded.head()

Unnamed: 0,Disease,(vertigo) paroymsal positional vertigo,abdominal pain,abnormal menstruation,acidity,acne,acute liver failure,aids,alcoholic hepatitis,allergy,altered sensorium,anxiety,arthritis,back pain,belly pain,blackheads,bladder discomfort,blister,blood in sputum,bloody stool,blurred and distorted vision,breathlessness,brittle nails,bronchial asthma,bruising,burning micturition,cervical spondylosis,chest pain,chicken pox,chills,chronic cholestasis,cold hands and feets,coma,common cold,congestion,constipation,continuous feel of urine,continuous sneezing,cough,cramps,dark urine,dehydration,dengue,depression,diabetes,diarrhoea,dimorphic hemmorhoids(piles),dischromic patches,distention of abdomen,dizziness,drug reaction,drying and tingling lips,enlarged thyroid,excessive hunger,extra marital contacts,family history,fast heart rate,fatigue,fluid overload,foul smell of urine,fungal infection,gastroenteritis,gerd,headache,heart attack,hepatitis a,hepatitis b,hepatitis c,hepatitis d,hepatitis e,high fever,hip joint pain,history of alcohol consumption,hypertension,hyperthyroidism,hypoglycemia,hypothyroidism,impetigo,increased appetite,indigestion,inflammatory nails,internal itching,irregular sugar level,irritability,irritation in anus,itching,jaundice,joint pain,knee pain,lack of concentration,lethargy,loss of appetite,loss of balance,loss of smell,malaise,malaria,migraine,mild fever,mood swings,movement stiffness,mucoid sputum,muscle pain,muscle wasting,muscle weakness,nausea,neck pain,nodal skin eruptions,obesity,osteoarthristis,pain behind the eyes,pain during bowel movements,pain in anal region,painful walking,palpitations,paralysis (brain hemorrhage),passage of gases,patches in throat,peptic ulcer diseae,phlegm,pneumonia,polyuria,prominent veins on calf,psoriasis,puffy face and eyes,pus filled pimples,receiving blood transfusion,receiving unsterile injections,red sore around nose,red spots over body,redness of eyes,restlessness,runny nose,rusty sputum,scurring,shivering,silver like dusting,sinus pressure,skin peeling,skin rash,slurred speech,small dents in nails,spinning movements,spotting urination,stiff neck,stomach bleeding,stomach pain,sunken eyes,sweating,swelled lymph nodes,swelling joints,swelling of stomach,swollen blood vessels,swollen extremeties,swollen legs,throat irritation,toxic look (typhos),tuberculosis,typhoid,ulcers on tongue,unsteadiness,urinary tract infection,varicose veins,visual disturbances,vomiting,watering from eyes,weakness in limbs,weakness of one body side,weight gain,weight loss,yellow crust ooze,yellow urine,yellowing of eyes,yellowish skin
0,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
df_encoded.shape

(4920, 173)

### Now let's drop the diseases column values that got encoded in the column names:

In [41]:
disease_names = [key for key in label_mapping.keys()]
diseases = [strip_to_basic_tokens(disease) for disease in disease_names]
diseases_cleaned = [item[0] if isinstance(item, list) else item for item in diseases]
df_encoded = df_encoded.drop(diseases_cleaned, axis = 1)
df_encoded.shape

(4920, 132)

## Now we will create and compile a model the same way we did earlier

In [42]:
model_features = df_encoded.columns.tolist()
model_features.remove("Disease")
X = df_encoded[model_features]
y = df_encoded["Disease"]

In [43]:
y_encoded = pd.get_dummies(y)
y_encoded.shape

(4920, 41)

In [44]:
y_encoded.head()

Unnamed: 0,(vertigo) Paroymsal Positional Vertigo,AIDS,Acne,Alcoholic hepatitis,Allergy,Arthritis,Bronchial Asthma,Cervical spondylosis,Chicken pox,Chronic cholestasis,Common Cold,Dengue,Diabetes,Dimorphic hemmorhoids(piles),Drug Reaction,Fungal infection,GERD,Gastroenteritis,Heart attack,Hepatitis B,Hepatitis C,Hepatitis D,Hepatitis E,Hypertension,Hyperthyroidism,Hypoglycemia,Hypothyroidism,Impetigo,Jaundice,Malaria,Migraine,Osteoarthristis,Paralysis (brain hemorrhage),Peptic ulcer diseae,Pneumonia,Psoriasis,Tuberculosis,Typhoid,Urinary tract infection,Varicose veins,hepatitis A
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size = 0.25, random_state=42)
X_eval, X_test, y_eval, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [46]:
X_train_tensor = tf.convert_to_tensor(X_train.values, dtype=tf.float32)
X_test_tensor = tf.convert_to_tensor(X_test.values, dtype=tf.float32)
X_eval_tensor = tf.convert_to_tensor(X_eval.values, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float64)
y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float64)
y_eval_tensor = tf.convert_to_tensor(y_eval, dtype=tf.float64)

In [47]:
X_train_tensor

<tf.Tensor: shape=(3690, 131), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 1., 1.],
       [1., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [48]:
with tf.device('/GPU:0'):
    model_2 = keras.Sequential([
        layers.Input(shape=(X_train_tensor.shape[1],)),
        layers.Dense(160, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(200, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(240, activation='tanh'),
        layers.BatchNormalization(),
        layers.Dense(240, activation='tanh'),
        layers.Dropout(0.2),
        layers.Dense(200, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(160, activation='relu'),
        layers.Dense(y_train_tensor.shape[1], activation='softmax')])
    
    model_2.compile(optimizer= 'adam', loss='categorical_crossentropy', metrics=['accuracy'])
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=4, mode='max')
    history = model_2.fit(X_train_tensor, y_train_tensor, epochs=500, callbacks=[early_stopping],
                batch_size=16, validation_data=(X_eval_tensor, y_eval_tensor))

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500


In [49]:
model_2.evaluate(X_test_tensor, y_test_tensor)



[3.106420990661718e-05, 1.0]

## Great! 100% accuracy and a 7.9e-04 loss! Now let's test i manually:

In [50]:
# If you remember in the first model, we took a row from the origial data to test the model
# We aren't going to do this here, let's REALLY test it
user_input = ['stomach_pain','acidity','chest_pain'] # This should be GERD

original_data = df_encoded.copy()

# We will change the strip_to_basic_tokens function just a little bit to be able to deal with the user input
def strip_to_basic_tokens(symptoms):
    symptoms = [symptom.strip().lower().replace(' ', '_').replace('_', ' ') for symptom in symptoms]
    return [re.sub(r'\s+', ' ', symptom) for symptom in symptoms]
# Apply strip_to_basic_tokens function to user input
user_input_stripped = strip_to_basic_tokens(user_input)

# Initialize MultiLabelBinarizer with all symptoms
mlb = MultiLabelBinarizer(classes=df_encoded.columns)

# Fit and transform user input
user_input_encoded = pd.DataFrame(mlb.fit_transform([user_input_stripped]), columns=mlb.classes_)

# Concatenate user input with original data
final_user_input = pd.concat([pd.DataFrame(columns=original_data.columns), user_input_encoded], axis=0)
final_user_input = final_user_input.drop(['Disease'],axis = 1)
# Print the final user input shape
final_user_input.head()

Unnamed: 0,abdominal pain,abnormal menstruation,acidity,acute liver failure,altered sensorium,anxiety,back pain,belly pain,blackheads,bladder discomfort,blister,blood in sputum,bloody stool,blurred and distorted vision,breathlessness,brittle nails,bruising,burning micturition,chest pain,chills,cold hands and feets,coma,congestion,constipation,continuous feel of urine,continuous sneezing,cough,cramps,dark urine,dehydration,depression,diarrhoea,dischromic patches,distention of abdomen,dizziness,drying and tingling lips,enlarged thyroid,excessive hunger,extra marital contacts,family history,fast heart rate,fatigue,fluid overload,foul smell of urine,headache,high fever,hip joint pain,history of alcohol consumption,increased appetite,indigestion,inflammatory nails,internal itching,irregular sugar level,irritability,irritation in anus,itching,joint pain,knee pain,lack of concentration,lethargy,loss of appetite,loss of balance,loss of smell,malaise,mild fever,mood swings,movement stiffness,mucoid sputum,muscle pain,muscle wasting,muscle weakness,nausea,neck pain,nodal skin eruptions,obesity,pain behind the eyes,pain during bowel movements,pain in anal region,painful walking,palpitations,passage of gases,patches in throat,phlegm,polyuria,prominent veins on calf,puffy face and eyes,pus filled pimples,receiving blood transfusion,receiving unsterile injections,red sore around nose,red spots over body,redness of eyes,restlessness,runny nose,rusty sputum,scurring,shivering,silver like dusting,sinus pressure,skin peeling,skin rash,slurred speech,small dents in nails,spinning movements,spotting urination,stiff neck,stomach bleeding,stomach pain,sunken eyes,sweating,swelled lymph nodes,swelling joints,swelling of stomach,swollen blood vessels,swollen extremeties,swollen legs,throat irritation,toxic look (typhos),ulcers on tongue,unsteadiness,visual disturbances,vomiting,watering from eyes,weakness in limbs,weakness of one body side,weight gain,weight loss,yellow crust ooze,yellow urine,yellowing of eyes,yellowish skin
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Great! Now the user input looks exactly like the df_encoded data.

In [51]:
user_tensor = tf.convert_to_tensor(final_user_input.values, dtype=tf.float32)
user_tensor[0]

<tf.Tensor: shape=(131,), dtype=float32, numpy=
array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>

### After converting the user input to a tensor, we'll utilize the model to predict the disease the user may have:

In [52]:
predict_proba = model_2.predict(user_tensor)
predicted_class_index = np.argmax(predict_proba)
prediction_encode = target_index[predicted_class_index]
inverse_label_encoding = {v: k for k, v in label_mapping.items()}
prediction = inverse_label_encoding[prediction_encode]
prediction



'GERD'

## WOOHOOO! The model is performing as expected. A 100% accuracy model

### Let's test it again

In [53]:
user_input = ['continuous_sneezing','watering_from_eyes'] # This should be Allergy

original_data = df_encoded.copy()

# Apply strip_to_basic_tokens function to user input
user_input_stripped = strip_to_basic_tokens(user_input)

# Fit and transform user input
user_input_encoded = pd.DataFrame(mlb.fit_transform([user_input_stripped]), columns=mlb.classes_)

# Concatenate user input with original data
final_user_input = pd.concat([pd.DataFrame(columns=original_data.columns), user_input_encoded], axis=0)
final_user_input = final_user_input.drop(['Disease'],axis = 1)
# Print the final user input shape
final_user_input.head()

Unnamed: 0,abdominal pain,abnormal menstruation,acidity,acute liver failure,altered sensorium,anxiety,back pain,belly pain,blackheads,bladder discomfort,blister,blood in sputum,bloody stool,blurred and distorted vision,breathlessness,brittle nails,bruising,burning micturition,chest pain,chills,cold hands and feets,coma,congestion,constipation,continuous feel of urine,continuous sneezing,cough,cramps,dark urine,dehydration,depression,diarrhoea,dischromic patches,distention of abdomen,dizziness,drying and tingling lips,enlarged thyroid,excessive hunger,extra marital contacts,family history,fast heart rate,fatigue,fluid overload,foul smell of urine,headache,high fever,hip joint pain,history of alcohol consumption,increased appetite,indigestion,inflammatory nails,internal itching,irregular sugar level,irritability,irritation in anus,itching,joint pain,knee pain,lack of concentration,lethargy,loss of appetite,loss of balance,loss of smell,malaise,mild fever,mood swings,movement stiffness,mucoid sputum,muscle pain,muscle wasting,muscle weakness,nausea,neck pain,nodal skin eruptions,obesity,pain behind the eyes,pain during bowel movements,pain in anal region,painful walking,palpitations,passage of gases,patches in throat,phlegm,polyuria,prominent veins on calf,puffy face and eyes,pus filled pimples,receiving blood transfusion,receiving unsterile injections,red sore around nose,red spots over body,redness of eyes,restlessness,runny nose,rusty sputum,scurring,shivering,silver like dusting,sinus pressure,skin peeling,skin rash,slurred speech,small dents in nails,spinning movements,spotting urination,stiff neck,stomach bleeding,stomach pain,sunken eyes,sweating,swelled lymph nodes,swelling joints,swelling of stomach,swollen blood vessels,swollen extremeties,swollen legs,throat irritation,toxic look (typhos),ulcers on tongue,unsteadiness,visual disturbances,vomiting,watering from eyes,weakness in limbs,weakness of one body side,weight gain,weight loss,yellow crust ooze,yellow urine,yellowing of eyes,yellowish skin
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [54]:
user_tensor = tf.convert_to_tensor(final_user_input.values, dtype=tf.float32)
user_tensor[0]

<tf.Tensor: shape=(131,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>

In [55]:
predict_proba = model_2.predict(user_tensor)
predicted_class_index = np.argmax(predict_proba)
prediction_encode = target_index[predicted_class_index]
inverse_label_encoding = {v: k for k, v in label_mapping.items()}
prediction = inverse_label_encoding[prediction_encode]
prediction



'Allergy'

## So unlike model_1 that was trained on the label_encoded data, model_2 is actually behaving as it should.

*What to do now?* Well, you can test the following:
* Check if the symptom_severity has any significance when applied to the data.
* Try different model architecture.
* Try different approaches to prepare the data.
* Just **have fun**.

## There's a different approach that will be appropriate to use here, NLP.
## But maybe in a future notebook...