In [None]:
!pip install tensorflow==2.12.0
!pip install datasets

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset

In [None]:
# Load the dataset from the Hugging Face Hub and pull out the two columns of
# its 'train' split that the rest of the notebook works with.
dataset = load_dataset('celikmus/mayo_clinic_symptoms_and_diseases_v1')
train_split = dataset['train']
texts, labels = train_split['text'], train_split['label']

Downloading readme:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/celikmus___parquet/celikmus--mayo_clinic_symptoms_and_diseases_v1-4e51adaf795407cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/626k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1058 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/celikmus___parquet/celikmus--mayo_clinic_symptoms_and_diseases_v1-4e51adaf795407cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

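El $\textit{tokenizer}$ se encarga de convertir las frases de entrada en secuencias de números enteros, uno por palabra. A cada palabra se le asigna un índice según su frecuencia de aparición en el texto: las palabras más frecuentes reciben los índices más bajos, y el índice 0 queda reservado para el relleno (*padding*).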

In [None]:
# Data preprocessing
# Build the word index from the corpus and map every text to a list of word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Encode the labels as consecutive integer class ids.
label_encoder = LabelEncoder()
label_sequences = label_encoder.fit_transform(labels)

# +1 because the Keras Tokenizer numbers words from 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1
# Number of output classes, taken from the fitted encoder so it always matches
# the ids produced above.
label_size = len(label_encoder.classes_)

# Pad every sequence to the length of the longest one (pad_sequences prepends
# zeros by default).
maxlen = max(len(x) for x in sequences)
sequences = pad_sequences(sequences, maxlen=maxlen)

# Split the data into training and test sets. The fixed random_state makes the
# split, and therefore every metric computed from it, reproducible across runs.
# NOTE(review): the recorded evaluation reports an accuracy of 0.0; if most
# labels occur only once, held-out labels are never seen during training.
# Check the label counts before trusting this split.
X_train, X_test, y_train, y_test = train_test_split(
    sequences, label_sequences, test_size=0.2, random_state=42
)


In [None]:
def build_model(vocab_size, label_size, embedding_dim, rnn_units, batch_size=None):
    """Build an (uncompiled) bidirectional-LSTM text classifier.

    Architecture: Embedding -> Bidirectional(LSTM) -> Dense(softmax).

    Args:
        vocab_size: Size of the embedding table (largest word id + 1).
        label_size: Number of output classes.
        embedding_dim: Dimensionality of the word embeddings.
        rnn_units: Number of units in each LSTM direction.
        batch_size: Optional fixed batch dimension baked into the input shape.
            The LSTM is not stateful, so this can be left as ``None`` to accept
            batches of any size (for example a single sample at inference).

    Returns:
        A ``Sequential`` model whose output is a probability distribution over
        ``label_size`` classes.
    """
    model = Sequential()
    # mask_zero=True makes the downstream LSTM skip timesteps whose id is 0,
    # i.e. the padding value, so padding does not influence the final states
    # (this matters most for the backward direction of the bidirectional LSTM).
    model.add(Embedding(vocab_size, embedding_dim, mask_zero=True,
                        batch_input_shape=[batch_size, None]))
    model.add(Bidirectional(LSTM(rnn_units, return_sequences=False, stateful=False, recurrent_initializer='glorot_uniform')))
    model.add(Dense(label_size, activation='softmax'))
    return model

# Seed Python's random module, NumPy and TensorFlow so that weight
# initialisation and batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Build the model
embedding_dim = 150
rnn_units = 64*2
batch_size = 32

model = build_model(vocab_size, label_size, embedding_dim, rnn_units, batch_size)

# Compile the model. The "sparse" cross-entropy variant is used because the
# targets are integer class ids rather than one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Fit the classifier on the training split for 10 passes over the data.
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    batch_size=batch_size,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f877e8ee380>

In [None]:
import numpy as np

# Score the model on the held-out test data; evaluate() returns the loss
# followed by the metrics requested at compile time.
loss, accuracy = model.evaluate(x=X_test, y=y_test)

# Perplexity, computed as the exponential of the reported loss.
perplexity = np.exp(loss)

print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")
print(f"Perplexity: {perplexity}")

Loss: 10.713113784790039
Accuracy: 0.0
Perplexity: 44941.358963486084


In [None]:
def predict_disease(model, tokenizer, label_encoder, symptoms_text, maxlen=None):
    """Predict the label for a single free-text description.

    Args:
        model: Object exposing ``predict(batch)`` that returns one row of class
            scores per input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.
        symptoms_text: The text to classify.
        maxlen: Optional sequence length used when the training data was
            padded. When given, the input is padded/truncated the same way as
            at training time (``pad_sequences`` defaults); pass the training
            ``maxlen`` so inference inputs look like training inputs. When
            ``None``, the sequence is fed at its natural length.

    Returns:
        The result of ``label_encoder.inverse_transform`` for the predicted
        class id: a one-element array-like holding the predicted label.

    Raises:
        ValueError: If none of the words in ``symptoms_text`` are known to the
            tokenizer, which would otherwise yield an empty model input.
    """
    # Convert the text into a sequence of word ids (unknown words are dropped
    # by the tokenizer, so the result may be empty).
    token_ids = tokenizer.texts_to_sequences([symptoms_text])
    if not token_ids or len(token_ids[0]) == 0:
        raise ValueError("symptoms_text contains no words known to the tokenizer")

    # Shape the input as a (1, timesteps) integer batch.
    if maxlen is not None:
        batch = pad_sequences(token_ids, maxlen=maxlen)
    else:
        batch = np.asarray(token_ids, dtype="int32")

    # Run the prediction.
    prediction = model.predict(batch)

    # Index of the highest-scoring class for the single input row.
    predicted_index = int(np.argmax(prediction, axis=-1)[0])

    # Map the class id back to its human-readable label.
    predicted_label = label_encoder.inverse_transform([predicted_index])

    return predicted_label

# Example query: classify a free-text symptom description and show the result.
symptoms_text = "The patient has a swollen mole that is itchy and reddish in color"
predicted_disease = predict_disease(
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    symptoms_text=symptoms_text,
)

print(f"The predicted disease is: {predicted_disease[0]}")

The predicted disease is: vaginitis


In [None]:

# Second example query, using a different symptom description.
symptoms_text = "The patient has fever, throat pain and headache."
predicted_disease = predict_disease(
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    symptoms_text=symptoms_text,
)

print(f"The predicted disease is: {predicted_disease[0]}")

The predicted disease is: dermatomyositis
