In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier




In [1]:
import pandas as pd

df = pd.read_csv('../data/df_final.csv')
df


Unnamed: 0,Patient_ID,Age,Gender,Sleep_Disorder_Type,AHI_Score,SaO2_Level,OCR_Extracted_Text,Diagnosis_Confirmed,OCR_Text_Limpio,OCR_Text_Tokens,...,disorder,insomnia,leg,narcolepsy,obstructive,patient,restless,sleep,syndrome,with
0,P0001,54,Female,Restless Leg Syndrome,44.11,77.59,Patient P0001 diagnosed with Restless Leg Synd...,1,patient p diagnosed with restless leg syndrome,"['patient', 'p', 'diagnosed', 'with', 'restles...",...,0.000000,0.000000,0.538616,0.0,0.000000,0.207909,0.538616,0.000000,0.538616,0.207909
1,P0002,17,Female,Restless Leg Syndrome,33.55,76.22,Patient P0002 diagnosed with Restless Leg Synd...,1,patient p diagnosed with restless leg syndrome,"['patient', 'p', 'diagnosed', 'with', 'restles...",...,0.000000,0.000000,0.538616,0.0,0.000000,0.207909,0.538616,0.000000,0.538616,0.207909
2,P0003,74,Male,Insomnia,12.91,89.53,Patient P0003 diagnosed with Insomnia.,1,patient p diagnosed with insomnia,"['patient', 'p', 'diagnosed', 'with', 'insomnia']",...,0.000000,0.806607,0.000000,0.0,0.000000,0.341265,0.000000,0.000000,0.000000,0.341265
3,P0004,63,Male,Restless Leg Syndrome,33.73,85.65,Patient P0004 diagnosed with Restless Leg Synd...,1,patient p diagnosed with restless leg syndrome,"['patient', 'p', 'diagnosed', 'with', 'restles...",...,0.000000,0.000000,0.538616,0.0,0.000000,0.207909,0.538616,0.000000,0.538616,0.207909
4,P0005,23,Female,Insomnia,14.84,88.70,Patient P0005 diagnosed with Insomnia.,1,patient p diagnosed with insomnia,"['patient', 'p', 'diagnosed', 'with', 'insomnia']",...,0.000000,0.806607,0.000000,0.0,0.000000,0.341265,0.000000,0.000000,0.000000,0.341265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,P0996,23,Female,Insomnia,41.57,83.14,Patient P0996 diagnosed with Insomnia.,1,patient p diagnosed with insomnia,"['patient', 'p', 'diagnosed', 'with', 'insomnia']",...,0.000000,0.806607,0.000000,0.0,0.000000,0.341265,0.000000,0.000000,0.000000,0.341265
996,P0997,49,Female,No Disorder,42.12,84.85,Patient P0997 diagnosed with No Disorder.,1,patient p diagnosed with disorder,"['patient', 'p', 'diagnosed', 'with', 'disorder']",...,0.895138,0.000000,0.000000,0.0,0.000000,0.257376,0.000000,0.000000,0.000000,0.257376
997,P0998,75,Male,Obstructive Sleep Apnea,18.66,90.08,Patient P0998 diagnosed with Obstructive Sleep...,1,patient p diagnosed with obstructive sleep apnea,"['patient', 'p', 'diagnosed', 'with', 'obstruc...",...,0.000000,0.000000,0.000000,0.0,0.524444,0.241437,0.000000,0.524444,0.000000,0.241437
998,P0999,55,Female,Insomnia,31.60,95.04,Patient P0999 diagnosed with Insomnia.,1,patient p diagnosed with insomnia,"['patient', 'p', 'diagnosed', 'with', 'insomnia']",...,0.000000,0.806607,0.000000,0.0,0.000000,0.341265,0.000000,0.000000,0.000000,0.341265


In [3]:
# Codificar la columna 'Gender'
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})

# Codificar la variable target multicategoría
le = LabelEncoder()
df['Sleep_Disorder_Label'] = le.fit_transform(df['Sleep_Disorder_Type'])

# Separar características (X) y etiquetas
X = df[['Age', 'Gender', 'AHI_Score', 'SaO2_Level']]
y_binary = df['Diagnosis_Confirmed']                 # si tiene o no trastorno
y_multiclass = df['Sleep_Disorder_Label']            # tipo de trastorno

In [4]:
# División del conjunto de datos en entrenamiento y prueba

# División para modelo binario
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X, y_binary, test_size=0.25, random_state=42)

# División para modelo multiclase (solo donde sí hay diagnóstico confirmado)
X_multi = df[df['Diagnosis_Confirmed'] == 1][['Age', 'Gender', 'AHI_Score', 'SaO2_Level']]
y_multi = df[df['Diagnosis_Confirmed'] == 1]['Sleep_Disorder_Label']
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_multi, y_multi, test_size=0.25, random_state=42)


In [5]:
# Modelo para clasificación binaria
model_binary = RandomForestClassifier(random_state=42)
model_binary.fit(X_train_b, y_train_b)

# Modelo para clasificación multiclase
model_multi = RandomForestClassifier(random_state=42)
model_multi.fit(X_train_m, y_train_m)


### Evaluación Modelo de predicción

In [9]:
# Evaluación modelo binario
print("🎯 Diagnóstico Confirmado (Sí/No):")
print(classification_report(y_test_b, model_binary.predict(X_test_b)))

🎯 Diagnóstico Confirmado (Sí/No):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00       222

    accuracy                           1.00       250
   macro avg       1.00      1.00      1.00       250
weighted avg       1.00      1.00      1.00       250



In [7]:
# Evaluación modelo multiclase
print("🧠 Tipo de Trastorno (solo si tiene diagnóstico):")
print(classification_report(y_test_m, model_multi.predict(X_test_m), target_names=le.classes_))

🧠 Tipo de Trastorno (solo si tiene diagnóstico):
                         precision    recall  f1-score   support

               Insomnia       0.25      0.37      0.30        52
             Narcolepsy       0.14      0.12      0.13        32
            No Disorder       0.00      0.00      0.00        16
Obstructive Sleep Apnea       0.35      0.36      0.35        73
  Restless Leg Syndrome       0.32      0.23      0.27        48

               accuracy                           0.27       221
              macro avg       0.21      0.22      0.21       221
           weighted avg       0.27      0.27      0.26       221



#### XGBoost

In [None]:
# Codificación y limpieza
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})
df['Sleep_Disorder_Label'] = LabelEncoder().fit_transform(df['Sleep_Disorder_Type'])

# Seleccionar características
features = ['Age', 'Gender', 'AHI_Score', 'SaO2_Level']
X_all = df[df['Diagnosis_Confirmed'] == 1][features]
y_all = df[df['Diagnosis_Confirmed'] == 1]['Sleep_Disorder_Label']

# Escalar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_all)

# División del dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_all, test_size=0.25, random_state=42, stratify=y_all)

In [16]:
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)

xgb_model.fit(X_train_m, y_train_m)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [19]:
from sklearn.metrics import accuracy_score

y_pred = xgb_model.predict(X_test_m)
print("Accuracy:", accuracy_score(y_test_m, y_pred))
print(classification_report(y_test_m, y_pred))


Accuracy: 0.24434389140271492
              precision    recall  f1-score   support

           0       0.20      0.25      0.22        52
           1       0.12      0.09      0.11        32
           2       0.00      0.00      0.00        16
           3       0.32      0.38      0.35        73
           4       0.26      0.21      0.23        48

    accuracy                           0.24       221
   macro avg       0.18      0.19      0.18       221
weighted avg       0.23      0.24      0.23       221



#### Predicción individual

In [24]:
def predecir_tipo_xgb(row):
    input_data = scaler.transform([row[features]])
    clase = xgb_model.predict(input_data)[0]
    return clase


In [25]:
# Escalar los datos antes de predecir
from sklearn.preprocessing import StandardScaler

# Ensure scaler is defined
scaler = StandardScaler()
scaler.fit(df[['Age', 'Gender', 'AHI_Score', 'SaO2_Level']])

input_data = scaler.transform([df.iloc[0][['Age', 'Gender', 'AHI_Score', 'SaO2_Level']]])
predicted_label = xgb_model.predict(input_data)[0]

# Inversión del label encoding
nombre = le.inverse_transform([predicted_label])[0]
print("Predicción avanzada:", nombre)


Predicción avanzada: Restless Leg Syndrome




### CV

### pruebas

In [33]:
def predecir_trastorno(row):
    input_data = pd.DataFrame([row[['Age', 'Gender', 'AHI_Score', 'SaO2_Level']]])
    tiene_trastorno = model_binary.predict(input_data)[0]

    if tiene_trastorno == 1:
        tipo_trastorno = model_multi.predict(input_data)[0]
        nombre_trastorno = le.inverse_transform([tipo_trastorno])[0]
        return f"Tiene trastorno del sueño: {nombre_trastorno}"
    else:
        return "No tiene trastorno del sueño"



In [34]:
resultado = predecir_trastorno(df.iloc[0])
print(resultado)


Tiene trastorno del sueño: Restless Leg Syndrome


---


#### MLP (Multilayer Perceptron)

In [36]:
%pip install tensorflow

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report

# 1. Cargar datos
df = pd.read_csv("df_final.csv")

# 2. Preprocesamiento
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})
features = ['Age', 'Gender', 'AHI_Score', 'SaO2_Level']
df = df[df['Diagnosis_Confirmed'] == 1]  # Solo si hay diagnóstico

X = df[features]
y = df['Sleep_Disorder_Type']

# Codificar variable objetivo
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_cat = to_categorical(y_encoded)  # Necesario para softmax

# Escalar
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_cat, test_size=0.2, stratify=y_cat, random_state=42)


Note: you may need to restart the kernel to use updated packages.


---

In [37]:
from tensorflow.keras.optimizers import Adam

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(y_cat.shape[1], activation='softmax')  # tantas salidas como clases
])

model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=16)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.2216 - loss: 1.6082 - val_accuracy: 0.3390 - val_loss: 1.5494
Epoch 2/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3295 - loss: 1.5372 - val_accuracy: 0.3220 - val_loss: 1.5390
Epoch 3/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2743 - loss: 1.5246 - val_accuracy: 0.3051 - val_loss: 1.5416
Epoch 4/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2684 - loss: 1.5484 - val_accuracy: 0.3333 - val_loss: 1.5452
Epoch 5/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3201 - loss: 1.5316 - val_accuracy: 0.3107 - val_loss: 1.5498
Epoch 6/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3126 - loss: 1.5407 - val_accuracy: 0.2768 - val_loss: 1.5450
Epoch 7/50
[1m45/45[0m [32m━━━━━━━━━━

In [38]:
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test, axis=1)

print(classification_report(y_true_labels, y_pred_labels, target_names=le.classes_))


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
                         precision    recall  f1-score   support

               Insomnia       0.26      0.25      0.26        44
             Narcolepsy       0.20      0.04      0.06        28
            No Disorder       0.00      0.00      0.00        14
Obstructive Sleep Apnea       0.29      0.60      0.40        55
  Restless Leg Syndrome       0.22      0.11      0.15        36

               accuracy                           0.28       177
              macro avg       0.20      0.20      0.17       177
           weighted avg       0.23      0.28      0.23       177



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [39]:
def predecir_disorder_dl(row):
    input_data = scaler.transform([row[features]])
    pred = model.predict(input_data)
    clase = np.argmax(pred)
    return le.inverse_transform([clase])[0]

# Ejemplo
print("Predicción DL:", predecir_disorder_dl(df.iloc[0]))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step




Predicción DL: Obstructive Sleep Apnea
