In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import joblib

# --------------------------------
# 1. Load Dataset
# --------------------------------
df = pd.read_csv('million_patients.csv')
print(f"Loaded dataset with {len(df):,} records")

# --------------------------------
# 2. Preprocessing
# --------------------------------
# Drop unwanted columns
df.drop(['patient_id', 'admit_date', 'discharge_date', 'los'], axis=1, inplace=True)

# Encode Categorical Data
label_encoders = {}
categorical_cols = ['gender', 'region', 'diagnosis', 'category', 'medication', 'outcome']

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Convert Symptoms to Multi-Hot Encoding
df['symptoms'] = df['symptoms'].apply(lambda x: x.split(', '))

mlb = MultiLabelBinarizer()
symptom_encoded = pd.DataFrame(mlb.fit_transform(df['symptoms']), columns=mlb.classes_)

# Merge the symptoms into the main DataFrame
df = pd.concat([df.drop('symptoms', axis=1), symptom_encoded], axis=1)

# Scale Numeric Columns
scaler = StandardScaler()
numeric_cols = ['age', 'bmi', 'glucose', 'systolic_bp', 'diastolic_bp', 'wbc']

df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Split Features and Labels
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Split into Train/Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training Size: {len(X_train):,}, Testing Size: {len(X_test):,}")

# --------------------------------
# 3. Build the Model
# --------------------------------
model = Sequential([
    Dense(256, activation='relu', input_shape=(X.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(64, activation='relu'),
    Dense(len(np.unique(y)), activation='softmax')  # Output layer with # of diseases
])

model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# --------------------------------
# 4. Train the Model
# --------------------------------
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=2048,
    callbacks=[early_stopping]
)

# --------------------------------
# 5. Evaluate the Model
# --------------------------------
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

# --------------------------------
# 6. Save the Model and Encoders
# --------------------------------
model.save('disease_prediction_model.h5')
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(mlb, 'multi_label_binarizer.pkl')

print("✅ Model, Encoders, Scaler, and Multi-Hot Encoder saved successfully.")


Loaded dataset with 1,000,000 records
Training Size: 800,000, Testing Size: 200,000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.3396 - loss: 1.8425 - val_accuracy: 0.4475 - val_loss: 0.9689
Epoch 2/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - accuracy: 0.4465 - loss: 0.9781 - val_accuracy: 0.4495 - val_loss: 0.9576
Epoch 3/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.4485 - loss: 0.9660 - val_accuracy: 0.4481 - val_loss: 0.9581
Epoch 4/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.4490 - loss: 0.9625 - val_accuracy: 0.4483 - val_loss: 0.9574
Epoch 5/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.4489 - loss: 0.9613 - val_accuracy: 0.4487 - val_loss: 0.9559
Epoch 6/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.4492 - loss: 0.9594 - val_accuracy: 0.4490 - val_loss: 0.9561
Epoch 7/50
[1m391/39




✅ Model, Encoders, Scaler, and Multi-Hot Encoder saved successfully.


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import joblib

# --------------------------------
# 1. Load Dataset
# --------------------------------
df = pd.read_csv('million_patients.csv')
print(f"Loaded dataset with {len(df):,} records")

# --------------------------------
# 2. Preprocessing
# --------------------------------
# Drop unwanted columns
df.drop(['patient_id', 'admit_date', 'discharge_date', 'los'], axis=1, inplace=True)

# Encode Categorical Data
label_encoders = {}
categorical_cols = ['gender', 'region', 'diagnosis', 'category', 'medication', 'outcome']

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Convert Symptoms to Multi-Hot Encoding
df['symptoms'] = df['symptoms'].apply(lambda x: x.split(', '))

mlb = MultiLabelBinarizer()
symptom_encoded = pd.DataFrame(mlb.fit_transform(df['symptoms']), columns=mlb.classes_)

# Merge the symptoms into the main DataFrame
df = pd.concat([df.drop('symptoms', axis=1), symptom_encoded], axis=1)

# Scale Numeric Columns
scaler = StandardScaler()
numeric_cols = ['age', 'bmi', 'glucose', 'systolic_bp', 'diastolic_bp', 'wbc']

df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Split Features and Labels
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Split into Train/Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training Size: {len(X_train):,}, Testing Size: {len(X_test):,}")

# --------------------------------
# 3. Build the Model
# --------------------------------
model = Sequential([
    Dense(256, activation='relu', input_shape=(X.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(64, activation='relu'),
    Dense(len(np.unique(y)), activation='softmax')  # Output layer with # of diseases
])

model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# --------------------------------
# 4. Train the Model
# --------------------------------
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=2048,
    callbacks=[early_stopping]
)

# --------------------------------
# 5. Evaluate the Model
# --------------------------------
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

# --------------------------------
# 6. Save the Model and Encoders
# --------------------------------
model.save('model/disease_prediction_model.h5')
joblib.dump(label_encoders, 'model/label_encoders.pkl')
joblib.dump(scaler, 'model/scaler.pkl')
joblib.dump(mlb, 'model/multi_label_binarizer.pkl')

print("✅ Model, Encoders, Scaler, and Multi-Hot Encoder saved successfully.")


Loaded dataset with 1,000,000 records
Training Size: 800,000, Testing Size: 200,000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - accuracy: 0.3376 - loss: 1.8664 - val_accuracy: 0.4490 - val_loss: 0.9687
Epoch 2/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.4478 - loss: 0.9773 - val_accuracy: 0.4475 - val_loss: 0.9591
Epoch 3/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.4487 - loss: 0.9665 - val_accuracy: 0.4479 - val_loss: 0.9579
Epoch 4/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.4489 - loss: 0.9629 - val_accuracy: 0.4476 - val_loss: 0.9557
Epoch 5/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.4497 - loss: 0.9604 - val_accuracy: 0.4487 - val_loss: 0.9567
Epoch 6/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - accuracy: 0.4490 - loss: 0.9598 - val_accuracy: 0.4477 - val_loss: 0.9553
Epoch 7/50
[1m391/39



Test Accuracy: 0.4475
✅ Model, Encoders, Scaler, and Multi-Hot Encoder saved successfully.


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import joblib

# -------------------------------------
# 1. Load Dataset
# -------------------------------------
df = pd.read_csv('million_patients.csv')
print(f"Loaded dataset with {len(df):,} records")
print("Columns in dataset:", df.columns.tolist())

# -------------------------------------
# 2. Drop Unnecessary Columns
# -------------------------------------
cols_to_drop = ['patient_id', 'admit_date', 'discharge_date', 'los']
df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True)

# -------------------------------------
# 3. Process Numeric Features with RobustScaler
# -------------------------------------
# Use the available numeric columns from the dataset.
numeric_cols = ['age', 'bmi', 'glucose', 'systolic_bp', 'diastolic_bp', 'wbc']

scaler = RobustScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
joblib.dump(scaler, 'robust_scaler.pkl')
print("Numeric features scaled.")

# -------------------------------------
# 4. Process Categorical Features using LabelEncoder
# -------------------------------------
categorical_cols = ['gender', 'region', 'diagnosis', 'category', 'medication', 'outcome']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
joblib.dump(label_encoders, 'label_encoders.pkl')
print("Categorical features encoded.")

# -------------------------------------
# 5. Process 'symptoms' Using MultiLabelBinarizer
# -------------------------------------
# Assuming the 'symptoms' column contains comma-separated symptom strings
df['symptoms'] = df['symptoms'].apply(lambda x: x.split(', '))
mlb = MultiLabelBinarizer()
symptom_encoded = pd.DataFrame(mlb.fit_transform(df['symptoms']), columns=mlb.classes_)
joblib.dump(mlb, 'multi_label_binarizer.pkl')
df = pd.concat([df.drop('symptoms', axis=1), symptom_encoded], axis=1)
print("Symptoms encoded successfully.")

# -------------------------------------
# 6. Build Feature Matrix and Target
# -------------------------------------
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training size: {X_train.shape}, Testing size: {X_test.shape}")

# If your model requires dense arrays, convert the DataFrames to numpy arrays:
X_train_dense = X_train.to_numpy()
X_test_dense = X_test.to_numpy()

# -------------------------------------
# 7. Build the Deep Neural Network Model
# -------------------------------------
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_dense.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),
    
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    
    Dense(64, activation='relu'),
    Dense(len(np.unique(y)), activation='softmax')  # Output layer with number of unique diagnoses
])

model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# -------------------------------------
# 8. Train the Model
# -------------------------------------
history = model.fit(
    X_train_dense, y_train,
    validation_data=(X_test_dense, y_test),
    epochs=50,
    batch_size=2048,
    callbacks=[early_stopping]
)

# -------------------------------------
# 9. Evaluate the Model
# -------------------------------------
test_loss, test_accuracy = model.evaluate(X_test_dense, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

# -------------------------------------
# 10. Save the Model and Preprocessing Objects
# -------------------------------------
model.save('model2/disease_prediction_model.h5')
joblib.dump(label_encoders, 'model2/label_encoders.pkl')
joblib.dump(scaler, 'model2/robust_scaler.pkl')
joblib.dump(mlb, 'model2/multi_label_binarizer.pkl')

print("✅ Model, Encoders, and Scaler saved successfully.")


Loaded dataset with 1,000,000 records
Columns in dataset: ['patient_id', 'age', 'gender', 'region', 'bmi', 'diagnosis', 'category', 'severity', 'symptoms', 'glucose', 'systolic_bp', 'diastolic_bp', 'wbc', 'admit_date', 'los', 'discharge_date', 'medication', 'outcome']
Numeric features scaled.
Categorical features encoded.
Symptoms encoded successfully.
Training size: (800000, 130), Testing size: (200000, 130)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.3448 - loss: 1.8132 - val_accuracy: 0.4478 - val_loss: 0.9759
Epoch 2/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.4467 - loss: 0.9768 - val_accuracy: 0.4493 - val_loss: 0.9596
Epoch 3/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - accuracy: 0.4488 - loss: 0.9655 - val_accuracy: 0.4491 - val_loss: 0.9565
Epoch 4/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 25ms/step - accuracy: 0.4480 - loss: 0.9637 - val_accuracy: 0.4486 - val_loss: 0.9561
Epoch 5/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.4493 - loss: 0.9607 - val_accuracy: 0.4482 - val_loss: 0.9567
Epoch 6/50
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.4494 - loss: 0.9597 - val_accuracy: 0.4481 - val_loss: 0.9577
Epoch 7/50
[1m391/




✅ Model, Encoders, and Scaler saved successfully.
