In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# load data
df = pd.read_csv('AP_ICD_Lipase_CRP_Amylase_Dataset_Comorbid_Clean.csv')

df.head()

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,in_hospital_death,charttime_crp,crp_level,charttime_amylase,amylase_level,is_confirmed_ap,icd_code,seq_num,icd_version,severity
0,10004606,29242151.0,2159-02-20 18:30:00,-0.044243,2159-02-20 13:43:00,2159-03-06 16:51:00,1.0,0.355612,False,,166.0,,1327.0,True,G40409,1.0,10.0,severe
1,10004606,29242151.0,2159-02-20 18:30:00,-0.044243,2159-02-20 13:43:00,2159-03-06 16:51:00,0.0,0.355612,False,,166.0,,1327.0,True,K8510,2.0,10.0,severe
2,10004606,29242151.0,2159-02-20 18:30:00,-0.044243,2159-02-20 13:43:00,2159-03-06 16:51:00,0.0,0.355612,False,,166.0,,1327.0,True,G9340,3.0,10.0,severe
3,10004606,29242151.0,2159-02-20 18:30:00,-0.044243,2159-02-20 13:43:00,2159-03-06 16:51:00,0.0,0.355612,False,,166.0,,1327.0,True,K8064,4.0,10.0,severe
4,10004606,29242151.0,2159-02-20 18:30:00,-0.044243,2159-02-20 13:43:00,2159-03-06 16:51:00,1.0,0.355612,False,,166.0,,1327.0,True,E871,5.0,10.0,severe


In [2]:
# check unique value in subject_id
df['subject_id'].nunique()

# check in_hospital_death value
df['in_hospital_death'].value_counts()



in_hospital_death
False    72511
True     16405
Name: count, dtype: int64

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np
# each subject_id has multiple rows, we need to aggregate the data
df_clean_new = df.groupby('subject_id').agg({
    'lipase_level': 'mean',
    'crp_level': 'mean',
    'amylase_level': 'mean',
    'age': 'first',
    'gender': 'first',
    'severity': 'last',
}).reset_index()

print(df_clean_new.head())

# Drop unnecessary columns (dates and IDs) for modeling purposes
df_clean = df.drop(columns=['subject_id', 'hadm_id', 'charttime', 'admittime', 'dischtime', 
                            'charttime_crp', 'charttime_amylase', 'icd_code'])
# df_clean = df_clean_new


print(df_clean.head())
# Encode 'severity' (target variable) as numerical
label_encoder = LabelEncoder()
df_clean['severity'] = label_encoder.fit_transform(df_clean['severity'])

# Convert boolean columns to integers (0 and 1)
bool_columns = df_clean.select_dtypes(include='bool').columns
df_clean[bool_columns] = df_clean[bool_columns].astype(int)

# Split dataset into features (X) and target (y)
X = df_clean.drop(columns=['severity'])
y = df_clean['severity']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features for better model performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


   subject_id  lipase_level  crp_level  amylase_level       age  gender  \
0    10004606     -0.044243      166.0         1327.0  0.355612     1.0   
1    10006431     -0.071294      166.0         1327.0  0.478369     1.0   
2    10017531     -0.049983      185.1         1327.0  0.294233     1.0   
3    10021357     -0.055003      166.0         1327.0  2.012836     1.0   
4    10036086     -0.062997      166.0         1327.0 -0.074039     1.0   

   severity  
0    severe  
1  moderate  
2  critical  
3    severe  
4  critical  
   lipase_level  gender       age  in_hospital_death  crp_level  \
0     -0.044243     1.0  0.355612              False      166.0   
1     -0.044243     0.0  0.355612              False      166.0   
2     -0.044243     0.0  0.355612              False      166.0   
3     -0.044243     0.0  0.355612              False      166.0   
4     -0.044243     1.0  0.355612              False      166.0   

   amylase_level  is_confirmed_ap  seq_num  icd_version severi

In [4]:


model = RandomForestClassifier(random_state=42, n_estimators=200, max_depth=15)

model.fit(X_train, y_train)


y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
conf_matrix = confusion_matrix(y_test, y_pred)

accuracy, report, conf_matrix


(0.7995389113810166,
 '              precision    recall  f1-score   support\n\n    critical       0.75      0.93      0.83      7908\n       death       1.00      1.00      1.00      3219\n        mild       1.00      0.38      0.55       196\n    moderate       0.72      0.72      0.72      3968\n      severe       0.96      0.28      0.44      2493\n\n    accuracy                           0.80     17784\n   macro avg       0.89      0.66      0.71     17784\nweighted avg       0.82      0.80      0.78     17784\n',
 array([[7352,    0,    0,  553,    3],
        [   0, 3219,    0,    0,    0],
        [  76,    0,   74,   42,    4],
        [1080,    0,    0, 2869,   19],
        [1249,    0,    0,  539,  705]], dtype=int64))

In [5]:

import pandas as pd
import numpy as np
from xgboost import XGBClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, n_estimators=200, max_depth=15, learning_rate=0.1)


model.fit(X_train, y_train)


y_pred = model.predict(X_test)
 
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


print(classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.99
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7908
           1       1.00      1.00      1.00      3219
           2       0.97      0.97      0.97       196
           3       0.98      0.98      0.98      3968
           4       0.97      0.97      0.97      2493

    accuracy                           0.99     17784
   macro avg       0.98      0.98      0.98     17784
weighted avg       0.99      0.99      0.99     17784



In [6]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=3)


model.fit(X_train, y_train)


y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.71


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report




model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))




Accuracy: 0.64
              precision    recall  f1-score   support

           0       0.60      0.84      0.70      7908
           1       1.00      1.00      1.00      3219
           2       0.00      0.00      0.00       196
           3       0.45      0.38      0.41      3968
           4       0.56      0.00      0.00      2493

    accuracy                           0.64     17784
   macro avg       0.52      0.45      0.42     17784
weighted avg       0.62      0.64      0.58     17784



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
X

Unnamed: 0,lipase_level,gender,age,in_hospital_death,crp_level,amylase_level,is_confirmed_ap,seq_num,icd_version
0,-0.044243,1.0,0.355612,0,166.0,1327.0,1,1.0,10.0
1,-0.044243,0.0,0.355612,0,166.0,1327.0,1,2.0,10.0
2,-0.044243,0.0,0.355612,0,166.0,1327.0,1,3.0,10.0
3,-0.044243,0.0,0.355612,0,166.0,1327.0,1,4.0,10.0
4,-0.044243,1.0,0.355612,0,166.0,1327.0,1,5.0,10.0
...,...,...,...,...,...,...,...,...,...
88911,-0.053033,1.0,-0.503690,1,166.0,1327.0,1,10.0,9.0
88912,-0.053033,1.0,-0.503690,1,166.0,1327.0,1,11.0,9.0
88913,-0.053033,1.0,-0.503690,1,166.0,1327.0,1,12.0,9.0
88914,-0.053033,0.0,-0.503690,1,166.0,1327.0,1,13.0,9.0


In [16]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report




# Identify categorical features present in X
categorical_features = ['gender', 'is_confirmed_ap', 'in_hospital_death']

# Ensure categorical features are of type 'str' or 'category'
for col in categorical_features:
    if col in X.columns:
        X[col] = X[col].astype(str)
    else:
        raise KeyError(f"'{col}' column is missing from the features DataFrame.")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create Pool for CatBoost
train_pool = Pool(X_train, y_train, cat_features=categorical_features)
test_pool = Pool(X_test, y_test, cat_features=categorical_features)

# Initialize and train the model
model = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6, verbose=0)
model.fit(train_pool)

# Make predictions
y_pred = model.predict(test_pool)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


Accuracy: 0.73
              precision    recall  f1-score   support

           0       0.69      0.91      0.79      7908
           1       1.00      1.00      1.00      3219
           2       0.91      0.30      0.45       196
           3       0.61      0.60      0.60      3968
           4       0.83      0.08      0.15      2493

    accuracy                           0.73     17784
   macro avg       0.81      0.58      0.60     17784
weighted avg       0.75      0.73      0.69     17784



In [9]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Convert target variable to one-hot encoded format
y_train_encoded = to_categorical(y_train)
y_test_encoded = to_categorical(y_test)

# Build a DNN model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))  
model.add(Dense(32, activation='relu'))                             
model.add(Dense(16, activation='relu'))                              
model.add(Dense(8, activation='relu'))                             
model.add(Dense(len(np.unique(y_train)), activation='softmax'))      

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_encoded, epochs=100, batch_size=10, validation_split=0.2)

# Make predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Calculate accuracy and display the classification report
accuracy = accuracy_score(y_test, y_pred_classes)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred_classes))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m5691/5691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step - accuracy: 0.6317 - loss: 0.8464 - val_accuracy: 0.6717 - val_loss: 0.7464
Epoch 2/100
[1m5691/5691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.6786 - loss: 0.7384 - val_accuracy: 0.6780 - val_loss: 0.7282
Epoch 3/100
[1m5691/5691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - accuracy: 0.6809 - loss: 0.7285 - val_accuracy: 0.6812 - val_loss: 0.7241
Epoch 4/100
[1m5691/5691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - accuracy: 0.6788 - loss: 0.7207 - val_accuracy: 0.6796 - val_loss: 0.7247
Epoch 5/100
[1m5691/5691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - accuracy: 0.6811 - loss: 0.7182 - val_accuracy: 0.6879 - val_loss: 0.7136
Epoch 6/100
[1m5691/5691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - accuracy: 0.6834 - loss: 0.7131 - val_accuracy: 0.6859 - val_loss: 0.7144
Epoch 7/