In [1]:
import pickle

import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Path of the raw CSV, resolved relative to the notebook's working directory.
file_path = "LengthOfStay.csv"

# Read the whole file into `df`, the frame the following cells clean and model.
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Columns that are imputed with the most frequent value and label-encoded.
categorical_cols = ['gender', 'rcount', 'facid']

# Every column pandas reports as numeric; these are imputed with the median.
# NOTE(review): this selection is purely dtype-based, so it presumably also
# captures the target `lengthofstay` if it is read as numeric - confirm.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)


In [7]:
# Fill missing numeric values with each column's median.
imputer_num = SimpleImputer(strategy='median')
imputer_num.fit(df[numeric_cols])
df[numeric_cols] = imputer_num.transform(df[numeric_cols])


In [9]:
# Fill missing categorical values with each column's most frequent value.
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_cat.fit(df[categorical_cols])
df[categorical_cols] = imputer_cat.transform(df[categorical_cols])

In [11]:
# Encode each categorical column as integer codes.
#
# A separate LabelEncoder is fitted per column and kept in `label_encoders`
# (keyed by column name). Re-fitting one shared encoder would overwrite its
# `classes_` on every iteration, leaving only the last column's mapping and
# making it impossible to encode new data or invert the codes for the others.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Cast to str so mixed-type values are encoded consistently.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le
# After the loop `le` still refers to the encoder of the last column, exactly
# as it did when a single shared encoder was used.


In [13]:
# Turn the stay length into a binary target: 1 when the stay is longer than
# 3, otherwise 0. A literal '5+' entry is first mapped to 5 so the column can
# be cast to float.
# Caution: this overwrites the column in place, so running the cell a second
# time compares the 0/1 labels against 3 and yields all zeros.
stay_length = df['lengthofstay'].replace({'5+': 5}).astype(float)
df['lengthofstay'] = stay_length.gt(3).astype(int)


In [15]:
# Remove the 'vdate' and 'discharged' columns before modelling.
# errors='ignore' keeps the cell re-runnable: `df` is rebound in place, so on
# a second execution the columns are already gone and a plain drop would
# raise KeyError.
df = df.drop(columns=['vdate', 'discharged'], errors='ignore')


In [17]:
# Preview the first rows of the processed frame (imputed, encoded, binarised
# target, 'vdate'/'discharged' removed).
df.head()

Unnamed: 0,eid,rcount,gender,dialysisrenalendstage,asthma,irondef,pneum,substancedependence,psychologicaldisordermajor,depress,...,sodium,glucose,bloodureanitro,creatinine,bmi,pulse,respiration,secondarydiagnosisnonicd9,facid,lengthofstay
0,1.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,140.361132,192.476918,12.0,1.390722,30.432418,96.0,6.5,4.0,1,0
1,2.0,5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,136.731692,94.078507,8.0,0.943164,28.460516,61.0,6.5,1.0,0,1
2,3.0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,133.058514,130.530524,12.0,1.06575,28.843812,64.0,6.5,2.0,1,0
3,4.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,138.994023,163.377028,12.0,0.906862,27.959007,76.0,6.5,1.0,0,0
4,5.0,0,0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,138.634836,94.886654,11.5,1.242854,30.258927,67.0,5.6,2.0,4,1


In [19]:
# Separate the binary target from the feature matrix.
target_col = 'lengthofstay'
y = df[target_col]
# Every remaining column is used as a feature.
# NOTE(review): this includes `eid`, which looks like a sequential record id
# in the displayed rows - confirm whether it should be a model input.
X = df.drop(columns=[target_col])


In [21]:
# Standardise the features using statistics from the training rows only.
#
# Fitting the scaler on the full dataset lets the mean/variance of the test
# rows leak into preprocessing. The training rows are therefore selected here
# with the same test_size and random_state as the train/test split applied to
# `X_scaled` afterwards; for the same number of samples those settings produce
# the same partition, so the scaler never sees the evaluation rows.
# Keep these two parameters in sync with that later split.
train_rows, holdout_rows = train_test_split(X, test_size=0.2, random_state=42)

scaler = StandardScaler()
scaler.fit(train_rows)

# Transform every row (train and test) with the training-set statistics;
# `X_scaled` is a NumPy array aligned row-for-row with `X` and `y`.
X_scaled = scaler.transform(X)


In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the partition
# reproducible.
split_parts = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [25]:
# Logistic-regression classifier: L-BFGS solver, at most 500 iterations,
# fixed seed.
logreg_params = {
    'max_iter': 500,
    'solver': 'lbfgs',
    'random_state': 42,
}
model = LogisticRegression(**logreg_params)

# Train on the scaled training split; the fitted estimator is the cell output.
model.fit(X_train, y_train)


In [27]:
# Predicted class labels for the held-out rows.
y_pred = model.predict(X=X_test)

In [29]:
# Overall fraction of correct predictions on the test split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# Per-class precision / recall / F1 as a formatted text table.
report = classification_report(y_true=y_test, y_pred=y_pred)

In [31]:
# Show the accuracy as a percentage with two decimals, then the full report.
accuracy_line = f"Model Accuracy: {accuracy * 100:.2f}%"
print(accuracy_line)
print("Classification Report:\n", report)


Model Accuracy: 86.37%
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.91      0.86      9340
           1       0.91      0.82      0.87     10660

    accuracy                           0.86     20000
   macro avg       0.87      0.87      0.86     20000
weighted avg       0.87      0.86      0.86     20000



In [76]:
# Save the trained model.
# `pickle` is imported in the notebook's top import cell together with the
# other dependencies, so this cell no longer carries its own import.
# The file is opened in binary mode and closed by the context manager even if
# serialisation fails.
with open("hospital_length_of_stay_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

print("Model saved successfully!")


Model saved successfully!


In [33]:
# Persist the fitted scaler next to the model so the same standardisation can
# be applied at prediction time; the cell output lists the written file.
joblib.dump(value=scaler, filename="scaler.pkl")


['scaler.pkl']

In [35]:
# Display the (unscaled) feature frame; pandas truncates the rendering to the
# first and last rows/columns.
X

Unnamed: 0,eid,rcount,gender,dialysisrenalendstage,asthma,irondef,pneum,substancedependence,psychologicaldisordermajor,depress,...,neutrophils,sodium,glucose,bloodureanitro,creatinine,bmi,pulse,respiration,secondarydiagnosisnonicd9,facid
0,1.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,14.20,140.361132,192.476918,12.0,1.390722,30.432418,96.0,6.5,4.0,1
1,2.0,5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.10,136.731692,94.078507,8.0,0.943164,28.460516,61.0,6.5,1.0,0
2,3.0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.90,133.058514,130.530524,12.0,1.065750,28.843812,64.0,6.5,2.0,1
3,4.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.40,138.994023,163.377028,12.0,0.906862,27.959007,76.0,6.5,1.0,0
4,5.0,0,0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,9.05,138.634836,94.886654,11.5,1.242854,30.258927,67.0,5.6,2.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996.0,3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.30,132.614977,171.422555,12.0,0.650323,30.063069,80.0,6.5,1.0,1
99996,99997.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.30,138.327320,122.342450,12.0,1.521424,28.969548,61.0,6.5,1.0,1
99997,99998.0,1,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,7.70,136.695905,108.288106,12.0,1.025677,26.354919,61.0,6.9,1.0,2
99998,99999.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,8.20,135.980516,111.750731,16.0,1.035400,29.193462,59.0,5.6,1.0,1


In [39]:
# Display the binary target series (1 = stay longer than 3, else 0).
y

0        0
1        1
2        0
3        0
4        1
        ..
99995    1
99996    0
99997    1
99998    1
99999    0
Name: lengthofstay, Length: 100000, dtype: int32