In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
import joblib


In [2]:
# Read the length-of-stay records from the CSV file (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer="LengthOfStay.csv")

In [3]:
# Names of the columns that are removed from the frame before modelling.
drop_columns = [
    'vdate',
    'discharged',
    'eid',
    'substancedependence',
    'psychother',
    'fibrosisandother',
    'neutrophils',
    'bloodureanitro',
    'secondarydiagnosisnonicd9',
    'facid',
]

In [4]:
# Remove every column named in drop_columns; the trimmed frame replaces df.
df = df.drop(labels=drop_columns, axis="columns")

In [5]:
# Discard every COLUMN (not row) that holds at least one missing value, so
# df contains no NaN once this cell has run.
df = df.dropna(axis="columns", how="any")

In [6]:
# Convert categorical (object-dtype) columns to integer codes, if any exist.
# sort=True numbers the categories by their sorted value instead of by order
# of first appearance, so the code assigned to a category does not depend on
# how the rows of the CSV happen to be ordered and the mapping can be
# reproduced later from the set of category values alone.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = pd.factorize(df[col], sort=True)[0]

In [7]:
# Mean imputation applied to every column currently in df (the target column
# 'lengthofstay' is still part of the frame at this point). fit_transform
# hands back a bare array, so the result is wrapped in a new DataFrame that
# reuses the original column labels.
# NOTE(review): the earlier dropna(axis=1) already removed every column with
# missing values, so there appears to be nothing left to fill here - confirm.
# NOTE(review): the imputer is fitted on the full frame before the train/test
# split - confirm this is intended.
imputer = SimpleImputer(strategy='mean')
column_labels = df.columns
imputed_values = imputer.fit_transform(df)
df = pd.DataFrame(data=imputed_values, columns=column_labels)

In [8]:
# Second imputation pass, this time filling with each column's most frequent
# value; `imputer` is rebound to this new instance and df is rebuilt with the
# same column labels.
# NOTE(review): the mean-imputation cell above has already processed every
# column, so this pass presumably has no missing values left to fill - confirm
# whether it is still needed.
imputer = SimpleImputer(strategy='most_frequent')
column_labels = df.columns
imputed_values = imputer.fit_transform(df)
df = pd.DataFrame(data=imputed_values, columns=column_labels)

In [9]:
# Separate the target from the features: 'lengthofstay' is assumed to be the
# target column, everything else in df is used as a feature.
target_column = 'lengthofstay'
y = df[target_column]
X = df.drop(columns=[target_column])

In [11]:
# Hold out 20% of the rows as a test set. stratify=y asks for the target's
# class proportions to be preserved in both partitions, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [13]:
# Standardise the features. The scaler is fitted on the training split only;
# the statistics learned there are then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Random-forest configuration: 200 trees capped at depth 15, class weights set
# to 'balanced', and a fixed seed so training is repeatable.
rf_params = {
    "n_estimators": 200,
    "max_depth": 15,
    "class_weight": 'balanced',
    "random_state": 42,
}
model = RandomForestClassifier(**rf_params)
# Train on the scaled training features and their labels.
model.fit(X_train_scaled, y_train)


In [27]:
# Predict a label for every row of the scaled hold-out features.
y_pred = model.predict(X=X_test_scaled)

In [29]:
# Evaluate the predictions on the hold-out set.
# Precision, recall and F1 are all support-weighted averages over the classes
# and share the same zero_division handling, so a class that is never
# predicted scores 0 for each of them without emitting an undefined-metric
# warning (previously only F1 was missing zero_division=0).
weighted_kwargs = {"average": 'weighted', "zero_division": 0}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_kwargs)
recall = recall_score(y_test, y_pred, **weighted_kwargs)
f1 = f1_score(y_test, y_pred, **weighted_kwargs)

# Report each score to four decimal places.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f'{metric_name}: {metric_value:.4f}')

Accuracy: 0.5260
Precision: 0.5122
Recall: 0.5260
F1 Score: 0.5069


In [31]:
# Persist the fitted classifier and the fitted scaler as two separate files in
# the working directory. Both are kept as individual statements so the cell
# still echoes the return value of the last dump call.
joblib.dump(value=model, filename="health_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")

['scaler.pkl']

In [33]:
# Display the feature matrix X as the cell output (the rendering below is
# truncated, showing only the first and last rows).
X

Unnamed: 0,rcount,gender,asthma,irondef,pneum,psychologicaldisordermajor,depress,malnutrition,hemo,sodium,glucose,creatinine,bmi,pulse,respiration
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.361132,192.476918,1.390722,30.432418,96.0,6.5
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,136.731692,94.078507,0.943164,28.460516,61.0,6.5
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,133.058514,130.530524,1.065750,28.843812,64.0,6.5
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,138.994023,163.377028,0.906862,27.959007,76.0,6.5
4,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,138.634836,94.886654,1.242854,30.258927,67.0,5.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,132.614977,171.422555,0.650323,30.063069,80.0,6.5
99996,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,138.327320,122.342450,1.521424,28.969548,61.0,6.5
99997,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,136.695905,108.288106,1.025677,26.354919,61.0,6.9
99998,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,135.980516,111.750731,1.035400,29.193462,59.0,5.6
