In [21]:
import pandas as pd
import numpy as np
import json
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

# Load files
signals_df = pd.read_csv("/content/master_features.csv")
diagnoses_df = pd.read_csv("/content/DIAGNOSES_ICD.csv")
with open("/content/structured_lab_results (2).json", "r") as file:
    ocr_data = json.load(file)

In [22]:
# Step 1: Aggregate lab values
lab_records = []
for record in ocr_data:
    hadm_id = record['hadm_id']
    results = record['lab_results']
    latest = {}
    for res in results:
        label = res['label']
        value = res['value']
        charttime = res['charttime']
        if label not in latest or charttime > latest[label][1]:
            latest[label] = (value, charttime)
    row = {'hadm_id': hadm_id}
    for label, (value, _) in latest.items():
        row[label] = value
    lab_records.append(row)
labs_df = pd.DataFrame(lab_records)

# Step 2: Merge features
merged = pd.merge(signals_df, labs_df, on="hadm_id", how="inner")
diagnoses_grouped = diagnoses_df.groupby("hadm_id")["icd9_code"].apply(list).reset_index()
merged = pd.merge(merged, diagnoses_grouped, on="hadm_id", how="inner")

# Step 3: Filter top ICD codes (â‰¥30 cases)
all_labels = [label for sublist in merged["icd9_code"] for label in sublist]
label_counts = Counter(all_labels)
top_labels = set([label for label, count in label_counts.items() if count >= 30])
merged["icd9_code"] = merged["icd9_code"].apply(lambda codes: [c for c in codes if c in top_labels])
merged = merged[merged["icd9_code"].map(len) > 0]

# Step 4: Prepare features
X = merged.drop(columns=["hadm_id", "icd9_code"])
X = pd.get_dummies(X)
X = X.dropna(axis=1, how='all')
imputer = SimpleImputer(strategy="mean")
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Step 5: Prepare labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(merged["icd9_code"])

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Step 7: Train model
model = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

# Step 8: Predict
probs_list = model.predict_proba(X_test)
y_pred_bin = np.zeros((X_test.shape[0], len(probs_list)), dtype=int)
for j, class_probs in enumerate(probs_list):
    y_pred_bin[:, j] = (class_probs[:, 1] > 0.3).astype(int)

# Step 9: Load ICD9 descriptions from your file
icd9_df = pd.read_csv("/content/icd9.txt", sep="\t", encoding='latin-1') # Specify encoding as 'latin-1'
icd9_df.columns = icd9_df.columns.str.strip()
icd9_df["DIAGNOSIS CODE"] = icd9_df["DIAGNOSIS CODE"].astype(str).str.strip()
icd9_map = dict(zip(icd9_df["DIAGNOSIS CODE"], icd9_df["LONG DESCRIPTION"]))

# Step 10: Decode labels to names
def decode_labels(icd_list):
    return [icd9_map.get(code, f"ICD9-{code}") for code in icd_list]

# Step 11: Evaluation report
target_names = [icd9_map.get(code, f"ICD9-{code}") for code in mlb.classes_]
print("\nâœ… Final Model Evaluation Report (Disease Names):\n")
print(classification_report(y_test, y_pred_bin, target_names=target_names, zero_division=0))

# Step 12: Sample predictions with disease names
true_diseases = mlb.inverse_transform(y_test)
pred_diseases = mlb.inverse_transform(y_pred_bin)

for i in range(5):
    print(f"\nðŸ§ª Sample {i + 1}")
    print("âœ… True Diseases     :", decode_labels(true_diseases[i]))
    print("ðŸ¤– Predicted Diseases:", decode_labels(pred_diseases[i]))



âœ… Final Model Evaluation Report (Disease Names):

                                                                                                            precision    recall  f1-score   support

Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled       1.00      0.86      0.92         7
                                                                        Unspecified essential hypertension       0.40      1.00      0.57         8
                                                                                       Atrial fibrillation       0.62      1.00      0.77        10
                                                                     Congestive heart failure, unspecified       0.85      1.00      0.92        11
                                                                                 Acute respiratory failure       0.67      1.00      0.80         6
                                                          

In [25]:
import joblib

# === Save trained model ===
joblib.dump(model, "/content/disease_model.pkl")
print("âœ… Model saved as disease_model.pkl")

# === Save predictions to CSV ===
rows = []
for i, (true_codes, pred_codes) in enumerate(zip(true_diseases, pred_diseases), 1):
    true_names = "; ".join(decode_labels(true_codes))
    pred_names = "; ".join(decode_labels(pred_codes))
    rows.append({"Sample": i, "True Diseases": true_names, "Predicted Diseases": pred_names})

pred_df = pd.DataFrame(rows)
pred_df.to_csv("/content/predictions.csv", index=False)
print("âœ… Predictions saved as predictions.csv")


âœ… Model saved as disease_model.pkl
âœ… Predictions saved as predictions.csv
