In [None]:
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/AI03 GP/Autoimmune_Disorder_10k_with_All_Disorders.csv"
df = pd.read_csv(file_path)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Patient_ID,Age,Gender,Diagnosis,Sickness_Duration_Months,RBC_Count,Hemoglobin,Hematocrit,MCV,MCH,...,Anti_TIF1,Anti_epidermal_basement_membrane_IgA,Anti_OmpC,pANCA,Anti_tissue_transglutaminase,anti_Scl_70,Anti_Mi2,Anti_parietal_cell,Progesterone_antibodies,Anti_Sm
0,1,65,Female,Linear IgA disease,26,3.54,13.18,44.35,99.85,31.8,...,0,1,0,0,0,0,0,0,0,0
1,2,61,Male,Dermatomyositis,106,4.97,14.73,42.39,94.86,29.94,...,1,0,0,0,0,0,1,0,0,0
2,3,42,Male,Ord's thyroiditis,6,3.91,13.66,42.41,88.26,29.43,...,0,0,0,0,0,0,0,0,0,0
3,4,45,Female,Restless legs syndrome,11,4.57,14.96,40.1,85.95,27.42,...,0,0,0,0,0,0,0,0,0,0
4,5,78,Female,Autoimmune polyendocrine syndrome type 2 (APS2),30,4.88,11.21,37.58,93.16,29.87,...,0,0,0,0,0,0,0,0,0,0


In [None]:
X = df.drop(columns=["Diagnosis"])
y = df["Diagnosis"]


In [None]:
print(y.value_counts())


Diagnosis
Normal                        2500
Endometriosis                  184
Dermatomyositis                183
Autoimmune oophoritis          170
Autoimmune orchitis            162
                              ... 
Sydenham's chorea               68
Lichen sclerosus                68
Rheumatic heart disease         67
Polyarteritis nodosa (PAN)      63
Neuromyotonia                   62
Name: count, Length: 116, dtype: int64


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), num_cols),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols)
    ]
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

base_model = Pipeline([
    ("prep", preprocessor),
    ("clf", LogisticRegression(
        multi_class="multinomial",
        max_iter=2000,
        class_weight="balanced"  # important for rare diseases
    ))
])

model = CalibratedClassifierCV(
    base_model,
    method="isotonic",
    cv=5
)

model.fit(X_train, y_train)




In [None]:
def predict_disease_risk(patient_row, model):
    probs = model.predict_proba(patient_row)[0]
    classes = model.classes_

    risk_scores = {
        cls: round(prob * 100, 1)
        for cls, prob in zip(classes, probs)
    }
    return dict(sorted(risk_scores.items(), key=lambda x: -x[1]))


In [None]:
patient = X.iloc[[0]]  # one patient
risks = predict_disease_risk(patient, model)

for disease, score in risks.items():
    print(f"{disease}: {score}%")


Linear IgA disease: 61.4%
Giant cell arteritis: 1.1%
IgA nephropathy: 1.1%
Alopecia areata: 1.0%
Autoimmune orchitis: 0.9%
Autoimmune oophoritis: 0.8%
Endometriosis: 0.8%
Scleritis: 0.7%
Addison's disease: 0.6%
Antiphospholipid syndrome: 0.6%
Autoimmune polyendocrine syndrome type 3 (APS3): 0.6%
Myasthenia gravis: 0.6%
Polymyalgia rheumatica: 0.6%
Polymyositis: 0.6%
Sarcoidosis: 0.6%
Thrombotic thrombocytopenic purpura: 0.6%
Tolosa–Hunt syndrome: 0.6%
Vasculitis: 0.6%
Acute disseminated encephalomyelitis: 0.5%
Autoimmune retinopathy: 0.5%
Eosinophilic granulomatosis with polyangiitis (EGPA): 0.5%
Graves' disease: 0.5%
Guillain-Barré syndrome: 0.5%
Hidradenitis suppurativa: 0.5%
Immune thrombocytopenia: 0.5%
Inclusion body myositis: 0.5%
Interstitial lung disease: 0.5%
Interstitial nephritis: 0.5%
Microscopic polyangiitis (MPA): 0.5%
Mooren's ulcer: 0.5%
Multiple sclerosis: 0.5%
Opsoclonus myoclonus syndrome: 0.5%
Rheumatoid vasculitis: 0.5%
Undifferentiated connective tissue disease (U

In [None]:
from sklearn.metrics import roc_auc_score

probs = model.predict_proba(X_test)
print("Macro AUROC:", roc_auc_score(y_test, probs, multi_class="ovr"))


Macro AUROC: 0.7189831219367757


In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV

base_model = Pipeline([
    ("prep", preprocessor),
    ("clf", HistGradientBoostingClassifier(
        max_depth=6,
        learning_rate=0.05,
        max_iter=300
    ))
])

model = CalibratedClassifierCV(
    base_model,
    method="isotonic",
    cv=5
)

model.fit(X_train, y_train)


In [None]:
def predict_disease_risk(patient_row, model):
    probs = model.predict_proba(patient_row)[0]
    classes = model.classes_

    risk_scores = {
        cls: round(prob * 100, 1)
        for cls, prob in zip(classes, probs)
    }
    return dict(sorted(risk_scores.items(), key=lambda x: -x[1]))


In [None]:
patient = X.iloc[[0]]  # one patient
risks = predict_disease_risk(patient, model)

for disease, score in risks.items():
    print(f"{disease}: {score}%")


Linear IgA disease: 60.7%
Endometriosis: 1.1%
Vasculitis: 1.0%
Autoimmune oophoritis: 0.8%
Autoimmune orchitis: 0.8%
Giant cell arteritis: 0.8%
Sarcoidosis: 0.8%
Thrombotic thrombocytopenic purpura: 0.8%
Alopecia areata: 0.7%
Morphea: 0.7%
Myasthenia gravis: 0.7%
Autoimmune polyendocrine syndrome type 3 (APS3): 0.6%
Hidradenitis suppurativa: 0.6%
IgA nephropathy: 0.6%
Immune thrombocytopenia: 0.6%
Inclusion body myositis: 0.6%
Lupus vasculitis: 0.6%
Premature ovarian failure: 0.6%
Addison's disease: 0.5%
Antiphospholipid syndrome: 0.5%
Balo concentric sclerosis: 0.5%
Eosinophilic granulomatosis with polyangiitis (EGPA): 0.5%
Fibromyalgia: 0.5%
Interstitial cystitis: 0.5%
Lambert–Eaton myasthenic syndrome: 0.5%
Mooren's ulcer: 0.5%
Opsoclonus myoclonus syndrome: 0.5%
Primary sclerosing cholangitis: 0.5%
Rheumatoid arthritis: 0.5%
Rheumatoid vasculitis: 0.5%
Scleritis: 0.5%
Tolosa–Hunt syndrome: 0.5%
Undifferentiated connective tissue disease (UCTD): 0.5%
Anti-NMDA receptor encephalitis:

In [None]:
from sklearn.metrics import roc_auc_score

probs = model.predict_proba(X_test)
print("Macro AUROC:", roc_auc_score(y_test, probs, multi_class="ovr"))


Macro AUROC: 0.7230714222418866


In [None]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", round(accuracy, 3))


Accuracy: 0.34


In [None]:
import numpy as np

probs = model.predict_proba(X_test)
classes = model.classes_

def topk_accuracy(k):
    topk = np.argsort(probs, axis=1)[:, -k:]
    return np.mean([
        y_test.iloc[i] in classes[topk[i]]
        for i in range(len(y_test))
    ])

for k in [2, 3, 5]:
    print(f"Top-{k} Accuracy:", round(topk_accuracy(k), 3))


Top-2 Accuracy: 0.355
Top-3 Accuracy: 0.372
Top-5 Accuracy: 0.398


In [None]:
import pandas as pd

pd.Series(y_pred).value_counts(normalize=True)


Unnamed: 0,proportion
Normal,0.2000
Endometriosis,0.1584
Autoimmune orchitis,0.0716
Autoimmune oophoritis,0.0672
Myasthenia gravis,0.0192
...,...
Autoimmune retinopathy,0.0008
Urticarial vasculitis,0.0004
Myositis,0.0004
Diabetes mellitus type 1,0.0004


Stage one "normal vs all"