<a href="https://colab.research.google.com/github/KusalaniR/MedGen.AI/blob/main/notebooks/Blood_Test_Status_Classification_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import pandas for data handling
import pandas as pd

# Import scikit-learn for ML
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Optional: for saving the trained model
import joblib


In [3]:
#Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load your cleaned, final dataset
df = pd.read_csv("/content/drive/MyDrive/MedGen.AI Datasets/FINALIZED DATASETS/final_merged_all_blood_tests_dataset.csv", encoding='latin1')

# Quick view
df.head()


Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,...,normal_range,unit_x,low_meaning,high_meaning,simple_explanation_en,gender,low_threshold,high_threshold,unit_y,status
0,172061,10014354,29600294.0,1808066,51277,,2148-08-16 00:00:00,2148-08-16 01:30:00,15.4,15.4,...,11ÃÂ15,%,Usually not significant,Variation in red blood cell size,RDW shows how much your red blood cells differ...,Any,11.0,15.0,%,High
1,172062,10014354,29600294.0,1808066,51279,,2148-08-16 00:00:00,2148-08-16 01:30:00,3.35,3.35,...,"4.2ÃÂ5.4 (Female), 4.7ÃÂ6.1 (Male)",m/uL,May reduce oxygen delivery,May thicken blood,Red blood cells carry oxygen from your lungs t...,Any,,,,Normal
2,172050,10014354,29600294.0,1808066,51249,,2148-08-16 00:00:00,2148-08-16 01:30:00,31.1,31.1,...,32ÃÂ36,g/dL,Red blood cells may carry less oxygen,Rarely significant,MCHC shows how concentrated hemoglobin is in r...,Any,32.0,36.0,g/dL,Low
3,172044,10014354,29600294.0,1808066,51221,,2148-08-16 00:00:00,2148-08-16 01:30:00,29.6,29.6,...,"36ÃÂ46% (Female), 41ÃÂ53% (Male)",%,May indicate anemia,May indicate dehydration,Hematocrit shows the percentage of red blood c...,Any,,,,Normal
4,172045,10014354,29600294.0,1808066,51222,,2148-08-16 00:00:00,2148-08-16 01:30:00,9.2,9.2,...,"12ÃÂ15.5 (Female), 13.5ÃÂ17.5 (Male)",g/dL,May indicate low oxygen-carrying capacity,May indicate dehydration,Hemoglobin is a protein in red blood cells tha...,Any,,,,Normal


In [5]:
#prepare features and labels
# Feature columns
# We'll use:
# - valuenum: the lab value
# - test_name: categorical feature (encoded)
# - low_threshold, high_threshold: numeric thresholds

features = df[['valuenum', 'low_threshold', 'high_threshold', 'test_name']]

# Label column
labels = df['status']  # Low, Normal, High


In [6]:
#Encode categorical features
# Convert test_name (categorical) into numeric values
le = LabelEncoder()
features['test_name_encoded'] = le.fit_transform(features['test_name'])

# Drop original test_name column (we now have numeric)
features = features.drop('test_name', axis=1)

features.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['test_name_encoded'] = le.fit_transform(features['test_name'])


Unnamed: 0,valuenum,low_threshold,high_threshold,test_name_encoded
0,15.4,11.0,15.0,8
1,3.35,,,9
2,31.1,32.0,36.0,5
3,29.6,,,2
4,9.2,,,3


In [7]:
#Encode labels
# Convert Low/Normal/High into numbers
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# 0 = High, 1 = Low, 2 = Normal (order may vary, check mapping)
print(list(label_encoder.classes_))  # Just to check mapping


['High', 'Low', 'Normal']


In [8]:
#Split dataset (Train/Test)
# 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    features, labels_encoded, test_size=0.2, random_state=42
)


In [9]:
#Train Decision Tree
# Initialize Decision Tree classifier
clf = DecisionTreeClassifier(
    criterion='entropy',   # Measures information gain
    max_depth=5,           # Avoid overfitting
    random_state=42
)

# Train the model
clf.fit(X_train, y_train)


In [10]:
#Evaluate the model
# Predict on test set
y_pred = clf.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Detailed report
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Model Accuracy: 0.908513593967057
              precision    recall  f1-score   support

        High       1.00      0.78      0.87       920
         Low       1.00      0.50      0.67       513
      Normal       0.89      1.00      0.94      3606

    accuracy                           0.91      5039
   macro avg       0.96      0.76      0.83      5039
weighted avg       0.92      0.91      0.90      5039



In [12]:
#Visualize / Inspect the tree
# Export tree rules in text
tree_rules = export_text(clf, feature_names=list(features.columns))
print(tree_rules)


|--- high_threshold <= inf
|   |--- test_name_encoded <= 2.50
|   |   |--- valuenum <= 99.50
|   |   |   |--- valuenum <= 69.50
|   |   |   |   |--- class: 1
|   |   |   |--- valuenum >  69.50
|   |   |   |   |--- class: 2
|   |   |--- valuenum >  99.50
|   |   |   |--- low_threshold <= 35.00
|   |   |   |   |--- valuenum <= 200.00
|   |   |   |   |   |--- class: 2
|   |   |   |   |--- valuenum >  200.00
|   |   |   |   |   |--- class: 0
|   |   |   |--- low_threshold >  35.00
|   |   |   |   |--- class: 0
|   |--- test_name_encoded >  2.50
|   |   |--- test_name_encoded <= 7.50
|   |   |   |--- valuenum <= 31.95
|   |   |   |   |--- high_threshold <= 34.50
|   |   |   |   |   |--- class: 2
|   |   |   |   |--- high_threshold >  34.50
|   |   |   |   |   |--- class: 1
|   |   |   |--- valuenum >  31.95
|   |   |   |   |--- low_threshold <= 115.00
|   |   |   |   |   |--- class: 2
|   |   |   |   |--- low_threshold >  115.00
|   |   |   |   |   |--- class: 2
|   |   |--- test_name_encod

In [18]:
# Save model for later use in RAG + AI pipeline
joblib.dump(clf, "/content/drive/MyDrive/MedGen.AI Datasets/FINALIZED DATASETS/blood status classifier model/blood_status_classifier.pkl")


['/content/drive/MyDrive/MedGen.AI Datasets/FINALIZED DATASETS/blood status classifier model/blood_status_classifier.pkl']

In [20]:
#Test the model with new rows
import pandas as pd

# New blood test samples (model has NEVER seen these)
test_input = pd.DataFrame({
    'test_name': [
        'Hemoglobin',
        'Glucose',
        'Platelet Count',
        'WBC Count'
    ],
    'valuenum': [
        9.5,     # Low hemoglobin
        180,     # High glucose
        250,     # Normal platelet
        3.2      # Low WBC
    ],
    'low_threshold': [
        12, 70, 150, 4
    ],
    'high_threshold': [
        15.5, 140, 400, 10
    ]
})



In [22]:
# Encode test names - because model cant undertand test so we neet convert them to numbers
test_input['test_name_encoded'] = le.transform(test_input['test_name'])


In [23]:
# Select same feature columns used during training
X_test_manual = test_input[['valuenum', 'low_threshold', 'high_threshold', 'test_name_encoded']]

# Predict
predicted_encoded = clf.predict(X_test_manual)

# Convert numeric prediction back to text labels
predicted_status = label_encoder.inverse_transform(predicted_encoded)

# Show results
test_input['Predicted_Status'] = predicted_status
test_input


Unnamed: 0,test_name,valuenum,low_threshold,high_threshold,test_name_encoded,Predicted_Status
0,Hemoglobin,9.5,12,15.5,3,Normal
1,Glucose,180.0,70,140.0,1,High
2,Platelet Count,250.0,150,400.0,7,Normal
3,WBC Count,3.2,4,10.0,11,Low


In [25]:
#TESTING
# Encode test names using SAME encoder as training
test_input['test_name_encoded'] = le.transform(test_input['test_name'])

# Match training feature order
X_test_manual = test_input[['valuenum', 'low_threshold', 'high_threshold', 'test_name_encoded']]

# Predict
predicted_encoded = clf.predict(X_test_manual)

# Decode prediction
predicted_status = label_encoder.inverse_transform(predicted_encoded)

test_input['Predicted_Status'] = predicted_status
test_input


Unnamed: 0,test_name,valuenum,low_threshold,high_threshold,test_name_encoded,Predicted_Status
0,Hemoglobin,9.5,12,15.5,3,Normal
1,Glucose,180.0,70,140.0,1,High
2,Platelet Count,250.0,150,400.0,7,Normal
3,WBC Count,3.2,4,10.0,11,Low
