In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Define dataset paths
italy_path = "/content/drive/My Drive/Italy.xlsx"
india_path = "/content/drive/My Drive/India.xlsx"
dataset_path = "/content/drive/MyDrive/dataset anemia-20250321T094817Z-001/dataset anemia"

In [3]:
import os

if os.path.exists(italy_path) and os.path.exists(india_path):
    print("✅ Label files found!")
else:
    print("❌ Files missing! Check Google Drive path.")

✅ Label files found!


In [4]:
import pandas as pd

# Load the labels
italy_df = pd.read_excel("/content/drive/My Drive/Italy.xlsx")
india_df = pd.read_excel("/content/drive/My Drive/India.xlsx")

# Convert "Number" column to string for proper folder matching
italy_df["Number"] = italy_df["Number"].astype(str)
india_df["Number"] = india_df["Number"].astype(str)

# Add country identifiers
italy_df["Country"] = "Italy"
india_df["Country"] = "India"

# Combine both datasets
labels_df = pd.concat([italy_df, india_df], ignore_index=True)

# 🔹 Clean the "Hgb" column
labels_df["Hgb"] = labels_df["Hgb"].astype(str)  # Convert to string
labels_df["Hgb"] = labels_df["Hgb"].str.replace(",", ".")  # Replace commas with dots
labels_df["Hgb"] = labels_df["Hgb"].str.extract(r'(\d+\.\d+|\d+)')  # Keep only numbers
labels_df = labels_df.dropna(subset=["Hgb"])  # Drop missing values
labels_df["Hgb"] = labels_df["Hgb"].astype(float)  # Convert to float

# Define anemia threshold based on gender
def classify_anemia(row):
    if row["Gender"] == "M":
        return 1 if row["Hgb"] < 13 else 0
    else:
        return 1 if row["Hgb"] < 12 else 0

# Apply anemia classification
labels_df["Anemia"] = labels_df.apply(classify_anemia, axis=1)

# Save processed labels
labels_df.to_csv("/content/drive/My Drive/processed_labels.csv", index=False)

# Display processed labels
print(labels_df)

    Number   Hgb Gender  Age Country  Anemia
0        1   9.3      F   82   Italy       1
1        2  10.2      F   77   Italy       1
2        3  10.7      F   52   Italy       1
3        4  11.7      F   73   Italy       1
4        5  11.6      F   74   Italy       1
..     ...   ...    ...  ...     ...     ...
213     91  13.4      M   21   India       0
214     92  13.7      F   55   India       0
215     93  12.7      M   29   India       1
216     94  11.1      F   53   India       1
217     95  12.4      F   32   India       0

[217 rows x 6 columns]


In [5]:
import os

# Define dataset paths
dataset_path = "/content/drive/MyDrive/dataset anemia-20250321T094817Z-001/dataset anemia"
countries = ["India", "Italy"]

# Count individuals and images
for country in countries:
    country_path = os.path.join(dataset_path, country)

    if not os.path.exists(country_path):
        print(f"❌ Error: {country_path} not found!")
        continue

    print(f"\n✅ Checking {country} dataset...")
    individuals = os.listdir(country_path)
    print(f"Total individuals in {country}: {len(individuals)}")

    for person_id in individuals[:5]:  # Show only first 5 individuals
        person_folder = os.path.join(country_path, person_id)
        if os.path.isdir(person_folder):
            images = os.listdir(person_folder)
            print(f"  - Person {person_id} has {len(images)} images: {images}")


✅ Checking India dataset...
Total individuals in India: 95
  - Person 92 has 4 images: ['20200312_183626_palpebral.png', '20200312_183626_forniceal_palpebral.png', '20200312_183626_forniceal.png', '20200312_183626.jpg']
  - Person 91 has 4 images: ['20200302_231050.jpg', '20200302_231050_palpebral.png', '20200302_231050_forniceal_palpebral.png', '20200302_231050_forniceal.png']
  - Person 93 has 4 images: ['20200315_161347.jpg', '20200315_161347_palpebral.png', '20200315_161347_forniceal_palpebral.png', '20200315_161347_forniceal.png']
  - Person 9 has 4 images: ['20200131_205917_forniceal.png', '20200131_205917.jpg', '20200131_205917_forniceal_palpebral.png', '20200131_205917_palpebral.png']
  - Person 90 has 4 images: ['20200302_225922_palpebral.png', '20200302_225922.jpg', '20200302_225922_forniceal_palpebral.png', '20200302_225922_forniceal.png']

✅ Checking Italy dataset...
Total individuals in Italy: 123
  - Person 98 has 4 images: ['T_67_20190613_080656_forniceal.png', 'T_67_20

In [6]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm  # For progress tracking

# Define dataset path
# dataset_path = "/content/drive/My Drive/dataset anemia"
# The above path seems incorrect. Please update with correct path
dataset_path = "/content/drive/MyDrive/dataset anemia-20250321T094817Z-001/dataset anemia"
processed_labels_path = "/content/drive/My Drive/processed_labels.csv"
IMG_SIZE = (224, 224)  # Image resize dimensions

# Load anemia labels
labels_df = pd.read_csv(processed_labels_path, dtype={'Number': str})

# Initialize lists to store image data and labels
image_data = []
labels = []

# Process images for both India and Italy datasets
for country in ["India", "Italy"]:
    country_path = os.path.join(dataset_path, country)

    for person_id in tqdm(os.listdir(country_path), desc=f"Processing {country} images"):
        person_folder = os.path.join(country_path, person_id)

        if os.path.isdir(person_folder):
            # Check if this individual has a corresponding label
            label_row = labels_df[(labels_df["Number"] == person_id) & (labels_df["Country"] == country)]
            if label_row.empty:
                print(f"❌ Warning: No label found for {country} - Person {person_id}. Skipping...")
                continue

            # Get anemia label (0 = No Anemia, 1 = Anemia)
            anemia_label = label_row["Anemia"].values[0]

            # Process all images in the folder
            person_images = []
            image_files = [os.path.join(person_folder, img) for img in os.listdir(person_folder) if img.endswith(('.jpg', '.png'))]

            # Ensure exactly 4 images per person (skip if more or less)
            if len(image_files) == 4:
                for img_path in image_files:
                    img = cv2.imread(img_path)
                    if img is not None:
                        img = cv2.resize(img, IMG_SIZE)  # Resize image
                        img = img / 255.0  # Normalize pixel values
                        person_images.append(img)

                # Add data if all 4 images are processed
                if len(person_images) == 4:
                    image_data.append(np.array(person_images))
                    labels.append(anemia_label)
            else:
                print(f"⚠ Skipping {person_id} - Found {len(image_files)} images instead of 4.")

# Convert lists to NumPy arrays
image_data = np.array(image_data, dtype=np.float32)
labels = np.array(labels, dtype=np.int32)

# Save preprocessed data
np.save("/content/drive/My Drive/image_data.npy", image_data)
np.save("/content/drive/My Drive/labels.npy", labels)

print(f"✅ Image data shape: {image_data.shape}")  # Expected: (Total Individuals, 4, 224, 224, 3)
print(f"✅ Labels shape: {labels.shape}")  # Expected: (Total Individuals,)

Processing India images:  26%|██▋       | 25/95 [00:57<02:59,  2.57s/it]

⚠ Skipping 8 - Found 6 images instead of 4.


Processing India images: 100%|██████████| 95/95 [03:39<00:00,  2.31s/it]
Processing Italy images:   2%|▏         | 2/123 [00:04<04:26,  2.20s/it]



Processing Italy images:  20%|██        | 25/123 [00:54<03:50,  2.35s/it]

⚠ Skipping 75 - Found 2 images instead of 4.


Processing Italy images:  33%|███▎      | 40/123 [01:26<03:08,  2.28s/it]

⚠ Skipping 54 - Found 2 images instead of 4.


Processing Italy images:  38%|███▊      | 47/123 [01:41<02:51,  2.25s/it]

⚠ Skipping 58 - Found 2 images instead of 4.


Processing Italy images:  57%|█████▋    | 70/123 [02:33<02:19,  2.64s/it]

⚠ Skipping 35 - Found 2 images instead of 4.


Processing Italy images:  95%|█████████▌| 117/123 [04:16<00:12,  2.10s/it]

⚠ Skipping 109 - Found 2 images instead of 4.


Processing Italy images:  98%|█████████▊| 121/123 [04:23<00:03,  1.91s/it]

⚠ Skipping 1 - Found 2 images instead of 4.


Processing Italy images: 100%|██████████| 123/123 [04:25<00:00,  2.16s/it]


✅ Image data shape: (210, 4, 224, 224, 3)
✅ Labels shape: (210,)


In [7]:
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

# Load the processed dataset
X = np.load("/content/drive/My Drive/image_data.npy")  # Shape: (210, 4, 224, 224, 3)
y = np.load("/content/drive/My Drive/labels.npy")  # Shape: (210,)

# Flatten images to apply SMOTE
X_flattened = X.reshape(X.shape[0], -1)  # Shape: (210, 4*224*224*3)

# Check class distribution before SMOTE
unique, counts = np.unique(y, return_counts=True)
print(f"Before SMOTE - Class distribution: {dict(zip(unique, counts))}")

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_flattened, y)

# Reshape back to original image format
X_resampled = X_resampled.reshape(-1, 4, 224, 224, 3)

# Save balanced dataset
np.save("/content/drive/My Drive/X_resampled.npy", X_resampled)
np.save("/content/drive/My Drive/y_resampled.npy", y_resampled)

# Check class distribution after SMOTE
unique, counts = np.unique(y_resampled, return_counts=True)
print(f"After SMOTE - Class distribution: {dict(zip(unique, counts))}")

print(f"✅ Resampled data shape: {X_resampled.shape}")
print(f"✅ Resampled labels shape: {y_resampled.shape}")

Before SMOTE - Class distribution: {np.int32(0): np.int64(120), np.int32(1): np.int64(90)}
After SMOTE - Class distribution: {np.int32(0): np.int64(120), np.int32(1): np.int64(120)}
✅ Resampled data shape: (240, 4, 224, 224, 3)
✅ Resampled labels shape: (240,)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
import numpy as np

# Load balanced dataset
X_resampled = np.load("/content/drive/My Drive/X_resampled.npy")
y_resampled = np.load("/content/drive/My Drive/y_resampled.npy")

# Flatten images for MLP input
X_resampled = X_resampled.reshape(X_resampled.shape[0], -1)

# Define MLP Model
mlp_model = Sequential([
    Dense(512, activation="relu", input_shape=(X_resampled.shape[1],)),
    Dropout(0.3),
    Dense(256, activation="relu"),
    Dropout(0.3),
    Dense(128, activation="relu"),
    Dense(1, activation="sigmoid")  # Binary classification (Anemia vs. Non-Anemia)
])

# Compile the model
mlp_model.compile(optimizer=Adam(learning_rate=0.0001), loss="binary_crossentropy", metrics=["accuracy"])

# Train the model
mlp_history = mlp_model.fit(X_resampled, y_resampled, epochs=30, batch_size=16, validation_split=0.2)

# Save the trained model
mlp_model.save("/content/drive/My Drive/mlp_model.h5")

print("✅ MLP Model Training Completed!")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 6s/step - accuracy: 0.4777 - loss: 4.7071 - val_accuracy: 0.5208 - val_loss: 3.6592
Epoch 2/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 6s/step - accuracy: 0.6510 - loss: 8.5392 - val_accuracy: 0.3333 - val_loss: 6.7084
Epoch 3/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 6s/step - accuracy: 0.6632 - loss: 6.2539 - val_accuracy: 0.7708 - val_loss: 1.9048
Epoch 4/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 6s/step - accuracy: 0.5798 - loss: 6.7891 - val_accuracy: 0.3750 - val_loss: 7.4418
Epoch 5/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 6s/step - accuracy: 0.6338 - loss: 5.8245 - val_accuracy: 0.6667 - val_loss: 1.7947
Epoch 6/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 6s/step - accuracy: 0.6219 - loss: 4.8914 - val_accuracy: 0.3750 - val_loss: 4.2610
Epoch 7/30
[1m12/12[0m [32m━━━━━━━━━━



✅ MLP Model Training Completed!


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import joblib

# Load balanced dataset
X_resampled = np.load("/content/drive/My Drive/X_resampled.npy")
y_resampled = np.load("/content/drive/My Drive/y_resampled.npy")

# Flatten images for Random Forest
X_resampled = X_resampled.reshape(X_resampled.shape[0], -1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define and train Random Forest model
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)

# Save the trained Random Forest model
joblib.dump(rf_model, "/content/drive/My Drive/random_forest_model.pkl")

print(f"✅ Random Forest Model Training Completed! Accuracy: {rf_accuracy:.4f}")

✅ Random Forest Model Training Completed! Accuracy: 0.6667


In [9]:
import numpy as np
import tensorflow as tf
import joblib
from sklearn.metrics import accuracy_score

# Load saved models
mlp_model = tf.keras.models.load_model("/content/drive/My Drive/mlp_model.h5")
rf_model = joblib.load("/content/drive/My Drive/random_forest_model.pkl")

# Load test dataset
X_resampled = np.load("/content/drive/My Drive/X_resampled.npy")
y_resampled = np.load("/content/drive/My Drive/y_resampled.npy")

# Flatten images for Random Forest
X_flattened = X_resampled.reshape(X_resampled.shape[0], -1)

# Get predictions from MLP model
mlp_preds = mlp_model.predict(X_flattened).flatten()

# Get predictions from Random Forest
rf_preds = rf_model.predict_proba(X_flattened)[:, 1]  # Get probability of class 1 (Anemia)

# Combine predictions (Weighted Average)
final_preds = (0.6 * mlp_preds) + (0.4 * rf_preds)
final_labels = (final_preds > 0.5).astype(int)  # Convert probabilities to binary labels

# Evaluate Ensemble Model
ensemble_accuracy = accuracy_score(y_resampled, final_labels)
print(f"✅ Ensemble Model Accuracy: {ensemble_accuracy:.4f}")



[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 795ms/step
✅ Ensemble Model Accuracy: 0.9542


In [10]:
import numpy as np
import tensorflow as tf
import joblib
from sklearn.metrics import accuracy_score

# Load saved models
mlp_model = tf.keras.models.load_model("/content/drive/My Drive/mlp_model.h5")
rf_model = joblib.load("/content/drive/My Drive/random_forest_model.pkl")

# Load test dataset
X_resampled = np.load("/content/drive/My Drive/X_resampled.npy")
y_resampled = np.load("/content/drive/My Drive/y_resampled.npy")

# Flatten images for Random Forest
X_flattened = X_resampled.reshape(X_resampled.shape[0], -1)

# Get predictions from MLP model
mlp_preds = mlp_model.predict(X_flattened).flatten()

# Get predictions from Random Forest
rf_preds = rf_model.predict_proba(X_flattened)[:, 1]  # Get probability of class 1 (Anemia)

# Combine predictions (Weighted Average)
final_preds = (0.6 * mlp_preds) + (0.4 * rf_preds)
final_labels = (final_preds > 0.5).astype(int)  # Convert probabilities to binary labels

# Save final labels
np.save("/content/drive/My Drive/final_labels.npy", final_labels)

print(f"✅ Final labels saved as 'final_labels.npy'")



[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 516ms/step
✅ Final labels saved as 'final_labels.npy'


In [11]:
import numpy as np
import pandas as pd

# Load final predictions
final_labels = np.load("/content/drive/My Drive/final_labels.npy")

# Count individuals with and without anemia
total_anemic = np.sum(final_labels)
total_non_anemic = len(final_labels) - total_anemic

# Save report as CSV
report_df = pd.DataFrame({"Anemia_Status": final_labels})
report_df.to_csv("/content/drive/My Drive/anemia_detection_report.csv", index=False)

# Print results
print(f"✅ Total Individuals: {len(final_labels)}")
print(f"🔴 Anemic Individuals: {total_anemic}")
print(f"🟢 Non-Anemic Individuals: {total_non_anemic}")
print("📄 Report saved as 'anemia_detection_report.csv'")

✅ Total Individuals: 240
🔴 Anemic Individuals: 121
🟢 Non-Anemic Individuals: 119
📄 Report saved as 'anemia_detection_report.csv'


In [12]:
import numpy as np
import pandas as pd

# Load data
final_labels = np.load("/content/drive/My Drive/final_labels.npy")  # 240 values
labels_df = pd.read_csv("/content/drive/My Drive/processed_labels.csv", dtype={'Number': str})  # 217 individuals

# Ensure both arrays have the same length
final_labels = final_labels[:len(labels_df)]  # Trim extra values

# Add predicted anemia status
labels_df["Predicted_Anemia"] = final_labels

In [13]:
# Count individuals in each category for India and Italy
report = labels_df.groupby(["Country", "Predicted_Anemia"]).size().unstack().fillna(0)
report.columns = ["Non-Anemic", "Anemic"]  # Rename columns for clarity

# Save the detailed report
report.to_csv("/content/drive/My Drive/anemia_country_report.csv")

# Display the report
print("✅ Country-Wise Anemia Distribution:")
print(report)
print("📄 Report saved as 'anemia_country_report.csv'")

✅ Country-Wise Anemia Distribution:
         Non-Anemic  Anemic
Country                    
India            69      26
Italy            50      72
📄 Report saved as 'anemia_country_report.csv'


In [17]:
# Load final predictions and actual labels
final_labels = np.load("/content/drive/My Drive/final_labels.npy")
y_true = np.load("/content/drive/My Drive/y_resampled.npy")  # True labels from SMOTE-balanced dataset
from sklearn.metrics import accuracy_score, f1_score, recall_score # Import f1_score and recall_score

# Calculate metrics
accuracy = accuracy_score(y_true, final_labels)
f1 = f1_score(y_true, final_labels)
sensitivity = recall_score(y_true, final_labels)  # Sensitivity is the same as Recall
error_rate = 1 - accuracy  # Error Rate = 1 - Accuracy

# Print results
print(f"✅ Model Evaluation Metrics:")
print(f"📊 Accuracy: {accuracy:.2f}")
print(f"🎯 F1-Score: {f1:.4f}")
print(f"🔍 Sensitivity (Recall): {sensitivity:.4f}")
print(f"❌ Error Rate: {error_rate:.4f}")

✅ Model Evaluation Metrics:
📊 Accuracy: 0.95
🎯 F1-Score: 0.9544
🔍 Sensitivity (Recall): 0.9583
❌ Error Rate: 0.0458
