In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("rajumavinmar/finger-print-based-blood-group-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/rajumavinmar/finger-print-based-blood-group-dataset?dataset_version_number=1...


100%|██████████| 58.1M/58.1M [00:00<00:00, 184MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/rajumavinmar/finger-print-based-blood-group-dataset/versions/1


# Task
Build a smart biometric system for instant blood group identification using fingerprint images. The system should achieve at least 95% accuracy. Save the trained model so it can be plugged into a full-stack application.

## Data loading and exploration

### Subtask:
Load the dataset and explore its structure, content, and characteristics. This includes understanding the file organization, image formats, and labels.


In [3]:
import os
from PIL import Image
import random

dataset_path = '/root/.cache/kagglehub/datasets/rajumavinmar/finger-print-based-blood-group-dataset/versions/1/dataset_blood_group'

# 1. Inspect the directory structure and count images per class
class_counts = {}
print("Dataset directory structure and image counts per class:")
for root, dirs, files in os.walk(dataset_path):
    if root != dataset_path:
        class_name = os.path.basename(root)
        image_files = [f for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        class_counts[class_name] = len(image_files)
        print(f"Directory: {root}, Class: {class_name}, Number of images: {len(image_files)}")

# 2. Print class distribution
print("\nClass distribution:")
for class_name, count in class_counts.items():
    print(f"{class_name}: {count}")

# 3. Examine a few sample images from different classes
print("\nSample images from different classes:")
for class_name in class_counts.keys():
    class_dir = os.path.join(dataset_path, class_name)
    image_files = [f for f in os.listdir(class_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    if image_files:
        sample_image_name = random.choice(image_files)
        sample_image_path = os.path.join(class_dir, sample_image_name)
        try:
            with Image.open(sample_image_path) as img:
                print(f"  Class: {class_name}, Sample Image: {sample_image_name}, Format: {img.format}, Size: {img.size}, Mode: {img.mode}")
        except Exception as e:
            print(f"  Could not open image {sample_image_name} in class {class_name}: {e}")

Dataset directory structure and image counts per class:
Directory: /root/.cache/kagglehub/datasets/rajumavinmar/finger-print-based-blood-group-dataset/versions/1/dataset_blood_group/B+, Class: B+, Number of images: 0
Directory: /root/.cache/kagglehub/datasets/rajumavinmar/finger-print-based-blood-group-dataset/versions/1/dataset_blood_group/O+, Class: O+, Number of images: 0
Directory: /root/.cache/kagglehub/datasets/rajumavinmar/finger-print-based-blood-group-dataset/versions/1/dataset_blood_group/A+, Class: A+, Number of images: 0
Directory: /root/.cache/kagglehub/datasets/rajumavinmar/finger-print-based-blood-group-dataset/versions/1/dataset_blood_group/B-, Class: B-, Number of images: 0
Directory: /root/.cache/kagglehub/datasets/rajumavinmar/finger-print-based-blood-group-dataset/versions/1/dataset_blood_group/AB-, Class: AB-, Number of images: 0
Directory: /root/.cache/kagglehub/datasets/rajumavinmar/finger-print-based-blood-group-dataset/versions/1/dataset_blood_group/A-, Class: 

In [4]:
import os

dataset_path = '/root/.cache/kagglehub/datasets/rajumavinmar/finger-print-based-blood-group-dataset/versions/1/dataset_blood_group'

print(f"Listing contents of: {dataset_path}")
try:
    for item in os.listdir(dataset_path):
        item_path = os.path.join(dataset_path, item)
        if os.path.isdir(item_path):
            print(f"  Directory: {item}")
            # Optionally list contents of subdirectories if needed
            # for sub_item in os.listdir(item_path):
            #     print(f"    - {sub_item}")
        else:
            print(f"  File: {item}")
except FileNotFoundError:
    print(f"Error: Directory not found at {dataset_path}")


Listing contents of: /root/.cache/kagglehub/datasets/rajumavinmar/finger-print-based-blood-group-dataset/versions/1/dataset_blood_group
  Directory: B+
  Directory: O+
  Directory: A+
  Directory: B-
  Directory: AB-
  Directory: A-
  Directory: O-
  Directory: AB+


**Reasoning**:
The listing of the `dataset_blood_group` directory shows that the class directories (B+, O+, etc.) exist, but the previous attempt to count images within them resulted in zero counts and a FileNotFoundError when trying to sample images. This suggests the image files are not directly inside these top-level class directories. I need to modify the image counting and sampling logic to search for image files within these class directories and any potential subdirectories they might contain.



In [5]:
import os
from PIL import Image
import random

dataset_path = '/root/.cache/kagglehub/datasets/rajumavinmar/finger-print-based-blood-group-dataset/versions/1/dataset_blood_group'

# 1. Inspect the directory structure and count images per class by walking through subdirectories
class_counts = {}
print("Dataset directory structure and image counts per class:")
for root, dirs, files in os.walk(dataset_path):
    class_name = os.path.basename(root)
    image_files = [f for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

    # Only count files in the actual class directories, not the root dataset_blood_group dir
    if root != dataset_path:
        if class_name not in class_counts:
            class_counts[class_name] = 0
        class_counts[class_name] += len(image_files)
        if len(image_files) > 0:
             print(f"Directory: {root}, Class: {class_name}, Number of images: {len(image_files)}")


# 2. Print class distribution
print("\nClass distribution:")
for class_name, count in class_counts.items():
    print(f"{class_name}: {count}")

# 3. Examine a few sample images from different classes
print("\nSample images from different classes:")
for class_name, count in class_counts.items():
    if count > 0:
        # Find all image files within the class directory and its subdirectories
        class_images = []
        class_dir = os.path.join(dataset_path, class_name)
        for root, dirs, files in os.walk(class_dir):
             for f in files:
                 if f.lower().endswith(('.png', '.jpg', '.jpeg')):
                     class_images.append(os.path.join(root, f))

        if class_images:
            sample_image_path = random.choice(class_images)
            try:
                with Image.open(sample_image_path) as img:
                    print(f"  Class: {class_name}, Sample Image: {os.path.basename(sample_image_path)}, Format: {img.format}, Size: {img.size}, Mode: {img.mode}")
            except Exception as e:
                print(f"  Could not open image {os.path.basename(sample_image_path)} in class {class_name}: {e}")

Dataset directory structure and image counts per class:

Class distribution:
B+: 0
O+: 0
A+: 0
B-: 0
AB-: 0
A-: 0
O-: 0
AB+: 0

Sample images from different classes:


In [6]:
import os
from PIL import Image
import random

dataset_path = '/root/.cache/kagglehub/datasets/rajumavinmar/finger-print-based-blood-group-dataset/versions/1/dataset_blood_group'

# Dictionary to store image paths per class
class_images = {}

print("Searching for image files and counting per class...")

# Walk through all directories and files within the dataset_blood_group
for root, dirs, files in os.walk(dataset_path):
    # Determine the class name based on the directory path
    # We assume the immediate subdirectory of dataset_blood_group is the class name
    relative_path = os.path.relpath(root, dataset_path)
    if relative_path != '.': # Exclude the root dataset_blood_group directory itself
        class_name = relative_path.split(os.sep)[0]

        if class_name not in class_images:
            class_images[class_name] = []

        for file in files:
            # Check for common image extensions
            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
                file_path = os.path.join(root, file)
                class_images[class_name].append(file_path)


# 1. Print image counts per class
print("\nImage counts per class:")
class_counts = {class_name: len(images) for class_name, images in class_images.items()}
for class_name, count in class_counts.items():
    print(f"Class: {class_name}, Number of images: {count}")

# 2. Print class distribution
print("\nClass distribution:")
for class_name, count in class_counts.items():
    print(f"{class_name}: {count}")


# 3. Examine a few sample images from different classes
print("\nSample images from different classes:")
for class_name, images in class_images.items():
    if images:
        sample_image_path = random.choice(images)
        try:
            with Image.open(sample_image_path) as img:
                print(f"  Class: {class_name}, Sample Image: {os.path.basename(sample_image_path)}, Format: {img.format}, Size: {img.size}, Mode: {img.mode}")
        except Exception as e:
            print(f"  Could not open image {os.path.basename(sample_image_path)} in class {class_name}: {e}")

# 4. Note any potential issues (based on counts and sample inspection)
print("\nPotential issues noted during inspection:")
# Check for classes with zero images
for class_name, count in class_counts.items():
    if count == 0:
        print(f"- Class '{class_name}' has no images.")

# Note: Further issues like inconsistent sizes, formats, or quality would be observed during manual inspection of more samples or through automated analysis.

Searching for image files and counting per class...

Image counts per class:
Class: B+, Number of images: 652
Class: O+, Number of images: 852
Class: A+, Number of images: 565
Class: B-, Number of images: 741
Class: AB-, Number of images: 761
Class: A-, Number of images: 1009
Class: O-, Number of images: 712
Class: AB+, Number of images: 708

Class distribution:
B+: 652
O+: 852
A+: 565
B-: 741
AB-: 761
A-: 1009
O-: 712
AB+: 708

Sample images from different classes:
  Class: B+, Sample Image: cluster_2_5746.BMP, Format: BMP, Size: (96, 103), Mode: RGBA
  Class: O+, Sample Image: cluster_6_3536.BMP, Format: BMP, Size: (96, 103), Mode: RGBA
  Class: A+, Sample Image: cluster_0_3044.BMP, Format: BMP, Size: (96, 103), Mode: RGBA
  Class: B-, Sample Image: cluster_3_1886.BMP, Format: BMP, Size: (96, 103), Mode: RGBA
  Class: AB-, Sample Image: cluster_5_4600.BMP, Format: BMP, Size: (96, 103), Mode: RGBA
  Class: A-, Sample Image: cluster_1_3749.BMP, Format: BMP, Size: (96, 103), Mode: RGBA


## Data preprocessing

### Subtask:
Preprocess the fingerprint images. This may involve resizing, normalization, and potentially data augmentation to increase the size and diversity of the training set.


In [22]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# 1. Define the target image size
TARGET_SIZE = (128, 128)

preprocessed_images = []
labels = []
label_map = {class_name: i for i, class_name in enumerate(class_images.keys())}

# 2. Iterate through the class_images dictionary
for class_name, image_paths in class_images.items():
    print(f"Preprocessing images for class: {class_name}")
    for image_path in image_paths:
        try:
            # a. Load each image using PIL
            with Image.open(image_path) as img:
                # b. Convert the image to RGB format
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                # c. Resize the image to the target size
                img = img.resize(TARGET_SIZE)

                # Convert image to numpy array
                img_array = np.array(img)

                # d. Normalize the pixel values
                img_array = img_array / 255.0

                # e. Store the preprocessed image data and its corresponding class label
                preprocessed_images.append(img_array)
                labels.append(label_map[class_name])

        except Exception as e:
            print(f"Could not process image {os.path.basename(image_path)}: {e}")

# 3. Convert lists to NumPy arrays
preprocessed_images = np.array(preprocessed_images)
labels = np.array(labels)

print(f"\nShape of preprocessed_images: {preprocessed_images.shape}")
print(f"Shape of labels: {labels.shape}")

# 4. Split the preprocessed data into training, validation, and test sets (Stratified Split)
# First split: train and temp (validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(
    preprocessed_images, labels, test_size=0.2, random_state=42, stratify=labels
)

# Second split: validation and test from temp
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"\nShape of X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Shape of X_val: {X_val.shape}, y_val: {y_val.shape}")
print(f"Shape of X_test: {X_test.shape}, y_test: {y_test.shape}")


Preprocessing images for class: B+
Preprocessing images for class: O+
Preprocessing images for class: A+
Preprocessing images for class: B-
Preprocessing images for class: AB-
Preprocessing images for class: A-
Preprocessing images for class: O-
Preprocessing images for class: AB+

Shape of preprocessed_images: (6000, 128, 128, 3)
Shape of labels: (6000,)

Shape of X_train: (4800, 128, 128, 3), y_train: (4800,)
Shape of X_val: (600, 128, 128, 3), y_val: (600,)
Shape of X_test: (600, 128, 128, 3), y_test: (600,)


## Model selection and building

### Subtask:
Choose a suitable deep learning model architecture for image classification. Since fingerprint images are involved, a Convolutional Neural Network (CNN) based model would be a good starting point.


In [9]:
%pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.9.23-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow)
  Downloading wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard~=2.20.0->tensorflow)
  Downloading tensorboard_data_server-0.

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# 2. Define a sequential model
model = Sequential()

# 3. Add convolutional layers
# First convolutional layer
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(TARGET_SIZE[0], TARGET_SIZE[1], 3)))
# 4. Add pooling layers
model.add(MaxPooling2D((2, 2)))

# Second convolutional layer
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))

# Third convolutional layer
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))

# 5. Flatten the output of the convolutional layers
model.add(Flatten())

# 6. Add dense (fully connected) layers
model.add(Dense(128, activation='relu'))

# 7. Add the output layer
num_classes = len(class_counts)
model.add(Dense(num_classes, activation='softmax'))

# 8. Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Print the model summary
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## Model training

### Subtask:
Train the selected model on the preprocessed dataset. This involves splitting the data into training and validation sets, defining the loss function and optimizer, and training the model for a sufficient number of epochs.


In [11]:
from tensorflow.keras.utils import to_categorical

# 1. One-hot encode the training, validation, and testing labels
y_train_encoded = to_categorical(y_train, num_classes=num_classes)
y_val_encoded = to_categorical(y_val, num_classes=num_classes)
y_test_encoded = to_categorical(y_test, num_classes=num_classes) # Although not used in training, it's good practice to encode it now.

# 2. Train the compiled model
epochs = 10
batch_size = 32

history = model.fit(
    X_train, y_train_encoded,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_val, y_val_encoded)
)

Epoch 1/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 356ms/step - accuracy: 0.4416 - loss: 1.4788 - val_accuracy: 0.8050 - val_loss: 0.4699
Epoch 2/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 351ms/step - accuracy: 0.8316 - loss: 0.4364 - val_accuracy: 0.9000 - val_loss: 0.3035
Epoch 3/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 350ms/step - accuracy: 0.8813 - loss: 0.3108 - val_accuracy: 0.8067 - val_loss: 0.5544
Epoch 4/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 350ms/step - accuracy: 0.8875 - loss: 0.2950 - val_accuracy: 0.8933 - val_loss: 0.2784
Epoch 5/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 348ms/step - accuracy: 0.9191 - loss: 0.2160 - val_accuracy: 0.9133 - val_loss: 0.2206
Epoch 6/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 348ms/step - accuracy: 0.9172 - loss: 0.2192 - val_accuracy: 0.9067 - val_loss: 0.2519
Epoch 7/10

## Model Optimization

### Subtask:
If the desired accuracy is not achieved, optimize the model by tuning hyperparameters, trying different model architectures, or incorporating advanced techniques like transfer learning.

In [19]:
# 1. Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test_encoded, verbose=0)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# 2. (Optional) Further evaluation metrics: Precision, Recall, F1-score
# To calculate these, we need to get the model's predictions
from sklearn.metrics import classification_report
import numpy as np

y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test_encoded, axis=1)

print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=list(label_map.keys())))

Test Loss: 0.2386
Test Accuracy: 0.9233
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 85ms/step

Classification Report:
              precision    recall  f1-score   support

          B+       0.95      0.91      0.93        65
          O+       0.97      0.86      0.91        86
          A+       0.98      0.89      0.93        56
          B-       0.93      1.00      0.96        74
         AB-       0.95      0.92      0.93        76
          A-       0.86      0.94      0.90       101
          O-       0.84      0.90      0.87        71
         AB+       0.96      0.96      0.96        71

    accuracy                           0.92       600
   macro avg       0.93      0.92      0.93       600
weighted avg       0.93      0.92      0.92       600



In [26]:
# 1. Define the path and filename for saving the model
model_save_path = "blood_group_cnn_model.keras"

# 2. Save the trained model in the native Keras format
model.save(model_save_path)

print(f"Model saved successfully to: {model_save_path}")

Model saved successfully to: blood_group_cnn_model.keras


In [27]:
from tensorflow.keras.models import load_model
from PIL import Image
import numpy as np
import io
from google.colab import files

# Load the saved model
model_save_path = "blood_group_cnn_model.keras"
loaded_model = load_model(model_save_path)

# Define the target image size (should be the same as used during training)
TARGET_SIZE = (128, 128)

# Define the label map (should be the same as used during training)
# You might need to recreate this based on the order of classes in your training data
# Assuming the order is the same as in the previous steps:
label_map = {'B+': 0, 'O+': 1, 'A+': 2, 'B-': 3, 'AB-': 4, 'A-': 5, 'O-': 6, 'AB+': 7}
# Create a reverse map for displaying results
reverse_label_map = {i: class_name for class_name, i in label_map.items()}


# Function to preprocess a single image from bytes
def preprocess_single_image_from_bytes(image_bytes, target_size):
    try:
        img = Image.open(io.BytesIO(image_bytes))
        if img.mode != 'RGB':
            img = img.convert('RGB')
        img = img.resize(target_size)
        img_array = np.array(img)
        img_array = img_array / 255.0
        # Add batch dimension
        img_array = np.expand_dims(img_array, axis=0)
        return img_array
    except Exception as e:
        print(f"Could not process image from bytes: {e}")
        return None

# Use files.upload() to get user input
uploaded = files.upload()

for filename, file_bytes in uploaded.items():
    print(f'User uploaded file "{filename}"')

    # Preprocess the uploaded image
    preprocessed_input_image = preprocess_single_image_from_bytes(file_bytes, TARGET_SIZE)

    if preprocessed_input_image is not None:
        # Make a prediction
        predictions = loaded_model.predict(preprocessed_input_image)

        # Get the predicted class
        predicted_class_index = np.argmax(predictions, axis=1)[0]
        predicted_blood_group = reverse_label_map[predicted_class_index]
        confidence = np.max(predictions) * 100

        print(f"\nPredicted Blood Group for {filename}: {predicted_blood_group}")
        print(f"Confidence: {confidence:.2f}%")
    else:
        print(f"Image preprocessing failed for {filename}.")

  saveable.load_own_variables(weights_store.get(inner_path))




Saving cluster_6_53.BMP to cluster_6_53.BMP
User uploaded file "cluster_6_53.BMP"
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step

Predicted Blood Group for cluster_6_53.BMP: O+
Confidence: 99.97%
