In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/photos/BloodImage_00088.jpg
/kaggle/input/photos/BloodImage_00403.jpg
/kaggle/input/photos/BloodImage_00092.jpg
/kaggle/input/photos/BloodImage_00147.jpg
/kaggle/input/photos/BloodImage_00097.jpg
/kaggle/input/photos/BloodImage_00195.jpg
/kaggle/input/photos/BloodImage_00094.jpg
/kaggle/input/photos/BloodImage_00283.jpg
/kaggle/input/photos/BloodImage_00148.jpg
/kaggle/input/photos/BloodImage_00123.jpg
/kaggle/input/photos/BloodImage_00022.jpg
/kaggle/input/photos/BloodImage_00109.jpg
/kaggle/input/photos/BloodImage_00350.jpg
/kaggle/input/photos/BloodImage_00176.jpg
/kaggle/input/photos/BloodImage_00044.jpg
/kaggle/input/photos/BloodImage_00133.jpg
/kaggle/input/photos/BloodImage_00087.jpg
/kaggle/input/photos/BloodImage_00290.jpg
/kaggle/input/photos/BloodImage_00071.jpg
/kaggle/input/photos/BloodImage_00243.jpg
/kaggle/input/photos/BloodImage_00024.jpg
/kaggle/input/photos/BloodImage_00091.jpg
/kaggle/input/photos/BloodImage_00001.jpg
/kaggle/input/photos/BloodImage_00

In [28]:
import os
import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [29]:
def create_cnn_model(input_shape):
    """Build the convolutional feature extractor used by this notebook.

    The network stacks four Conv2D (3x3, ReLU) + MaxPooling2D (2x2) stages
    with 32, 64, 128 and 256 filters, flattens the result, and applies Dense
    layers of 512, 128 and 64 ReLU units. The 64-unit layer is the model
    output.

    Args:
        input_shape: Shape of one input sample, forwarded to
            ``tf.keras.Input(shape=...)``.

    Returns:
        A ``tf.keras.models.Sequential`` model. It is neither compiled nor
        trained inside this function.
    """
    model = tf.keras.models.Sequential([
        # Declare the input with an explicit Input object rather than passing
        # input_shape= to the first layer, which Keras warns about when used
        # inside a Sequential model.
        tf.keras.Input(shape=input_shape),
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(256, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(512, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
    ])
    return model

In [30]:
def load_and_preprocess_images(image_paths, image_size=(128, 128)):
    """Read every path with OpenCV and resize the result to ``image_size``.

    Paths for which ``cv2.imread`` returns ``None`` are reported with a
    printed warning and skipped. When that happens the returned array is
    shorter than ``image_paths``, so positions in the output no longer line
    up one-to-one with the input list.

    Args:
        image_paths: Iterable of image file paths.
        image_size: Target size passed straight to ``cv2.resize``.

    Returns:
        ``np.ndarray`` built from the list of resized images, in input order.
    """
    resized_batch = []
    for image_path in image_paths:
        raw = cv2.imread(image_path)
        if raw is None:
            print(f"Warning: Could not load image at {image_path}")
            continue
        resized_batch.append(cv2.resize(raw, image_size))
    return np.array(resized_batch)

# Feature extraction: scale pixels, then run them through the given model
def extract_custom_features(model, images):
    """Return ``model.predict`` evaluated on ``images`` divided by 255.0.

    Args:
        model: Object exposing a ``predict`` method.
        images: Array of images; it is divided by 255.0 into a new array, so
            the caller's array is not modified.

    Returns:
        Whatever ``model.predict`` returns for the scaled images.
    """
    scaled = images / 255.0
    return model.predict(scaled)

In [31]:
def augment_data(images, labels, augmentations_per_image=5):
    """Generate randomly transformed copies of each image with its label.

    Every (image, label) pair is fed through an ``ImageDataGenerator``
    configured with rotation, width/height shift, shear, zoom and horizontal
    flip. ``augmentations_per_image`` transformed samples are collected per
    input image, each paired with the original label.

    ``images`` and ``labels`` are paired with ``zip``, so if their lengths
    differ the extra items of the longer one are ignored.

    Args:
        images: Iterable of image arrays.
        labels: Iterable of labels, one per image.
        augmentations_per_image: Number of augmented samples to draw for each
            input image. Defaults to 5. Values <= 0 produce no samples.

    Returns:
        Tuple ``(augmented_images, augmented_labels)`` of ``np.ndarray``.
    """
    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )
    augmented_images = []
    augmented_labels = []
    for image, label in zip(images, labels):
        # flow() is given a batch, so add a leading batch axis of size 1.
        single_image_batch = image.reshape((1,) + image.shape)
        batches = datagen.flow(single_image_batch, batch_size=1)
        # The generator is consumed in an open-ended loop; pairing it with
        # range() (listed first) stops after the requested number of draws
        # without pulling an extra batch.
        for _, batch in zip(range(augmentations_per_image), batches):
            augmented_images.append(batch[0])
            augmented_labels.append(label)
    return np.array(augmented_images), np.array(augmented_labels)

In [32]:
# Training metadata, with the row(s) for image id 329 filtered out.
# NOTE(review): the reason 329 is excluded is not recorded here — confirm.
train_df = (
    pd.read_csv('/kaggle/input/blood-cell-images-based-classification/train.csv')
    .loc[lambda frame: frame['Image'] != 329]
)

# Test metadata and the directory that holds the image files.
test_df = pd.read_csv('/kaggle/input/blood-cell-images-based-classification/test.csv')
image_dir = '/kaggle/input/photos'

In [33]:
# File names follow BloodImage_<id left-padded with zeros to 5 chars>.jpg
train_image_paths = [
    os.path.join(image_dir, 'BloodImage_' + str(image_id).zfill(5) + '.jpg')
    for image_id in train_df['Image']
]
test_image_paths = [
    os.path.join(image_dir, 'BloodImage_' + str(image_id).zfill(5) + '.jpg')
    for image_id in test_df['Image']
]

# Read and resize the images for both splits
X_train_images = load_and_preprocess_images(train_image_paths)
X_test_images = load_and_preprocess_images(test_image_paths)

In [34]:
# The CNN below is used exactly as built: no compile/fit call happens between
# creating it and calling predict, so the features depend entirely on the
# initial random weights. Seed Python, NumPy and TensorFlow first so those
# weights (and the features and results derived from them) repeat across runs.
tf.keras.utils.set_random_seed(42)

input_shape = (128, 128, 3)
cnn_model = create_cnn_model(input_shape)

# Extract deep features using the custom CNN model
X_train_features = extract_custom_features(cnn_model, X_train_images)
X_test_features = extract_custom_features(cnn_model, X_test_images)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 188ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step


In [36]:
# Encode the category strings as integer class ids
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['Category'])

# load_and_preprocess_images skips files it cannot read, which would shift
# every later image against its label. augment_data pairs images and labels
# with zip and would hide such a mismatch by truncating, so fail loudly here.
assert len(X_train_images) == len(y_train), (
    f"{len(X_train_images)} training images loaded but {len(y_train)} labels present; "
    "at least one training image failed to load"
)
assert len(X_train_features) == len(y_train), (
    f"{len(X_train_features)} feature rows but {len(y_train)} labels present"
)

X_train_augmented, y_train_augmented = augment_data(X_train_images, y_train)

# Extract features from augmented data
X_train_features_augmented = extract_custom_features(cnn_model, X_train_augmented)

# Combine original and augmented features
X_train_combined = np.vstack((X_train_features, X_train_features_augmented))
y_train_combined = np.concatenate((y_train, y_train_augmented))

# Split the data into training and validation sets
# NOTE(review): this split covers only the original-image features, and
# y_train is rebound from "all labels" to "training-split labels" here.
# Every row of X_val is also a row of X_train_combined, so X_val is not a
# leak-free hold-out for any model fit on X_train_combined.
X_train, X_val, y_train, y_val = train_test_split(X_train_features, y_train, test_size=0.2, random_state=42, stratify=y_train)

# SMOTE is applied in the next cell.

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 189ms/step


In [42]:
# Resample the combined (original + augmented) features with SMOTE, fixed seed
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_combined,
    y_train_combined,
)

# Hyper-parameter grid for the XGBoost search
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)



In [44]:
# Hold out the validation rows BEFORE the search. Fitting on all of
# X_train_resampled and then carving a "validation" split out of that same
# matrix would score the model on rows it was trained on.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_resampled,
    y_train_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_train_resampled,
)

# Hyper-parameter search on the training part only
xgb = XGBClassifier(random_state=42)
grid_search = GridSearchCV(xgb, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Predict on validation set (rows the selected model has not been fit on)
# NOTE(review): X_train_resampled comes from SMOTE over original + augmented
# features, so near-duplicates of a validation row can still sit in the
# training part; treat this score as an optimistic estimate.
val_pred_xgb = grid_search.best_estimator_.predict(X_val)
print("Optimized XGBoost Classification Report (Validation):")
print(classification_report(y_val, val_pred_xgb, target_names=label_encoder.classes_))
print("Optimized XGBoost Accuracy (Validation):", accuracy_score(y_val, val_pred_xgb))

# Final model for test-set predictions: same hyper-parameters, refit on all
# resampled rows so no training data is left unused.
best_xgb = XGBClassifier(random_state=42, **grid_search.best_params_)
best_xgb.fit(X_train_resampled, y_train_resampled)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   2.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   2.7s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   4.5s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time=   6.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0; total time=   6.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0; total time=  12.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=7, n_estimators=100, subsample=0.8; total time=  13.4s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=7, n_estimators=100, subsample=1.0; total time=  14.0s
[CV] END c

In [46]:
# Predict encoded class ids for the test-set feature matrix with the tuned model
test_pred_xgb = best_xgb.predict(X_test_features)

In [47]:
# Map encoded predictions back to category names and pair them with image ids
submission_df = pd.DataFrame(
    {
        'Image': test_df['Image'],
        'Category': label_encoder.inverse_transform(test_pred_xgb),
    }
)

# Write the submission file without the DataFrame index column
submission_df.to_csv(
    '/kaggle/working/submission_xgb_optimized.csv',
    index=False,
)