In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from sklearn.metrics import classification_report
import numpy as np
import tensorflow_datasets as tfds

In [2]:
# Load the Cats vs Dogs dataset from TensorFlow Datasets
def load_cats_vs_dogs_dataset():
    # Load the dataset with tfds
    (train_ds, test_ds), ds_info = tfds.load('cats_vs_dogs', split=['train[:80%]', 'train[80%:]'], with_info=True, as_supervised=True)

    # Normalize the images to [0, 1] range and resize to 28x28
    def preprocess_image(image, label):
        image = tf.image.resize(image, [28, 28])
        image = tf.image.rgb_to_grayscale(image)
        image = tf.cast(image, tf.float32) / 255.0
        return image, label

    train_ds = train_ds.map(preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    test_ds = test_ds.map(preprocess_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Convert the datasets to numpy arrays
    train_images = []
    train_labels = []
    for img, lbl in tfds.as_numpy(train_ds):
        train_images.append(img)
        train_labels.append(lbl)

    test_images = []
    test_labels = []
    for img, lbl in tfds.as_numpy(test_ds):
        test_images.append(img)
        test_labels.append(lbl)

    return (np.array(train_images), np.array(train_labels)), (np.array(test_images), np.array(test_labels))


In [3]:
# Load the binary image dataset
(x_train, y_train), (x_test, y_test) = load_cats_vs_dogs_dataset()

Downloading and preparing dataset 786.67 MiB (download: 786.67 MiB, generated: 1.04 GiB, total: 1.81 GiB) to /root/tensorflow_datasets/cats_vs_dogs/4.0.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/23262 [00:00<?, ? examples/s]



Shuffling /root/tensorflow_datasets/cats_vs_dogs/incomplete.HNB29R_4.0.1/cats_vs_dogs-train.tfrecord*...:   0%…

Dataset cats_vs_dogs downloaded and prepared to /root/tensorflow_datasets/cats_vs_dogs/4.0.1. Subsequent calls will reuse this data.


In [4]:
# Introduce bias by reducing the number of cat samples (class 0) in the training set
cat_indices = np.where(y_train == 0)[0]
dog_indices = np.where(y_train == 1)[0]

In [5]:
# Keep only 10% of the cat samples
cat_indices_biased = np.random.choice(cat_indices, size=int(len(cat_indices) * 0.1), replace=False)
biased_indices = np.concatenate([cat_indices_biased, dog_indices])

x_train_biased = x_train[biased_indices]
y_train_biased = y_train[biased_indices]


In [6]:
# Combine images and labels
all_images = np.concatenate([x_train_biased, x_test])
all_labels = np.concatenate([y_train_biased, y_test])

In [7]:
# Create data generators for training and validation
train_datagen = ImageDataGenerator(rescale=1./255, shear_range=0.2, zoom_range=0.2, horizontal_flip=True)
validation_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow(x_train_biased, y_train_biased, batch_size=32, shuffle=True)
validation_generator = validation_datagen.flow(all_images, all_labels, batch_size=32)

In [8]:
# Build a simple neural network model for binary classification
model = Sequential([
    Flatten(input_shape=(28, 28, 1)),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [9]:
# Train the model
model.fit(train_generator, epochs=5, validation_data=validation_generator)

# Evaluate the model
test_loss, test_acc = model.evaluate(all_images, all_labels)
print(f'Test accuracy: {test_acc}')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.7829431295394897


In [10]:
# Get predictions
predictions = model.predict(all_images)
pred_labels = (predictions > 0.5).astype(int).flatten()



In [11]:
# Generate classification report
report = classification_report(all_labels, pred_labels, target_names=['cat', 'dog'])

print("Classification report:")
print(report)

Classification report:
              precision    recall  f1-score   support

         cat       0.00      0.00      0.00      3217
         dog       0.78      1.00      0.88     11604

    accuracy                           0.78     14821
   macro avg       0.39      0.50      0.44     14821
weighted avg       0.61      0.78      0.69     14821



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Unbiasing using SMOTE Technique**

Reference link: https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/

In [13]:
# Install the imbalanced-learn library
!pip install -q -U imbalanced-learn

from imblearn.over_sampling import SMOTE

In [14]:
# Flatten the images to apply SMOTE
n_samples, height, width, channels = x_train_biased.shape
x_train_biased_flat = x_train_biased.reshape(n_samples, height * width * channels)

# Apply SMOTE to generate synthetic samples for the minority class
smote = SMOTE(random_state=42)
x_train_resampled_flat, y_train_resampled = smote.fit_resample(x_train_biased_flat, y_train_biased)

In [15]:
# Reshape the flattened images back to the original shape
x_train_resampled = x_train_resampled_flat.reshape(-1, height, width, channels)

# Combine the resampled training data with the test data
all_images_resampled = np.concatenate([x_train_resampled, x_test])
all_labels_resampled = np.concatenate([y_train_resampled, y_test])

In [16]:
# Create data generators for training and validation with the balanced dataset
train_generator_resampled = train_datagen.flow(x_train_resampled, y_train_resampled, batch_size=32, shuffle=True)
validation_generator_resampled = validation_datagen.flow(all_images_resampled, all_labels_resampled, batch_size=32)

In [17]:
# Build a simple neural network model for binary classification
model_resampled = Sequential([
    Flatten(input_shape=(28, 28, 1)),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_resampled.compile(optimizer='adam',
                        loss='binary_crossentropy',
                        metrics=['accuracy'])

In [18]:
# Train the model with the balanced dataset
model_resampled.fit(train_generator_resampled, epochs=5, validation_data=validation_generator_resampled)

# Evaluate the model with the balanced dataset
test_loss_resampled, test_acc_resampled = model_resampled.evaluate(all_images_resampled, all_labels_resampled)
print(f'Test accuracy with resampled data: {test_acc_resampled}')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy with resampled data: 0.5110313296318054


In [19]:
# Get predictions
predictions_resampled = model_resampled.predict(all_images_resampled)
pred_labels_resampled = (predictions_resampled > 0.5).astype(int).flatten()

# Generate classification report
report_resampled = classification_report(all_labels_resampled, pred_labels_resampled, target_names=['cat', 'dog'])

print("Classification report with resampled data:")
print(report_resampled)

Classification report with resampled data:
              precision    recall  f1-score   support

         cat       0.64      0.04      0.08     11512
         dog       0.51      0.98      0.67     11604

    accuracy                           0.51     23116
   macro avg       0.58      0.51      0.37     23116
weighted avg       0.57      0.51      0.37     23116

