In [2]:
#download the data using the below code into your local setup

import requests
from tqdm import tqdm

url = 'http://www.inf.ufpr.br/vri/databases/BreaKHis_v1.tar.gz'
file_path = 'BreaKHis_v1.tar.gz'

response = requests.get(url, stream=True)

if response.status_code == 200:


  total_size = int(response.headers.get('content-length', 0))
  progress_bar = tqdm(total=total_size, unit='B', unit_scale=True)

  with open(file_path, 'wb') as file:
      for chunk in response.iter_content(chunk_size=1024):
          if chunk:
              file.write(chunk)
              progress_bar.update(len(chunk))

  progress_bar.close()
  print('File downloaded successfully.')
else:

  print('Failed to download the file.')


100%|██████████| 4.27G/4.27G [04:05<00:00, 17.4MB/s]

File downloaded successfully.





In [2]:
# get all the 400x png files into a folder
import tarfile
import os
import shutil
import io

# Define the tar file
tar_file_path = 'BreaKHis_v1.tar.gz'

# Define the directory to store the extracted files
output_dir = 'extracted_images_400x'
os.makedirs(output_dir, exist_ok=True)

# Open the tar file
tar = tarfile.open(tar_file_path)

# Iterate over each member in the tar file
for member in tar.getmembers():
    if '400X' in member.name and member.name.endswith('.png'):
        # Use extractfile() to get a file-like object for the file data
        file_data = tar.extractfile(member)

        if file_data is not None:
            # Get the base filename (without directories)
            base_filename = os.path.basename(member.name)
            # Define the target file path in the output_dir
            target_path = os.path.join(output_dir, base_filename)

            # Write the file data to the new file
            with open(target_path, 'wb') as f:
                shutil.copyfileobj(file_data, f)

# Close the tar file
tar.close()


In [3]:
!ls extracted_images_400x | head -n 20


SOB_B_A-14-22549AB-400-001.png
SOB_B_A-14-22549AB-400-002.png
SOB_B_A-14-22549AB-400-003.png
SOB_B_A-14-22549AB-400-004.png
SOB_B_A-14-22549AB-400-005.png
SOB_B_A-14-22549AB-400-006.png
SOB_B_A-14-22549AB-400-007.png
SOB_B_A-14-22549AB-400-008.png
SOB_B_A-14-22549AB-400-009.png
SOB_B_A-14-22549AB-400-010.png
SOB_B_A-14-22549AB-400-011.png
SOB_B_A-14-22549AB-400-012.png
SOB_B_A-14-22549AB-400-013.png
SOB_B_A-14-22549AB-400-014.png
SOB_B_A-14-22549AB-400-015.png
SOB_B_A-14-22549AB-400-016.png
SOB_B_A-14-22549AB-400-017.png
SOB_B_A-14-22549AB-400-018.png
SOB_B_A-14-22549AB-400-019.png
SOB_B_A-14-22549AB-400-020.png


In [4]:
#creating a dataframe that has the patient id, file path, type

import os
import pandas as pd

# Define the directory where the files are stored
directory = 'extracted_images_400x'

# Get the list of file names
filenames = os.listdir(directory)

# Create a DataFrame from the file names
df = pd.DataFrame(filenames, columns=['filename'])

# Extract the type (M or B), patient ID and image ID from the filename
df['type'] = df['filename'].str.split('_', expand=True)[1]
df['patient_ID'] = df['filename'].str.split('-', expand=True)[2]

# Display the DataFrame
df


Unnamed: 0,filename,type,patient_ID
0,SOB_B_F-14-25197-400-035.png,B,25197
1,SOB_M_MC-14-18842-400-008.png,M,18842
2,SOB_M_LC-14-12204-400-030.png,M,12204
3,SOB_B_TA-14-3411F-400-008.png,B,3411F
4,SOB_M_DC-14-18650-400-002.png,M,18650
...,...,...,...
1815,SOB_M_DC-14-16716-400-004.png,M,16716
1816,SOB_B_F-14-9133-400-018.png,B,9133
1817,SOB_M_LC-14-15570C-400-002.png,M,15570C
1818,SOB_M_DC-14-18650-400-001.png,M,18650


In [5]:
import numpy as np

# Get a list of unique patient IDs for each type
malignant_ids = df[df['type'] == 'M']['patient_ID'].unique()
benign_ids = df[df['type'] == 'B']['patient_ID'].unique()

# Function to split patient IDs into train, validation, and test
def split_ids(ids):
    np.random.shuffle(ids)
    train_size = int(len(ids) * 0.8)
    val_size = int(len(ids) * 0.1)
    train_ids = ids[:train_size]
    val_ids = ids[train_size:train_size + val_size]
    test_ids = ids[train_size + val_size:]
    return train_ids, val_ids, test_ids

# Split patient IDs for each type
train_m_ids, val_m_ids, test_m_ids = split_ids(malignant_ids)
train_b_ids, val_b_ids, test_b_ids = split_ids(benign_ids)

# Concatenate train, validation, and test patient IDs
train_patient_ids = np.concatenate([train_m_ids, train_b_ids])
val_patient_ids = np.concatenate([val_m_ids, val_b_ids])
test_patient_ids = np.concatenate([test_m_ids, test_b_ids])

# Get corresponding dataframes
train_df = df[df['patient_ID'].isin(train_patient_ids)]
val_df = df[df['patient_ID'].isin(val_patient_ids)]
test_df = df[df['patient_ID'].isin(test_patient_ids)]


In [6]:
print('file name is', 'train_df')
for column in train_df.columns:
    print(column, train_df[column].nunique())


print('file name is', 'val_df')
for column in val_df.columns:
    print(column, val_df[column].nunique())


print('file name is', 'test_df')
for column in test_df.columns:
    print(column, test_df[column].nunique())


file name is train_df
filename 1448
type 2
patient_ID 64
file name is val_df
filename 157
type 2
patient_ID 7
file name is test_df
filename 215
type 2
patient_ID 10


In [7]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

# Define image size and path
img_size = (700, 460)
batch_size = 32
directory = 'extracted_images_400x'

# Prepare data generators
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)  # assuming 80-20 split for train-val

train_generator = train_datagen.flow_from_dataframe(
    train_df,
    directory=directory,
    x_col='filename',
    y_col='type',
    target_size=img_size,
    class_mode='binary',
    batch_size=batch_size,
    subset='training')

val_generator = train_datagen.flow_from_dataframe(
    train_df,
    directory=directory,
    x_col='filename',
    y_col='type',
    target_size=img_size,
    class_mode='binary',
    batch_size=batch_size,
    subset='validation')

# Load pre-trained VGG16 model
# base_model = VGG16(input_shape=img_size + (3,), include_top=False, weights='imagenet')

# Freeze the pre-trained model
# for layer in base_model.layers:
#    layer.trainable = False

# Create a new model on top
# model = Sequential([
#    base_model,
#    Flatten(),
 #   Dense(1, activation='sigmoid')
#])

# Compile the model
# model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# model.fit(train_generator, validation_data=val_generator, epochs=10)


Found 1159 validated image filenames belonging to 2 classes.
Found 289 validated image filenames belonging to 2 classes.


In [8]:
import tensorflow as tf
from functools import partial
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler


# Define the standard CNN model
n_filters = 12  # Base number of convolutional filters

def make_standard_classifier(n_outputs=1):
    Conv2D = partial(tf.keras.layers.Conv2D, padding='same', activation='relu')
    BatchNormalization = tf.keras.layers.BatchNormalization
    Flatten = tf.keras.layers.Flatten
    Dense = partial(tf.keras.layers.Dense, activation='relu')

    model = tf.keras.Sequential([
        Conv2D(filters=1*n_filters, kernel_size=5, strides=2),
        BatchNormalization(),

        Conv2D(filters=2*n_filters, kernel_size=5, strides=2),
        BatchNormalization(),

        Conv2D(filters=4*n_filters, kernel_size=3, strides=2),
        BatchNormalization(),

        Conv2D(filters=6*n_filters, kernel_size=3, strides=2),
        BatchNormalization(),

        Flatten(),
        Dense(512),
        Dense(n_outputs, activation='sigmoid')  # Sigmoid activation for binary classification
    ])
    return model


In [9]:

# Create the standard CNN model
standard_classifier = make_standard_classifier()

In [10]:

# Compile the model
standard_classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [11]:

# Define callbacks
early_stop = EarlyStopping(patience=3)
def lr_scheduler(epoch, lr):
    if epoch < 5:
        return lr
    else:
        return lr * 0.1
lr_schedule = LearningRateScheduler(lr_scheduler)

# Train the model
standard_classifier.fit(train_generator, validation_data=val_generator, epochs=10, callbacks=[early_stop, lr_schedule])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x7e18c80f05e0>