In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_class_weight
from PIL import Image
import shutil
import keras
import tensorflow
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
from tensorflow.keras import callbacks
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import load_model

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
dataset = pd.read_csv('/content/drive/MyDrive/image_data.csv')
dataset

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification
0,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
1,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
2,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
3,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
4,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X
...,...,...,...,...
7904,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X
7905,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X
7906,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X
7907,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X


In [6]:
def extract_info_from_path(row):
    if pd.isnull(row['Benign or Malignant']):
        row['Benign or Malignant'] = 'Benign' if 'benign' in row['path_to_image'].lower() else 'Malignant'

    if pd.isnull(row['Cancer Type']):
        if 'mucinous_carcinoma' in row['path_to_image'].lower():
            row['Cancer Type'] = 'Mucinous Carcinoma'
        elif 'ductal_carcinoma' in row['path_to_image'].lower():
            row['Cancer Type'] = 'Ductal Carcinoma'
        elif 'lobular_carcinoma' in row['path_to_image'].lower():
            row['Cancer Type'] = 'Lobular Carcinoma'
        elif 'papillary_carcinoma' in row['path_to_image'].lower():
            row['Cancer Type'] = 'Papillary Carcinoma'
        elif 'adenosis' in row['path_to_image'].lower():
            row['Cancer Type'] = 'Adenosis'
        elif 'tubular_adenoma' in row['path_to_image'].lower():
            row['Cancer Type'] = 'Tubular Adenoma'
        elif 'fibroadenoma' in row['path_to_image'].lower():
            row['Cancer Type'] = 'Fibroadenoma'
        elif 'phyllodes_tumor' in row['path_to_image'].lower():
            row['Cancer Type'] = 'Phyllodes Tumor'

    if pd.isnull(row['Magnification']):
        if '100x' in row['path_to_image'].lower():
            row['Magnification'] = '100X'
        elif '200x' in row['path_to_image'].lower():
            row['Magnification'] = '200X'
        elif '400x' in row['path_to_image'].lower():
            row['Magnification'] = '400X'
        elif '40x' in row['path_to_image'].lower():
            row['Magnification'] = '40X'

    return row

dataset = dataset.apply(extract_info_from_path, axis=1)

print(dataset.isnull().sum())

path_to_image          0
Benign or Malignant    0
Cancer Type            0
Magnification          0
dtype: int64


In [7]:
remove_paths = [
    "SOB_B_F-14-23060AB-100-013.png",
    "SOB_B_F-14-23060AB-40-009.png",
    "SOB_B_TA-14-15275-400-009.png",
    "SOB_B_TA-14-3411F-100-011.png",
    "SOB_B_F-14-23222AB-400-010.png",
    "SOB_M_LC-14-12204-40-035.png",
    "SOB_M_LC-14-12204-40-034.png",
    "SOB_M_DC-14-9461-100-004.png",
    "SOB_M_DC-14-9461-100-040.png",
    "SOB_M_LC-14-16196-40-002.png",
    "SOB_M_DC-14-20629-400-019.png",
    "SOB_M_DC-14-13993-100-042.png",
    "SOB_M_DC-14-15792-200-005.png",
    "SOB_M_PC-14-15704-100-026.png"
]

mask = dataset['path_to_image'].apply(lambda x: any(remove_str in x for remove_str in remove_paths))
df_filtered = dataset[~mask]

In [10]:
df_filtered

Unnamed: 0,path_to_image,Benign or Malignant,Cancer Type,Magnification,image_name
0,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X,SOB_B_A-14-22549AB-100-011.png
1,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X,SOB_B_A-14-22549AB-100-005.png
2,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X,SOB_B_A-14-22549AB-100-004.png
3,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X,SOB_B_A-14-22549AB-100-010.png
4,BreaKHis_v1/histology_slides/breast/benign/SOB...,Benign,Adenosis,100X,SOB_B_A-14-22549AB-100-006.png
...,...,...,...,...,...
7904,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X,SOB_M_LC-14-12204-200-028.png
7905,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X,SOB_M_LC-14-12204-200-029.png
7906,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X,SOB_M_LC-14-12204-200-006.png
7907,BreaKHis_v1/histology_slides/breast/malignant/...,Malignant,Lobular Carcinoma,200X,SOB_M_LC-14-12204-200-039.png


In [9]:
df_filtered['image_name'] = df_filtered['path_to_image'].apply(lambda x: x.split('/')[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['image_name'] = df_filtered['path_to_image'].apply(lambda x: x.split('/')[-1])


In [11]:
original_dataset_dir = '/content/drive/MyDrive/benigno_maligno'

In [12]:
base_dir = '/content/drive/MyDrive/benign_malign_undersampling'

In [13]:
# Step 1: Split the DataFrame into train, validation, and test sets (80-10-10)
train_df, temp_df = train_test_split(df_filtered, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [14]:
for subset in ['train', 'validation', 'test']:
    os.makedirs(os.path.join(base_dir, subset), exist_ok=True)

cancer_types = df_filtered['Benign or Malignant'].unique()
for subset in ['train', 'validation', 'test']:
    for tumor in cancer_types:
        os.makedirs(os.path.join(base_dir, subset, tumor), exist_ok=True)

In [15]:
class_counts = train_df['Benign or Malignant'].value_counts()
largest_class = class_counts.idxmax()
target_count = class_counts.iloc[1]

# Undersample the largest class
df_largest = train_df[train_df['Benign or Malignant'] == largest_class].sample(target_count)
df_others = train_df[train_df['Benign or Malignant'] != largest_class]

# Combine undersampled largest class with the other classes
df_balanced = pd.concat([df_largest, df_others])

# Copy selected images to the new directory
def copy_files(dataframe, target_dir):
    for _, row in dataframe.iterrows():
        src = os.path.join(original_dataset_dir, row['image_name'])
        dst = os.path.join(target_dir, row['Benign or Malignant'], row['image_name'])
        os.makedirs(os.path.dirname(dst), exist_ok=True)

        shutil.copy(src, dst)


train_dir = os.path.join(base_dir, 'train')
val_dir = os.path.join(base_dir, 'validation')
test_dir = os.path.join(base_dir, 'test')

copy_files(df_balanced, train_dir)
copy_files(val_df, val_dir)
copy_files(test_df, test_dir)

In [16]:
subsets = ['train', 'validation', 'test']
cancer_types = df_filtered['Benign or Malignant'].unique()
for subset in subsets:
    print(f"Subset: {subset.capitalize()}")
    for tumor in cancer_types:
        folder_path = os.path.join(base_dir, subset, tumor)
        num_images = len([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])

        print(f"  {tumor}: {num_images} images")
    print()

Subset: Train
  Benign: 1960 images
  Malignant: 1960 images

Subset: Validation
  Benign: 255 images
  Malignant: 534 images

Subset: Test
  Benign: 260 images
  Malignant: 530 images

