In [35]:
# Imports
from google.cloud import storage
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten

# Variables
CUR_DIR = os.getcwd()
DATA_DIR = os.path.join(CUR_DIR, "data")

RAW_DATA = os.path.join(DATA_DIR, "raw_data")
DATA_CLEAN = os.path.join(DATA_DIR, "data_clean")

BUCKET_NAME = "test_boobies"
METADATA = pd.read_excel(os.path.join(DATA_DIR, "metadata.xlsx"))

BATCH_SIZE = 64
EPOCHS = 5
DIM = 64

In [9]:
df = pd.read_csv("gs://mammo_data/ready_to_train.csv")
df.head()

Unnamed: 0,image_id,path,cancer
0,1000030932,gs://mammo_data/1000030932.jpg,0
1,1000049233,gs://mammo_data/1000049233.jpg,0
2,1000066573,gs://mammo_data/1000066573.jpg,0
3,1000069257,gs://mammo_data/1000069257.jpg,0
4,1000108611,gs://mammo_data/1000108611.jpg,0


In [22]:
df.cancer.value_counts(normalize= True)

cancer
0    0.978834
1    0.021166
Name: proportion, dtype: float64

In [10]:
split = .8

In [12]:
cancer = df[df.cancer == 1]
no_cancer = df[df.cancer == 0]
cancer.shape, no_cancer.shape

((1158, 3), (53552, 3))

In [20]:
cancer = df[df.cancer == 1].copy()
augmentation_cancer = cancer._append([cancer] * 20, ignore_index= True)
no_cancer = df[df.cancer == 0].copy()
no_cancer_50 = no_cancer.iloc[ : 25_000]
augmentation_cancer.shape, cancer.shape, no_cancer.shape, no_cancer_50.shape

((24318, 3), (1158, 3), (53552, 3), (25000, 3))

In [27]:
tester = cancer._append([cancer] * 17, ignore_index= True)
tester.shape

(20844, 3)

In [None]:
cancer_split = int(cancer.shape[0] * split)
no_cancer_split = int(no_cancer.shape[0] * split)
cancer_split, no_cancer_split

In [None]:
cancer_train = cancer.sample(frac= 1.0, replace= True, random_state= 4212).iloc[ : cancer_split]
cancer_test = cancer.sample(frac= 1.0, replace= True, random_state= 4212).iloc[cancer_split : ]
no_cancer_train = no_cancer.sample(frac= 1.0, replace= True, random_state= 4212).iloc[ : no_cancer_split]
no_cancer_test = no_cancer.sample(frac= 1.0, replace= True, random_state= 4212).iloc[no_cancer_split : ]

In [None]:
cancer_train.shape, cancer_test.shape, no_cancer_train.shape, no_cancer_test.shape

In [None]:
data_train = pd.concat([no_cancer_train, cancer_train])
data_test = pd.concat([no_cancer_test, cancer_test])
data_train.shape, data_test.shape

In [38]:

def custom_data_balancing(dataframe, data_type: str, split: float):

    print("Create Train test split : ")

    # Get labels
    cancer = dataframe[dataframe.cancer == 1].copy()
    no_cancer = dataframe[dataframe.cancer == 0].copy()

    if data_type == "ricard":
        cancer_50 = cancer._append([cancer] * 20, ignore_index=True)
        no_cancer_50 = no_cancer.iloc[:25_000]
        cancer = cancer_50
        no_cancer = no_cancer_50

    if data_type == "under_sample":
        no_cancer_under = no_cancer.iloc[:1158]
        no_cancer = no_cancer_under

    if data_type == "over_sample":
        cancer_over = cancer._append([cancer] * 45, ignore_index=True)
        cancer = cancer_over

    if data_type == "custom":
        cancer_custom = cancer._append([cancer] * 15, ignore_index=True)
        cancer = cancer_custom

    print(f"Shape Cancer : {cancer.shape}, No Cancer: {no_cancer.shape}")
    # Keep ratio
    cancer_split = int(cancer.shape[0] * split)
    no_cancer_split = int(no_cancer.shape[0] * split)

    # Split Data
    cancer_train = cancer.sample(frac=1.0, replace=False, random_state=4212).iloc[
        :cancer_split
    ]
    cancer_test = cancer.sample(frac=1.0, replace=False, random_state=4212).iloc[
        cancer_split:
    ]
    no_cancer_train = no_cancer.sample(frac=1.0, replace=False, random_state=4212).iloc[
        :no_cancer_split
    ]
    no_cancer_test = no_cancer.sample(frac=1.0, replace=False, random_state=4212).iloc[
        no_cancer_split:
    ]

    # Concat train & test
    data_train = pd.concat([no_cancer_train, cancer_train])
    data_test = pd.concat([no_cancer_test, cancer_test])

    print(data_train.shape, data_test.shape)

    return data_train, data_test

In [39]:

def create_dataset(input: str = "local", data_type: str = 'all', ratio=0.8):
    """
    Creates a dataset for model training.

    Parameters:
    - input: Specifies the source of the dataset, 'local' or 'cloud'.

    Returns:
    - TensorFlow dataset object.
    """
    # local or cloud
    # Load the dataset
    if input == "local":
        df = pd.read_csv("ready_to_train.csv")
    if input == "cloud":
        df = pd.read_csv("gs://mammo_data/ready_to_train.csv")

    # Train / Test keeping ratio
    data_train, data_test = custom_data_balancing(
        dataframe= df, data_type= data_type, split=ratio
    )

    # Create a TensorFlow dataset
    print("Create the train tensorflow dataset :")
    train_dataset = create_tensor_dataset(data_train)
    print("Create the test tensorflow dataset :")
    test_dataset = create_tensor_dataset(data_test)
    print("✅ Dataset created successfully.")
    return train_dataset, test_dataset



def create_tensor_dataset(dataframe):
    paths = dataframe["path"].values
    labels = dataframe["cancer"].values

    labels = tf.cast(labels, dtype=tf.int32)

    dataset = tf.data.Dataset.from_tensor_slices((paths, labels))
    dataset = dataset.map(load_and_process_image)

    return dataset

# Load and process images
def load_and_process_image(file_path: str, label):
    """
    Loads and processes an image file for model training.

    Parameters:
    - file_path: The path to the image file.
    - label: The label associated with the image file.

    Returns:
    - Tuple containing the processed image and its label.
    """

    img = tf.io.read_file(file_path)
    img = tf.io.decode_jpeg(img, channels=1)
    img = tf.image.resize(img, [DIM, DIM])  # Resize images
    img = img / 255.0  # Normalize to [0,1]
    return img, label

In [41]:
data_types = ["ricard",
"under_sample",
"over_sample",
"custom"]

for data_type in data_types:
    create_dataset(data_type= data_type)

Create Train test split : 
Shape Cancer : (24318, 3), No Cancer: (25000, 3)
(39454, 3) (9864, 3)
Create the train tensorflow dataset :
Create the test tensorflow dataset :
✅ Dataset created successfully.
Create Train test split : 
Shape Cancer : (1158, 3), No Cancer: (1158, 3)
(1852, 3) (464, 3)
Create the train tensorflow dataset :
Create the test tensorflow dataset :
✅ Dataset created successfully.
Create Train test split : 
Shape Cancer : (53268, 3), No Cancer: (53552, 3)
(85455, 3) (21365, 3)
Create the train tensorflow dataset :
Create the test tensorflow dataset :
✅ Dataset created successfully.
Create Train test split : 
Shape Cancer : (18528, 3), No Cancer: (53552, 3)
(57663, 3) (14417, 3)
Create the train tensorflow dataset :
Create the test tensorflow dataset :
✅ Dataset created successfully.
