# Installations

In [None]:
!pip install -q datasets opendatasets evaluate

# Imports

In [None]:
from datasets import load_dataset, DatasetDict, load_metric, Dataset
from transformers import AutoImageProcessor, AutoFeatureExtractor, AutoModelForImageClassification, TrainingArguments, Trainer, ViTForImageClassification, ViTFeatureExtractor, ViTImageProcessor, Swinv2Model
from torchvision import transforms
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from fastai.vision.all import *
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)
import evaluate
import datasets
import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import opendatasets as od
import PIL

# Reusable Functions

In [None]:
def convert_to_dataset(data):
    """Wrap a pandas DataFrame in a Hugging Face ``Dataset``.

    Args:
        data: DataFrame handed to ``Dataset.from_pandas``.

    Returns:
        The ``Dataset`` built by ``Dataset.from_pandas(data)``.
    """
    return Dataset.from_pandas(data)

# Import dataset

In [None]:
# Fetch the CBIS-DDSM dataset from Kaggle with opendatasets.
# NOTE(review): later cells read from /content/cbis-ddsm-breast-cancer-image-dataset,
# so this presumably runs with /content as the working directory — confirm.
data = od.download("https://www.kaggle.com/datasets/awsaf49/cbis-ddsm-breast-cancer-image-dataset")

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !cp -r '/content/cbis-ddsm-breast-cancer-image-dataset' /content/drive/MyDrive/Data/

# Data exploration

In [None]:
# Load the dataset's meta.csv and print its column/dtype summary.
meta = pd.read_csv('/content/cbis-ddsm-breast-cancer-image-dataset/csv/meta.csv')
meta.info()

In [None]:
# Preview the first rows of the meta table.
meta.head()

In [None]:
# Load the mass training case descriptions and print the column/dtype summary.
train_data = pd.read_csv('/content/cbis-ddsm-breast-cancer-image-dataset/csv/mass_case_description_train_set.csv')
train_data.info()

In [None]:
# Preview the first rows of the training case descriptions.
train_data.head()

In [None]:
# Load the mass test case descriptions and print the column/dtype summary.
test_data = pd.read_csv('/content/cbis-ddsm-breast-cancer-image-dataset/csv/mass_case_description_test_set.csv')
test_data.info()

In [None]:
# Preview the first rows of the test case descriptions.
test_data.head()

In [None]:
# Load dicom_info.csv (one row per image, including its SeriesDescription and
# image_path, which the cells below rely on) and print the column/dtype summary.
dicom_data = pd.read_csv('/content/cbis-ddsm-breast-cancer-image-dataset/csv/dicom_info.csv')
dicom_data.info()

In [None]:
# Preview the first rows of the DICOM info table.
dicom_data.head()

In [None]:
# List the distinct SeriesDescription values, i.e. the image types present.
dicom_data.SeriesDescription.unique()

In [None]:
# Collect the image_path of every row described as 'cropped images'
cropped_images = dicom_data.loc[dicom_data['SeriesDescription'] == 'cropped images', 'image_path']
cropped_images.head()

In [None]:
# Collect the image_path of every row described as 'full mammogram images'
full_images = dicom_data.loc[dicom_data['SeriesDescription'] == 'full mammogram images', 'image_path']
full_images.head()

In [None]:
# Collect the image_path of every row described as 'ROI mask images'
roi_images = dicom_data.loc[dicom_data['SeriesDescription'] == 'ROI mask images', 'image_path']
roi_images.head()

# Data Cleaning

Cleaning mass_train

In [None]:
# Count null values per column of the training case descriptions.
train_data.isnull().sum()

In [None]:
# Show the first two rows of the training case descriptions.
train_data.head(2)

In [None]:
# Fill missing 'mass shape' / 'mass margins' values from the next non-null row
# (backward fill — this is not an average). The forward fill afterwards covers
# nulls at the very end of a column, which bfill alone cannot reach.
train_data['mass shape'] = train_data['mass shape'].bfill().ffill()
train_data['mass margins'] = train_data['mass margins'].bfill().ffill()
# Re-count nulls to confirm the two columns are now complete.
train_data.isnull().sum()

In [None]:
# Summary statistics of the training frame.
train_data.describe()

Cleaning mass_test

In [None]:
# Count null values per column of the test case descriptions.
test_data.isnull().sum()

In [None]:
# Backward-fill missing 'mass margins' values; the forward fill afterwards
# covers nulls at the very end of the column, which bfill alone cannot reach.
test_data['mass margins'] = test_data['mass margins'].bfill().ffill()
# Re-count nulls to confirm the column is now complete.
test_data.isnull().sum()

In [None]:
# Summary statistics of the test frame.
test_data.describe()

Update the image paths in the dataset so the files can be located

In [None]:
# Root folder holding the JPEG images; the cells below substitute it for the
# relative 'CBIS-DDSM/jpeg' prefix found in dicom_info.csv.
# NOTE(review): this points at Google Drive, while the CSVs are read from
# /content/... and the Drive mount/copy cells are commented out — confirm the
# images actually exist at this location.
imdir = "/content/drive/MyDrive/Data/cbis-ddsm-breast-cancer-image-dataset/jpeg"

In [None]:
# Swap the relative 'CBIS-DDSM/jpeg' prefix for the local image root in all
# three path series (cropped, full mammogram, ROI mask).
cropped_images, full_mammo, roi_img = (
    paths.replace('CBIS-DDSM/jpeg', imdir, regex=True)
    for paths in (cropped_images, full_images, roi_images)
)

In [None]:
# Preview the cropped-image paths after the prefix replacement.
cropped_images.head()

In [None]:
# Index every image path by the name of the folder that contains the file,
# one lookup table per image type. fix_image_path uses these tables to
# translate the folder names found in the case-description CSVs.
#
# The key is taken relative to the END of the path ([-2] = parent folder)
# rather than at a fixed position, so the lookup stays correct if imdir is
# changed to a directory of a different depth.
# If several files share a folder, the last path seen wins.
full_dict = {dicom.split("/")[-2]: dicom for dicom in full_mammo}
cropped_dict = {dicom.split("/")[-2]: dicom for dicom in cropped_images}
roi_dict = {dicom.split("/")[-2]: dicom for dicom in roi_img}

In [None]:
# fix image paths
def fix_image_path(data):
    """Rewrite the three image-path columns of ``data`` in place.

    For every row, the third "/"-separated component of the value in column
    positions 11, 12 and 13 is looked up in ``full_dict``, ``cropped_dict``
    and ``roi_dict`` respectively, and the cell is overwritten with the
    path found there.

    Args:
        data: DataFrame whose columns at positions 11-13 hold "/"-separated
            path strings. NOTE(review): presumably the full / cropped / ROI
            path columns of the case-description CSVs — confirm the order.

    Returns:
        None; ``data`` is modified in place.

    Raises:
        KeyError: if a folder name is missing from the matching lookup table.
    """
    # (column position, lookup table) pairs, applied in this order per row
    targets = ((11, full_dict), (12, cropped_dict), (13, roi_dict))
    for row_pos, row in enumerate(data.values):
        for col_pos, lookup in targets:
            folder = row[col_pos].split("/")[2]
            data.iloc[row_pos, col_pos] = lookup[folder]

In [None]:
# Rewrite the image path columns of both splits; fix_image_path modifies the
# frame it is given and returns nothing.
fix_image_path(train_data)
fix_image_path(test_data)

In [None]:
# Inspect the training frame after the path columns were rewritten.
train_data.head()

In [None]:
import PIL.Image  # make sure the Image submodule is loaded

# Show the first ten cropped images in a 2x5 grid. Calling plt.imshow
# repeatedly on the same axes would leave only the last image visible.
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for ax in axes.ravel():
    ax.axis("off")  # also hides unused cells when fewer than ten paths exist
for ax, file in zip(axes.ravel(), cropped_images[0:10]):
    # The context manager closes the file once the grayscale copy is drawn.
    with PIL.Image.open(file) as cropped_images_show:
        gray_img = cropped_images_show.convert("L")
    ax.imshow(gray_img, cmap='gray')
plt.tight_layout()
plt.show()

In [None]:
import PIL.Image  # make sure the Image submodule is loaded

# Show the first ten full mammograms in a 2x5 grid. Calling plt.imshow
# repeatedly on the same axes would leave only the last image visible.
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for ax in axes.ravel():
    ax.axis("off")  # also hides unused cells when fewer than ten paths exist
for ax, file in zip(axes.ravel(), full_mammo[0:10]):
    # The context manager closes the file once the grayscale copy is drawn.
    with PIL.Image.open(file) as full_mammogram_images_show:
        gray_img = full_mammogram_images_show.convert("L")
    ax.imshow(gray_img, cmap='gray')
plt.tight_layout()
plt.show()

In [None]:
import PIL.Image  # make sure the Image submodule is loaded

# Show the first ten ROI mask images in a 2x5 grid. Calling plt.imshow
# repeatedly on the same axes would leave only the last image visible.
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for ax in axes.ravel():
    ax.axis("off")  # also hides unused cells when fewer than ten paths exist
for ax, file in zip(axes.ravel(), roi_img[0:10]):
    # The context manager closes the file once the grayscale copy is drawn.
    with PIL.Image.open(file) as ROI_mask_images_show:
        gray_img = ROI_mask_images_show.convert("L")
    ax.imshow(gray_img, cmap='gray')
plt.tight_layout()
plt.show()

Drop unnecessary columns

In [None]:
# Build the "clean" frames by removing the same six columns from each split.
train_data_clean = train_data.drop(
    columns=[
        "patient_id",
        "left or right breast",
        "assessment",
        "subtlety",
        "abnormality type",
        "abnormality id",
    ]
)
test_data_clean = test_data.drop(
    columns=[
        "patient_id",
        "left or right breast",
        "assessment",
        "subtlety",
        "abnormality type",
        "abnormality id",
    ]
)

In [None]:
# Display the training frame after the columns were dropped.
train_data_clean

In [None]:
# Display the DICOM info table.
dicom_data

In [None]:
labels = ["Benign", "Malignant"]
# Two-way lookup between class names and integer ids; an id is the position
# of the name in `labels`.
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for idx, name in enumerate(labels)}

# Spot-check the reverse mapping.
id2label[1]

# Data visualization

In [None]:
# Preview the first rows of the cleaned training frame.
train_data_clean.head()

In [None]:
# (rows, columns) of the cleaned training frame.
train_data_clean.shape

In [None]:
# Preview the first rows of the cleaned test frame.
test_data_clean.head()

In [None]:
# (rows, columns) of the cleaned test frame.
test_data_clean.shape

In [None]:
# Pie chart: share of each SeriesDescription value in the DICOM info table.
value = dicom_data['SeriesDescription'].value_counts()
plt.figure(figsize=(8,6))

# autopct prints each slice's percentage with one decimal place
plt.pie(value, labels=value.index, autopct='%1.1f%%')
plt.title('Breast Cancer image types', fontsize=14)
plt.show()

In [None]:
# Pie chart: share of each pathology value in the training split, before
# BENIGN_WITHOUT_CALLBACK is merged into BENIGN.
value = train_data_clean['pathology'].value_counts()
plt.figure(figsize=(8,6))

# autopct prints each slice's percentage with one decimal place
plt.pie(value, labels=value.index, autopct='%1.1f%%')
plt.title('Breast Cancer Diagnosis Types', fontsize=14)
plt.show()

In [None]:
# Collapse 'BENIGN_WITHOUT_CALLBACK' into 'BENIGN' in both splits, so the
# pathology column no longer distinguishes the two benign variants.
train_data_clean['pathology'] = train_data_clean['pathology'].replace(
    {'BENIGN_WITHOUT_CALLBACK': 'BENIGN'}
)
test_data_clean['pathology'] = test_data_clean['pathology'].replace(
    {'BENIGN_WITHOUT_CALLBACK': 'BENIGN'}
)

In [None]:
# Pie chart: share of each pathology value in the training split (same plot
# as before, re-drawn after the benign labels were merged).
value = train_data_clean['pathology'].value_counts()
plt.figure(figsize=(8,6))

# autopct prints each slice's percentage with one decimal place
plt.pie(value, labels=value.index, autopct='%1.1f%%')
plt.title('Breast Cancer Diagnosis Types', fontsize=14)
plt.show()

In [None]:
# Pie chart: share of each 'mass shape' value in the training split.
value = train_data_clean['mass shape'].value_counts()
plt.figure(figsize=(8,6))

# autopct prints each slice's percentage with one decimal place
plt.pie(value, labels=value.index, autopct='%1.1f%%')
plt.title('Mass Shapes', fontsize=14)
#plt.savefig('/kaggle/working/pathology_distributions_red.png')
plt.show()

In [None]:
# Bar chart: number of training rows per 'mass shape', split by pathology.
plt.figure(figsize=(8,6))

sns.countplot(train_data_clean, x='mass shape', hue='pathology')
plt.title('Mass Shape vs Pathology', fontsize=14)
plt.xlabel('Mass Shape')
plt.ylabel('Count')
plt.legend()
# plt.savefig('/kaggle/working/density_pathology_red.png')
plt.show()

In [None]:
# Pie chart: share of each 'mass margins' value in the training split.
value = train_data_clean['mass margins'].value_counts()
plt.figure(figsize=(8,6))

# autopct prints each slice's percentage with one decimal place
plt.pie(value, labels=value.index, autopct='%1.1f%%')
plt.title('Mass Margins', fontsize=14)
#plt.savefig('/kaggle/working/pathology_distributions_red.png')
plt.show()

In [None]:
# Bar chart: number of training rows per 'mass margins', split by pathology.
plt.figure(figsize=(8,6))

sns.countplot(train_data_clean, x='mass margins', hue='pathology')
plt.title('Mass Margin vs Pathology', fontsize=14)
plt.xlabel('Mass Margin')
plt.ylabel('Count')
plt.legend()
# plt.savefig('/kaggle/working/density_pathology_red.png')
plt.show()

In [None]:
# Bar chart: number of training rows per breast_density grade, split by
# pathology. The title doubles as a legend for the four density grades.
plt.figure(figsize=(8,6))

sns.countplot(train_data_clean, x='breast_density', hue='pathology')
plt.title('Breast Density vs Pathology\n\n1: fatty || 2: Scattered Fibroglandular Density\n3: Heterogenously Dense || 4: Extremely Dense',
          fontsize=14)
plt.xlabel('Density Grades')
plt.ylabel('Count')
plt.legend()
# plt.savefig('/kaggle/working/density_pathology_red.png')
plt.show()

In [None]:
# function to display images
def display_images(column, number):
    """Show the first ``number`` training images from ``column`` in one row.

    Reads the leading rows of the global ``train_data_clean`` frame, loads
    the image at the path stored in ``column`` and titles each subplot with
    the row's ``pathology`` value.

    Args:
        column: Name of the ``train_data_clean`` column holding image paths.
        number: Number of leading rows to show; one subplot is created per
            requested image.
    """
    # squeeze=False always yields a 2-D array of axes, so number == 1 works
    # the same way as larger values.
    fig, axes = plt.subplots(1, number, figsize=(15, 5), squeeze=False)

    # Pair axes with rows by position rather than by DataFrame index label,
    # so the function does not depend on the frame having a 0..n-1 index.
    shown = train_data_clean.head(number)
    for ax, (_, row) in zip(axes[0], shown.iterrows()):
        # plt.imread avoids relying on a bare `matplotlib` name that this
        # notebook never imports explicitly.
        image = plt.imread(row[column])
        ax.imshow(image, cmap='gray')
        ax.set_title(f"{row['pathology']}")
        ax.axis('off')
    plt.tight_layout()
    plt.show()

# Show the first five images of each type, each group under its own heading.
for heading, path_column in (
    ('Full Mammograms:\n', 'image file path'),
    ('Cropped Mammograms:\n', 'cropped image file path'),
    ('ROI Mammograms:\n', 'ROI mask file path'),
):
    print(heading)
    display_images(path_column, 5)

# Convert dataframe to Dataset

In [None]:
# Remove the descriptive columns that were only needed for the plots above,
# applying the same selection to both splits.
train_data_clean = train_data_clean.drop(
    columns=['breast_density', 'image view', 'mass shape', 'mass margins']
)
test_data_clean = test_data_clean.drop(
    columns=['breast_density', 'image view', 'mass shape', 'mass margins']
)

In [None]:
# Preview the remaining columns of the training frame.
train_data_clean.head()

In [None]:
# Save the cleaned training frame as UTF-8 CSV, without the index column and
# without a header row.
train_data_clean.to_csv('Dataset2-Train.csv', encoding='utf-8', index=False, header=False)

In [None]:
# Save the cleaned test frame as UTF-8 CSV, without the index column and
# without a header row.
test_data_clean.to_csv('Dataset2-Test.csv', encoding='utf-8', index=False, header=False)

In [None]:
# Convert both cleaned frames to Hugging Face datasets, then bundle them as
# the "train" and "test" splits of one DatasetDict.
train, test = (
    convert_to_dataset(frame) for frame in (train_data_clean, test_data_clean)
)

dataset_dict = datasets.DatasetDict({"train": train, "test": test})

In [None]:
import PIL.Image  # explicit import: `Image` is not imported by name in this notebook

# Open the first cropped test image to confirm the stored path resolves to a
# readable file. Indexing the row first avoids loading the whole path column
# just to read one value.
img = PIL.Image.open(dataset_dict["test"][0]["cropped image file path"])
img

Inspect the resulting dataset

In [None]:
# Display the DatasetDict holding the train and test splits.
dataset_dict

# Push to Hugging Face as Dataset 2

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required for the push_to_hub call below — confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Upload both splits to the Hub repository "Nicole-M/Dataset2".
dataset_dict.push_to_hub("Nicole-M/Dataset2")