# Installations

In [None]:
!pip install -q datasets

# Imports

In [None]:
from datasets import load_dataset, DatasetDict, load_metric, Dataset
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import datasets
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch

# Reusable Functions

In [None]:
def convert_to_dataset(data):
    """Build a Hugging Face ``Dataset`` from a pandas DataFrame.

    Args:
        data: DataFrame handed unchanged to ``Dataset.from_pandas``.

    Returns:
        The object returned by ``Dataset.from_pandas(data)``.
    """
    return Dataset.from_pandas(data)

# Import dataset

In [None]:
# Load "hongrui/mammogram_v_1"; the "train[:100%]" slice requests the whole
# train split.
dataset = load_dataset("hongrui/mammogram_v_1", split="train[:100%]")

In [None]:
# dataset.save_to_disk('/content/drive/MyDrive/Pretrained-models/dataset1')

In [None]:
# Display the loaded dataset object as the cell output.
dataset

# Data exploration

In [None]:
# Display the column names and feature types of the dataset.
dataset.features

### Images without Implants without Cancer

In [None]:
# Index the row first, then the column, so only this one image is decoded.
# `dataset["image"][5]` reads the whole image column before picking an element,
# which (depending on the datasets version) decodes every image.
example = dataset[5]["image"]
example

In [None]:
# Row-first indexing decodes a single image instead of reading the whole
# "image" column first.
example = dataset[15]["image"]
example

### Images without Implants with Cancer

In [None]:
# Row-first indexing decodes a single image instead of reading the whole
# "image" column first.
example = dataset[5414]["image"]
example

In [None]:
# Row-first indexing decodes a single image instead of reading the whole
# "image" column first.
example = dataset[221]["image"]
example

In [None]:
# Row-first indexing decodes a single image instead of reading the whole
# "image" column first.
example = dataset[44]["image"]
example

### Image with Implants without Cancer

In [None]:
# Row-first indexing decodes a single image instead of reading the whole
# "image" column first.
dataset[171]["image"]

### Images with Implants with Cancer

In [None]:
# Row-first indexing decodes a single image instead of reading the whole
# "image" column first.
dataset[6723]["image"]

In [None]:
# Row-first indexing decodes a single image instead of reading the whole
# "image" column first.
dataset[8721]["image"]

### Convert Dataset to DataFrame

In [None]:
# Build a pandas DataFrame from the loaded dataset for exploration and cleaning.
df = pd.DataFrame(dataset)

In [None]:
# (rows, columns) of the full DataFrame.
df.shape

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Summary statistics of the DataFrame columns.
df.describe()

# Data Cleaning

In [None]:
# Number of missing values in each column.
df.isna().sum()

Drop columns we don't need

In [None]:
# Remove the patient metadata and free-text columns that are not used later.
df = df.drop(columns=['patient_id', 'laterality', 'age', 'BIRADS', 'text'])

# Data Visualization

In [None]:
# Pie chart of how the rows divide across the `cancer` label values.
value = df['cancer'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(value, labels=value.index, autopct='%1.1f%%')
ax.set_title('Breast Cancer Diagnosis Types', fontsize=14)
plt.show()

In [None]:
# Pie chart of the distinct values in the `text2` column.
value = df['text2'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(value, labels=value.index, autopct='%1.1f%%')
ax.set_title('Diagnosis with implant or without', fontsize=14)
plt.show()

In [None]:
# Pie chart of the distinct values in the `view` column.
value = df['view'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(value, labels=value.index, autopct='%1.1f%%')
ax.set_title('Image Views', fontsize=14)
plt.show()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bar chart of the number of rows per `density` value.
density_counts = df.groupby('density').size()
bar_colors = sns.palettes.mpl_palette('Dark2')
ax = density_counts.plot(kind='barh', color=bar_colors)
# Hide the top and right frame lines.
ax.spines[['top', 'right']].set_visible(False)

# Drop unnecessary columns

In [None]:
# Keep only the rows whose `implant` flag is 0.
df = df.loc[df['implant'] == 0]

In [None]:
# Drop the remaining descriptive columns, including the `implant` flag that
# was only needed for filtering.
df = df.drop(columns=['invasive', 'view', 'implant', 'density', 'text2'])

# Data Upsampling

In [None]:
# Separate the rows by label: cancer == 1 -> malignant, cancer == 0 -> benign.
df_malignant = df.loc[df['cancer'].eq(1)]
df_benign = df.loc[df['cancer'].eq(0)]

In [None]:
# (rows, columns) of the malignant subset.
df_malignant.shape

In [None]:
# (rows, columns) of the benign subset.
df_benign.shape

In [None]:
# Draw a reproducible 70% of the benign rows for training; the rows that were
# not drawn form the benign part of the test set.
train_benign = df_benign.sample(frac=0.7, random_state=42)
test_benign = df_benign[~df_benign.index.isin(train_benign.index)]

In [None]:
# (rows, columns) of the benign training rows.
train_benign.shape

In [None]:
# (rows, columns) of the benign test rows.
test_benign.shape

In [None]:
# Same 70/30 split for the malignant rows, using the same seed as the benign
# split.
train_malignant = df_malignant.sample(frac=0.7, random_state=42)
test_malignant = df_malignant[~df_malignant.index.isin(train_malignant.index)]

In [None]:
# (rows, columns) of the malignant training rows before upsampling.
train_malignant.shape

In [None]:
# (rows, columns) of the malignant test rows before upsampling.
test_malignant.shape

In [None]:
# Oversample the malignant training rows with replacement to ten times their
# original count.
train_malignant = resample(
    train_malignant,
    n_samples=10 * len(train_malignant),
    replace=True,
    random_state=42,
)
train_malignant.shape

In [None]:
# Oversample the malignant test rows with replacement to ten times their
# original count.
# NOTE(review): this duplicates test images, so count-based test metrics are
# computed on repeated samples - confirm this is intended.
test_malignant = resample(
    test_malignant,
    n_samples=10 * len(test_malignant),
    replace=True,
    random_state=42,
)
test_malignant.shape

# Concatenate Upsampled data

In [None]:
# Stack the benign rows and the upsampled malignant rows for each split; the
# original index labels are kept.
train_dataset = pd.concat((train_benign, train_malignant), axis=0)
test_dataset = pd.concat((test_benign, test_malignant), axis=0)

In [None]:
# Pie chart of the `cancer` label balance in the training data after upsampling.
value = train_dataset['cancer'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(value, labels=value.index, autopct='%1.1f%%')
ax.set_title('Breast Cancer Diagnosis Types in Train data', fontsize=14)
plt.show()

In [None]:
# Pie chart of the `cancer` label balance in the test data after upsampling.
value = test_dataset['cancer'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(value, labels=value.index, autopct='%1.1f%%')
ax.set_title('Breast Cancer Diagnosis Types in test data', fontsize=14)
plt.show()

# Train-Validation Split

In [None]:
# Hold out 20% of the training pool for validation.
#
# Validation rows must come from the training pool only: taking them from `df`
# (everything not sampled into train) would also pull in every test image and
# leak the test set into validation.
#
# The pool holds malignant rows upsampled with replacement, so one source image
# can appear several times under the same index label. The split is therefore
# made on unique index labels, stratified by `cancer`, and every copy of an
# image follows its label to exactly one side. Both sides keep the upsampled
# class balance.
train_pool = train_dataset
unique_rows = train_pool[~train_pool.index.duplicated(keep='first')]
train_ids, val_ids = train_test_split(
    unique_rows.index,
    test_size=0.2,
    random_state=200,
    stratify=unique_rows['cancer'],
)
val_dataset = train_pool[train_pool.index.isin(val_ids)]
# Shuffle the training rows, as the original `sample` call did.
train_dataset = train_pool[train_pool.index.isin(train_ids)].sample(frac=1, random_state=200)

In [None]:
# (rows, columns) of the final training split.
train_dataset.shape

In [None]:
# (rows, columns) of the validation split.
val_dataset.shape

In [None]:
# (rows, columns) of the test split.
test_dataset.shape

In [None]:
# Turn each split into a list of per-row dicts (one dict per record).
train_dictionary, val_dictionary, test_dictionary = (
    frame.to_dict(orient='records')
    for frame in (train_dataset, val_dataset, test_dataset)
)

In [None]:
# Build the training Dataset from its list of row dicts and display it.
train_data = Dataset.from_list(train_dictionary)
train_data

In [None]:
# Build the validation Dataset from its list of row dicts and display it.
val_data = Dataset.from_list(val_dictionary)
val_data

In [None]:
# Build the test Dataset from its list of row dicts and display it.
test_data = Dataset.from_list(test_dictionary)
test_data

In [None]:
# Bundle the three splits under the keys "train", "validate" and "test".
complete_dataset = DatasetDict(
    {
        "train": train_data,
        "validate": val_data,
        "test": test_data,
    }
)
complete_dataset

# Push to Hugging Face as Dataset 1

In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook; required
# before the push in the next cell.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Upload all splits to the Hub repository "Nicole-M/Dataset1".
complete_dataset.push_to_hub("Nicole-M/Dataset1")