# 1. Dependencies

In [3]:
import pandas as pd
import os
import cv2
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# 2. Prepare data

## 2.1. Load data

In [4]:
# Mount Google Drive, which holds the Kaggle API token (kaggle.json).
from google.colab import drive
drive.mount('/content/drive')
# Copy the token into ~/.kaggle -- presumably where the kaggle CLI used in the
# next cell reads its credentials from (confirm against the Kaggle API docs).
!mkdir -p ~/.kaggle
!cp "/content/drive/MyDrive/skin-cancer-detection-cnn/kaggle.json" ~/.kaggle/
# Restrict the token file to owner read/write so the credential is not world-readable.
!chmod 600 ~/.kaggle/kaggle.json


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Download the HAM10000 archive (skin-cancer-mnist-ham10000.zip) with the kaggle CLI.
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000

Dataset URL: https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000
License(s): CC-BY-NC-SA-4.0
Downloading skin-cancer-mnist-ham10000.zip to /content
 99% 5.16G/5.20G [00:58<00:00, 113MB/s]
100% 5.20G/5.20G [00:58<00:00, 95.3MB/s]


In [4]:
!unzip -q skin-cancer-mnist-ham10000.zip -d ham10000


In [5]:
# Inspect the extracted archive. The listing shows the image folders in both
# lower-case (ham10000_images_part_N) and upper-case (HAM10000_images_part_N)
# spellings; later cells read the upper-case ones.
!ls ham10000

ham10000_images_part_1	HAM10000_images_part_2	hmnist_28_28_RGB.csv
HAM10000_images_part_1	HAM10000_metadata.csv	hmnist_8_8_L.csv
ham10000_images_part_2	hmnist_28_28_L.csv	hmnist_8_8_RGB.csv


In [5]:
# Read the HAM10000 metadata table and show every distinct diagnosis code
# found in its 'dx' column, one per line.
df = pd.read_csv('ham10000/HAM10000_metadata.csv')
labels = pd.unique(df['dx'])
for dx_code in labels:
  print(dx_code)

bkl
nv
df
mel
vasc
bcc
akiec


In [6]:
# Keep only melanoma ('mel') and the classes listed as benign; rows with any
# other diagnosis code are left out of `data`.
benign_labels = ['bkl', 'nv', 'df', 'vasc']
kept_dx = ['mel', *benign_labels]
data = df.loc[df['dx'].isin(kept_dx)].copy()  # .copy() so new columns can be added safely
print(data)

         lesion_id      image_id   dx    dx_type   age     sex localization
0      HAM_0000118  ISIC_0027419  bkl      histo  80.0    male        scalp
1      HAM_0000118  ISIC_0025030  bkl      histo  80.0    male        scalp
2      HAM_0002730  ISIC_0026769  bkl      histo  80.0    male        scalp
3      HAM_0002730  ISIC_0025661  bkl      histo  80.0    male        scalp
4      HAM_0001466  ISIC_0031633  bkl      histo  75.0    male          ear
...            ...           ...  ...        ...   ...     ...          ...
9683   HAM_0000102  ISIC_0031547   nv  consensus  20.0    male         back
9684   HAM_0000102  ISIC_0032221   nv  consensus  20.0    male         back
9685   HAM_0005314  ISIC_0030693   nv  consensus  40.0    male         neck
9686   HAM_0003322  ISIC_0031649   nv  consensus  50.0  female         face
10014  HAM_0003521  ISIC_0032258  mel      histo  70.0  female         back

[9174 rows x 7 columns]


In [7]:
# Binary target: 1 where the diagnosis is melanoma ('mel'), 0 for every other row.
is_melanoma = data['dx'] == 'mel'
data['label'] = is_melanoma.astype('int64')
print(data)

         lesion_id      image_id   dx    dx_type   age     sex localization  \
0      HAM_0000118  ISIC_0027419  bkl      histo  80.0    male        scalp   
1      HAM_0000118  ISIC_0025030  bkl      histo  80.0    male        scalp   
2      HAM_0002730  ISIC_0026769  bkl      histo  80.0    male        scalp   
3      HAM_0002730  ISIC_0025661  bkl      histo  80.0    male        scalp   
4      HAM_0001466  ISIC_0031633  bkl      histo  75.0    male          ear   
...            ...           ...  ...        ...   ...     ...          ...   
9683   HAM_0000102  ISIC_0031547   nv  consensus  20.0    male         back   
9684   HAM_0000102  ISIC_0032221   nv  consensus  20.0    male         back   
9685   HAM_0005314  ISIC_0030693   nv  consensus  40.0    male         neck   
9686   HAM_0003322  ISIC_0031649   nv  consensus  50.0  female         face   
10014  HAM_0003521  ISIC_0032258  mel      histo  70.0  female         back   

       label  
0          0  
1          0  
2     

In [8]:
image_dir_1 = "/content/ham10000/HAM10000_images_part_1"
image_dir_2 = "/content/ham10000/HAM10000_images_part_2"

# Map every file name found in the two image folders to its full path.
# The folders are scanned in order, so a name present in both resolves to
# the copy in image_dir_2 (same precedence as updating the dict afterwards).
all_image_paths = {
    img_name: os.path.join(image_dir, img_name)
    for image_dir in (image_dir_1, image_dir_2)
    for img_name in os.listdir(image_dir)
}

# The metadata holds bare ids (e.g. ISIC_0027419); files are looked up as "<image_id>.jpg".
# Ids with no matching file get None.
data['image_path'] = data['image_id'].apply(lambda x: all_image_paths.get(f"{x}.jpg"))

# Report unresolved ids right away: a None path would otherwise only show up
# later as an error when the image is loaded.
missing_count = int(data['image_path'].isna().sum())
if missing_count:
  print(f"Warning: {missing_count} of {len(data)} metadata rows have no matching image file")

In [9]:
# Show the table again to check the newly added 'image_path' column.
print(data)

         lesion_id      image_id   dx    dx_type   age     sex localization  \
0      HAM_0000118  ISIC_0027419  bkl      histo  80.0    male        scalp   
1      HAM_0000118  ISIC_0025030  bkl      histo  80.0    male        scalp   
2      HAM_0002730  ISIC_0026769  bkl      histo  80.0    male        scalp   
3      HAM_0002730  ISIC_0025661  bkl      histo  80.0    male        scalp   
4      HAM_0001466  ISIC_0031633  bkl      histo  75.0    male          ear   
...            ...           ...  ...        ...   ...     ...          ...   
9683   HAM_0000102  ISIC_0031547   nv  consensus  20.0    male         back   
9684   HAM_0000102  ISIC_0032221   nv  consensus  20.0    male         back   
9685   HAM_0005314  ISIC_0030693   nv  consensus  40.0    male         neck   
9686   HAM_0003322  ISIC_0031649   nv  consensus  50.0  female         face   
10014  HAM_0003521  ISIC_0032258  mel      histo  70.0  female         back   

       label                                       

In [10]:
# Count rows per class (0 = benign, 1 = melanoma) to check class balance.
print(data['label'].value_counts())

label
0    8061
1    1113
Name: count, dtype: int64


The dataset is imbalanced, with significantly fewer melanoma samples, so data augmentation is applied to increase the number of melanoma images.

## 2.2. Preprocessing data

In [11]:
# Random geometric augmentations (rotation, zoom, shifts, shear, horizontal
# flip) used to generate extra melanoma samples. See the Keras
# ImageDataGenerator documentation for the units of each option.
augmentation_options = {
    "rotation_range": 20,
    "zoom_range": 0.15,
    "width_shift_range": 0.1,
    "height_shift_range": 0.1,
    "shear_range": 0.15,
    "horizontal_flip": True,
    "fill_mode": "nearest",
}
img_gen = ImageDataGenerator(**augmentation_options)

In [12]:
IMG_SIZE = 128            # images are resized to IMG_SIZE x IMG_SIZE
AUG_PER_MELANOMA = 6      # augmented copies added for each melanoma image
SEED = 42

# Seed NumPy's global RNG so the random augmentations are repeatable.
# NOTE(review): assumes ImageDataGenerator draws its transform parameters from
# np.random -- confirm for the installed Keras version.
np.random.seed(SEED)

# NOTE(review): augmentation happens here, before any train/validation split
# visible in this notebook. If a split is made later on these arrays, variants
# of one source image (and several images sharing a lesion_id) can land on
# both sides of it -- consider splitting first.

image_list, label_list = [], []
skipped = 0

for img_path, label in tqdm(zip(data['image_path'], data['label']), total=len(data)):
  # A metadata row without a matching file carries a None path; cv2.imread
  # rejects non-string input instead of returning None, so filter it here.
  if not isinstance(img_path, str):
    skipped += 1
    continue

  img = cv2.imread(img_path)
  if img is None:  # file exists in the listing but could not be decoded
    skipped += 1
    continue

  img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)   # OpenCV loads BGR; store RGB
  img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
  # Scale to [0, 1] as float32; plain `img/255.0` would yield float64 and
  # double the memory needed for the whole dataset.
  img = img.astype(np.float32) / 255.0

  image_list.append(img)
  label_list.append(label)

  # Oversample the minority class: every melanoma image also contributes
  # AUG_PER_MELANOMA randomly transformed copies.
  if label == 1:
    img_expanded = np.expand_dims(img, axis=0)  # flow() expects a batch axis
    aug_iter = img_gen.flow(img_expanded, batch_size=1)

    for _copy_idx in range(AUG_PER_MELANOMA):
      aug_img = next(aug_iter)[0]
      image_list.append(aug_img)
      label_list.append(label)

if skipped:
  print(f"Skipped {skipped} rows with a missing or unreadable image")




100%|██████████| 9174/9174 [01:40<00:00, 90.95it/s] 


In [13]:
# Class balance after augmentation: count samples per label.
labels = np.array(label_list)
print("Number of 0 label", np.sum(labels==0))
print("Number of 1 label", np.sum(labels==1))

Number of 0 label 8061
Numbef of 1 label 7791


In [None]:
# Persist the augmented dataset to Drive as two .npy files.
save_dir = '/content/drive/MyDrive/skin-cancer-detection-cnn/data'
os.makedirs(save_dir, exist_ok=True)  # np.save does not create missing folders

# Convert the Python lists to arrays with explicit dtypes before saving.
# Handing np.save the raw list makes it infer the dtype itself, which ends up
# float64 when any image is float64 -- twice the size and write time of float32.
images_arr = np.asarray(image_list, dtype=np.float32)
labels_arr = np.asarray(label_list, dtype=np.int64)

np.save(os.path.join(save_dir, 'skin_cancer__aug_images.npy'), images_arr)
np.save(os.path.join(save_dir, 'skin_cancer__aug_labels.npy'), labels_arr)

KeyboardInterrupt: 