In [1]:
# Show the current working directory of the notebook runtime.
!pwd

/content


In [2]:
import os
# Set KAGGLE_CONFIG_DIR before the `kaggle` command in the next cell runs.
# NOTE(review): presumably this is where the Kaggle CLI looks for its
# credentials file; confirm that the file has been uploaded to /content.
os.environ['KAGGLE_CONFIG_DIR'] = "/content"

In [3]:
# Download the HAM10000 segmentation-and-classification dataset archive with the Kaggle CLI.
!kaggle datasets download -d surajghuwalewala/ham1000-segmentation-and-classification

Downloading ham1000-segmentation-and-classification.zip to /content
100% 2.59G/2.59G [00:34<00:00, 105MB/s] 
100% 2.59G/2.59G [00:34<00:00, 80.6MB/s]


In [None]:
# Extract the downloaded archive; no destination is given, so files land in the current directory.
!unzip /content/ham1000-segmentation-and-classification.zip

In [5]:
# Install tensorflow_docs directly from its GitHub repository.
# NOTE(review): the install is unpinned, so re-runs may pick up a different revision.
!pip install git+https://github.com/tensorflow/docs

Collecting git+https://github.com/tensorflow/docs
  Cloning https://github.com/tensorflow/docs to /tmp/pip-req-build-l1wocnxl
  Running command git clone -q https://github.com/tensorflow/docs /tmp/pip-req-build-l1wocnxl
Building wheels for collected packages: tensorflow-docs
  Building wheel for tensorflow-docs (setup.py) ... [?25l[?25hdone
  Created wheel for tensorflow-docs: filename=tensorflow_docs-0.0.0.dev0-py3-none-any.whl size=179891 sha256=0c85488ee9314161192a6b591c23723ed5a72a3be1382d0aabc271afe7f56f6b
  Stored in directory: /tmp/pip-ephem-wheel-cache-em29cexy/wheels/cc/c4/d8/5341e93b6376c5c929c49469fce21155eb69cef1a4da4ce32c
Successfully built tensorflow-docs
Installing collected packages: tensorflow-docs
Successfully installed tensorflow-docs-0.0.0.dev0


In [1]:
import numpy as np 
import pandas as pd
import tensorflow as tf

from PIL import Image

from tensorflow import keras
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
from matplotlib.pyplot import imshow

In [2]:
def undersample(df, count, rs=42):
  """Cap every diagnosis class in ``df`` at ``count`` rows.

  Rows are grouped by the ``dx`` column. A class with at least ``count`` rows
  is randomly reduced to exactly ``count`` rows; a smaller class is kept
  whole (its rows are shuffled). Classes are emitted in order of first
  appearance in ``df``. Rows whose ``dx`` is missing are not part of any
  group and are therefore left out of the result.

  Args:
    df: DataFrame with a ``dx`` column.
    count: Maximum number of rows kept per class.
    rs: Seed passed to ``DataFrame.sample`` as ``random_state``.

  Returns:
    A new DataFrame with a fresh 0..n-1 index. When ``df`` has no groups
    an empty frame with the same columns is returned.
  """
  sampled = []
  # sort=False yields the groups in order of first appearance, which is the
  # order df['dx'].unique() would give.
  for _, group in df.groupby('dx', sort=False):
    # len(group) is the class size; converting a one-element Series with
    # int() is deprecated in recent pandas releases.
    if len(group) >= count:
      picked = group.sample(count, axis=0, random_state=rs)
    else:
      picked = group.sample(frac=1, axis=0, random_state=rs)
    sampled.append(picked)

  if not sampled:
    # pd.concat raises on an empty list, so return an empty frame instead.
    return df.iloc[0:0].reset_index(drop=True)
  return pd.concat(sampled, axis=0).reset_index(drop=True)

In [3]:
def oversample(df, count, rs=42):
  """Top up every diagnosis class in ``df`` to at least ``count`` rows.

  For each ``dx`` class with fewer than ``count`` rows, the missing rows are
  drawn from that class with replacement and appended after the original
  rows. Classes that already have ``count`` rows or more are left as they
  are.

  Args:
    df: DataFrame with a ``dx`` column.
    count: Target number of rows per class.
    rs: Seed passed to ``DataFrame.sample`` as ``random_state``.

  Returns:
    A new DataFrame made of ``df`` followed by the resampled rows. The
    original index labels are kept, so duplicated rows share a label with
    the row they were copied from.
  """
  pieces = [df]
  for _, group in df.groupby('dx'):
    shortfall = count - len(group)
    # Asking sample() for a negative number of rows raises ValueError, so
    # classes that are already large enough are skipped.
    if shortfall > 0:
      pieces.append(group.sample(shortfall, replace=True, random_state=rs))
  return pd.concat(pieces)

In [4]:
def img_np_convert(df, image_path, h, w):
  """Load every image referenced in ``df`` as a float32 array.

  ``df`` is modified in place: ``image_id`` is rewritten to the full file
  path (``image_path + image_id + '.jpg'``) and a new ``image`` column holds
  the resized pixel data. Because ``image_id`` is overwritten, calling this
  twice on the same frame prepends the path twice.

  Args:
    df: DataFrame with an ``image_id`` column of file stems.
    image_path: Directory prefix, including the trailing separator.
    h: Target image height in pixels.
    w: Target image width in pixels.

  Returns:
    The same ``df`` object, with ``image_id`` and ``image`` updated.
  """
  def _load(path):
    # The context manager releases the file handle once the pixels are read.
    with Image.open(path) as img:
      # PIL's resize expects (width, height); the array that comes back is
      # laid out height-first.
      return np.asarray(img.resize((w, h))).astype(np.float32)

  df['image_id'] = image_path + df['image_id'] + '.jpg'
  df['image'] = df['image_id'].map(_load)
  return df

In [5]:
def img_np_convert_scaled(df, image_path, h, w):
  """Load every image referenced in ``df`` as a float32 array divided by 255.

  ``df`` is modified in place: ``image_id`` is rewritten to the full file
  path (``image_path + image_id + '.jpg'``) and a new ``image`` column holds
  the resized pixel data divided by 255. Because ``image_id`` is
  overwritten, calling this twice on the same frame prepends the path twice.

  Args:
    df: DataFrame with an ``image_id`` column of file stems.
    image_path: Directory prefix, including the trailing separator.
    h: Target image height in pixels.
    w: Target image width in pixels.

  Returns:
    The same ``df`` object, with ``image_id`` and ``image`` updated.
  """
  def _load_scaled(path):
    # The context manager releases the file handle once the pixels are read.
    with Image.open(path) as img:
      # PIL's resize expects (width, height); the array that comes back is
      # laid out height-first.
      pixels = np.asarray(img.resize((w, h)))
    return (pixels / 255).astype(np.float32)

  df['image_id'] = image_path + df['image_id'] + '.jpg'
  df['image'] = df['image_id'].map(_load_scaled)
  return df

In [6]:
def my_split(df, train_size, test_size, val_size, rs=42):
  """Split ``df`` into train, test and validation frames.

  The frame is first split into a training part and a hold-out part of
  ``1 - train_size``. The hold-out part is then divided between test and
  validation in the ratio ``test_size : val_size``.

  Args:
    df: DataFrame to split.
    train_size: Fraction of rows for the training set.
    test_size: Fraction of rows for the test set.
    val_size: Fraction of rows for the validation set.
    rs: Seed passed to both ``train_test_split`` calls.

  Returns:
    ``(df_train, df_test, df_val)``, in that order. ``df_train`` gets a
    fresh 0..n-1 index with the previous index kept as an ``index`` column;
    the other two frames keep their original index labels.
  """
  holdout_fraction = 1 - train_size
  df_train, df_holdout = train_test_split(
      df, test_size=holdout_fraction, shuffle=True, random_state=rs)

  # Share of the hold-out rows that belongs to the test set. Using
  # val_size / test_size here would hand the test set the wrong share
  # (0.5 instead of 2/3 for a 0.2 / 0.1 request).
  test_share = test_size / (test_size + val_size)
  df_val, df_test = train_test_split(
      df_holdout, test_size=test_share, shuffle=True, random_state=rs)

  df_train = df_train.reset_index()
  return df_train, df_test, df_val

In [7]:
def weight_cal(df):
  """Compute per-class weights relative to the ``nv`` class.

  Class labels are the sorted unique values of ``df['dx']``. The weight of
  the class at sorted position ``i`` is ``count('nv') / count(class_i)``, so
  ``nv`` itself gets weight 1 and rarer classes get larger weights.

  Args:
    df: DataFrame with a ``dx`` column that contains the ``nv`` class.

  Returns:
    ``(class_weight, labels)`` where ``class_weight`` maps the integer
    position of each label to its weight and ``labels`` is the sorted list
    of class names.

  Raises:
    KeyError: If ``df['dx']`` has no ``nv`` rows.
  """
  labels = sorted(df['dx'].unique())
  count = df['dx'].value_counts()
  reference = count['nv']
  # Iterate over the classes actually present rather than a fixed number,
  # so frames with fewer or more than seven classes work too.
  class_weight = {idx: reference / count[label] for idx, label in enumerate(labels)}
  return class_weight, labels

In [8]:
def df_to_np1(df, labels=None):
  """Convert a prepared frame into an image array and one-hot targets.

  Args:
    df: DataFrame with an ``image`` column of equally shaped arrays and a
      ``dx`` column of class names.
    labels: Optional sequence of class names. When given, the target has
      one column per entry in this order, even for classes that do not
      occur in ``df``, so separately encoded splits share the same layout.
      A ``dx`` value that is not listed produces an all-zero row. When
      omitted, the columns are the classes present in ``df``.

  Returns:
    ``(image, target)``: the stacked images and the one-hot encoded ``dx``
    values as NumPy arrays.
  """
  image = np.asarray(df['image'].to_list())
  dx = df['dx']
  if labels is not None:
    # A categorical with fixed categories makes get_dummies emit a column
    # for every listed class, present or not.
    dx = pd.Categorical(dx, categories=labels)
  target = pd.get_dummies(dx).to_numpy()
  return image, target

In [9]:
def df_to_np2(df):
  """Convert a prepared frame into a flat feature matrix and label array.

  Each row of the feature matrix is the flattened image followed by ``age``
  and the one-hot encoded ``dx_type`` and ``localization`` columns. Missing
  ``age`` values are passed through unchanged.

  Args:
    df: DataFrame with ``image`` (equally shaped arrays), ``age``,
      ``dx_type``, ``localization`` and ``dx`` columns.

  Returns:
    ``(features, target)``: a 2-D float array of features and the raw
    ``dx`` values as a NumPy array.
  """
  encoded = pd.get_dummies(
      data=df[['image', 'age', 'dx_type', 'localization']],
      columns=['dx_type', 'localization'])

  i_feature = np.asarray([img.flatten() for img in encoded['image']])
  # Ask for float64 explicitly: when get_dummies produces boolean columns,
  # a plain to_numpy() on the mixed float/bool frame yields an object array.
  c_feature = encoded.drop(columns='image').to_numpy(dtype=np.float64)
  features = np.concatenate((i_feature, c_feature), axis=1)

  target = df['dx'].to_numpy()
  return features, target

In [10]:
def image_augment(df, target, count, size, rs=42):
  """Append augmented copies of images from one class to ``df`` in place.

  ``count`` rows of class ``target`` are sampled, and for each of them
  ``size`` randomly transformed images (rotation, shifts, horizontal flip)
  are generated. Every generated image is appended to ``df`` as a new row
  that copies all other column values from its source row.

  Args:
    df: DataFrame with ``dx`` and ``image`` columns. New rows are written
      at label ``len(df.index)``, which assumes a default 0..n-1 index;
      with any other index that label may already exist and its row would
      be overwritten.
    target: Value of ``dx`` whose images are augmented.
    count: Number of source rows to sample from the class.
    size: Number of augmented images generated per source row.
    rs: Seed for the row sampling only; the image generator is created
      without a seed, so the transformations differ between runs.

  Returns:
    None. ``df`` is modified in place.
  """
  candidates = df.groupby('dx').get_group(target)
  sources = candidates.sample(count, axis=0, random_state=rs)

  datagen = ImageDataGenerator(
    rotation_range = 20,
    width_shift_range = 0.1,
    height_shift_range = 0.1,
    horizontal_flip = True,
    fill_mode='nearest')

  # Find the image cell by name instead of assuming it is the last column.
  image_pos = df.columns.get_loc('image')

  for _, row in sources.iterrows():
    source_image = row['image']
    # flow() expects a batch, so add a leading axis of length one.
    batch = source_image.reshape((1, ) + source_image.shape)
    gen = datagen.flow(batch, batch_size=size)
    new_row = row.to_list()
    for _ in range(size):
      augmented = next(gen)
      new_row[image_pos] = augmented[0]
      df.loc[len(df.index)] = new_row
  return None

In [11]:
def prep_pipeline1(upper_size, h, w, aug_targets, aug_count, aug_size, rs=42,
                   metadata_csv='../content/drive/MyDrive/DSE_I2100/data/HAM10000_metadata.csv',
                   image_path=r'../content/images/'):
  """Build image-array train/test/validation sets with one-hot targets.

  Steps: read the metadata CSV, cap each class at ``upper_size`` rows, load
  the images as float32 arrays, split the frame, augment the requested
  classes in the training split, compute class weights from the training
  split and convert every split with ``df_to_np1``.

  Args:
    upper_size: Maximum number of rows kept per ``dx`` class.
    h: Target image height in pixels.
    w: Target image width in pixels.
    aug_targets: Iterable of ``dx`` values to augment in the training set.
    aug_count: Number of source rows sampled per augmented class.
    aug_size: Number of augmented images generated per source row.
    rs: Seed used for undersampling, splitting and augmentation sampling.
    metadata_csv: Path of the metadata CSV file.
    image_path: Directory prefix of the image files, including the
      trailing separator.

  Returns:
    ``(X_train, y_train), (X_test, y_test), (X_val, y_val), weight, labels``
    where ``weight`` and ``labels`` come from ``weight_cal`` on the
    training split.
  """
  df_o = pd.read_csv(metadata_csv)
  # Forward rs so the undersampling follows the caller's seed as well.
  df_u = undersample(df_o, upper_size, rs)

  df = img_np_convert(df_u, image_path, h, w)

  df_train, df_test, df_val = my_split(df, 0.7, 0.2, 0.1, rs)

  # Only the training split is augmented; df_train grows in place.
  for target in aug_targets:
    image_augment(df_train, target, aug_count, aug_size, rs)

  weight, labels = weight_cal(df_train)

  X_train, y_train = df_to_np1(df_train)
  X_test, y_test = df_to_np1(df_test)
  X_val, y_val = df_to_np1(df_val)
  return (X_train, y_train), (X_test, y_test), (X_val, y_val), weight, labels

In [12]:
def prep_pipeline2(upper_size, h, w, aug_targets, aug_count, aug_size, rs=42,
                   metadata_csv='../content/drive/MyDrive/DSE_I2100/data/HAM10000_metadata.csv',
                   image_path=r'../content/images/'):
  """Build flat-feature train/test/validation sets with raw class labels.

  Steps: read the metadata CSV, cap each class at ``upper_size`` rows, load
  the images as float32 arrays divided by 255, split the frame, augment the
  requested classes in the training split, compute class weights from the
  training split and convert every split with ``df_to_np2``.

  Args:
    upper_size: Maximum number of rows kept per ``dx`` class.
    h: Target image height in pixels.
    w: Target image width in pixels.
    aug_targets: Iterable of ``dx`` values to augment in the training set.
    aug_count: Number of source rows sampled per augmented class.
    aug_size: Number of augmented images generated per source row.
    rs: Seed used for undersampling, splitting and augmentation sampling.
    metadata_csv: Path of the metadata CSV file.
    image_path: Directory prefix of the image files, including the
      trailing separator.

  Returns:
    ``(X_train, y_train), (X_test, y_test), (X_val, y_val), weight, labels``
    where ``weight`` and ``labels`` come from ``weight_cal`` on the
    training split.
  """
  df_o = pd.read_csv(metadata_csv)
  # Forward rs so the undersampling follows the caller's seed as well.
  df_u = undersample(df_o, upper_size, rs)

  df = img_np_convert_scaled(df_u, image_path, h, w)

  df_train, df_test, df_val = my_split(df, 0.7, 0.2, 0.1, rs)

  # Only the training split is augmented; df_train grows in place.
  for target in aug_targets:
    image_augment(df_train, target, aug_count, aug_size, rs)

  weight, labels = weight_cal(df_train)

  X_train, y_train = df_to_np2(df_train)
  X_test, y_test = df_to_np2(df_test)
  X_val, y_val = df_to_np2(df_val)
  return (X_train, y_train), (X_test, y_test), (X_val, y_val), weight, labels

In [None]:
# Settings for the image-array pipeline.
h = 240  # target image height passed to the loader
w = 240  # target image width passed to the loader
aug_targets = ['mel', 'bcc']  # dx classes augmented in the training split
aug_count = 5  # source rows sampled per augmented class
aug_size = 2  # augmented images generated per source row

# 3000 is the per-class row cap applied by undersample.
train_set, test_set, val_set, class_weight, labels = prep_pipeline1(3000, h, w, aug_targets, aug_count, aug_size)

(6310, 7)
converted
splitted
(4416, 9)
(4426, 9)
(4436, 9)
(4436, 9)


In [13]:
# Settings for the flat-feature pipeline. These rebind the same names
# (h, w, train_set, ...) that the prep_pipeline1 cell assigns.
h = 90  # target image height passed to the loader
w = 90  # target image width passed to the loader
aug_targets = ['mel', 'bcc']  # dx classes augmented in the training split
aug_count = 5  # source rows sampled per augmented class
aug_size = 2  # augmented images generated per source row

# 3000 is the per-class row cap applied by undersample.
train_set, test_set, val_set, class_weight, labels = prep_pipeline2(3000, h, w, aug_targets, aug_count, aug_size)

In [16]:
# Shape of the training feature array (first element of the (X, y) pair).
train_set[0].shape

(4436, 24320)

array(['mel', 'bcc', 'nv', ..., 'bcc', 'bcc', 'bcc'], dtype=object)