<a href="https://colab.research.google.com/github/MattWang-Portfolio/MattWang-Portfolio-data-processing-algorithm-Skin_Cancer/blob/main/skincancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Skin Cancer MNIST: HAM10000 https://www.kaggle.com/kmader/skin-cancer-mnist-ham10000



In [None]:
# keras.utils: used for one-hot encoding
# sklearn.model_selection: splits the data into training and test sets
# os: used to create and delete files
# PIL: (image processing library) used to load images
# seed: fixes the random seed so randomly generated data is the same on every run.
#       The number can be changed to your own student ID (or any other number).
import numpy as np
import pandas as pd
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import os
from PIL import Image
np.random.seed(409570391)

In [None]:
# Abbreviation -> full name for the 7 skin-lesion classes.
lesion_type_dict = dict([
    ('nv', 'Melanocytic nevi'),
    ('mel', 'Melanoma'),
    ('bkl', 'Benign keratosis-like lesions '),
    ('bcc', 'Basal cell carcinoma'),
    ('akiec', 'Actinic keratoses'),
    ('vasc', 'Vascular lesions'),
    ('df', 'Dermatofibroma'),
])


In [None]:
# Build a pandas Categorical from lesion_type_dict and display its integer codes.
# NOTE(review): presumably the categories come from the dict's keys — confirm.
pd.Categorical(lesion_type_dict).codes


In [None]:
# Download from Google Drive
!gdown --id 1orCwIp_avT4Yg9lCT8pjImwzqBg4_xiI
!unzip project03.zip

In [None]:
# Read the image data. Per the original note: 28*28*3 pixel-value columns
# (pixel0000-pixel2351) plus one class column named `label`.
load_img = pd.read_csv('project3_train.csv')

In [None]:
# Preview the first rows of the training table.
load_img.head()

In [None]:
# (rows, columns) of the training table.
load_img.shape

In [None]:
# Peek at the feature matrix: every column except the last, as a NumPy array.
load_img.iloc[: , :-1].values


In [None]:
# Features: every column except the last. Labels: the last column.
X_img = load_img.iloc[:, :-1].values
y_label = load_img.iloc[:, -1].values


In [None]:
# Rebuild the pixel table as a fresh ndarray (via a Python list), then fold each
# flat row into a 3-D image: 28 x 28 x 3 (height x width x RGB).
X_img_train = np.asarray(X_img.tolist()).reshape(len(X_img), 28, 28, 3)

In [None]:
# Check the number of training images, their size and channel count, and the label count.
shape_report = f"train data: images: {X_img_train.shape}  labels: {y_label.shape}"
print(shape_report)

In [None]:
# Normalize: divide by 255 (image values range from 0 to 255).
# This must run: later cells pass X_img_train_normalize to train_test_split,
# and leaving the assignment commented out raises NameError on a fresh kernel.
X_img_train_normalize = X_img_train.astype('float32') / 255.0

In [None]:
# One-hot encode the label column with np_utils.to_categorical() (each label
# becomes a row of 0s and 1s).
# This must run: later cells pass y_label_train_OneHot to train_test_split,
# and leaving the assignment commented out raises NameError on a fresh kernel.
y_label_train_OneHot = np_utils.to_categorical(y_label)

In [1]:
# Check how many classes the labels contain.
# Per the original note: 8008 samples in total, each encoded as a combination of seven 0/1 values.
# Left disabled: this line only works once y_label_train_OneHot has been defined.
#y_label_train_OneHot.shape

建立與訓練CNN模型


In [None]:
# Import Sequential and the layer classes (Dense, Dropout, Activation, Flatten,
# Conv2D, MaxPooling2D, ZeroPadding2D) from keras.
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D, ZeroPadding2D

In [None]:
# Design your CNN model

# Create the Keras Sequential model; layers are appended to it in the next cell.
model_cnn = Sequential()

In [None]:
# Settings shared by every convolution: 3x3 kernel, ReLU activation, 'same' padding.
conv_kwargs = dict(kernel_size=(3,3), activation='relu', padding='same')

# (filters, number of stacked convolutions) per stage; each stage is followed
# by a 2x2 max-pooling layer.
conv_stages = [(32, 3), (64, 2), (128, 2), (256, 2)]

for stage_idx, (n_filters, n_convs) in enumerate(conv_stages):
    for conv_idx in range(n_convs):
        if stage_idx == 0 and conv_idx == 0:
            # Only the very first layer declares the input shape (28x28 images, 3 channels).
            model_cnn.add(Conv2D(filters=n_filters, input_shape=(28,28,3), **conv_kwargs))
        else:
            model_cnn.add(Conv2D(filters=n_filters, **conv_kwargs))
    model_cnn.add(MaxPooling2D(pool_size=(2,2)))

# Classifier head: flatten, one hidden dense layer with dropout, then a
# 7-way softmax output.
model_cnn.add(Flatten())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dropout(0.3))
model_cnn.add(Dense(7, activation='softmax'))

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table.
model_cnn.summary()


In [None]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop training when val_loss has not improved for 20 epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=20,
    verbose=1,
)

# Multiply the learning rate by 0.1 when val_loss has not improved for 10 epochs,
# never going below 1e-6.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=10,
    min_lr=1e-6,
    verbose=1,
)

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss,
# and accuracy reported as the metric.
model_cnn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
epochs=150
batch_size=128

# Hold out 20% of the (shuffled) training data for validation.
x_train, x_validate, y_train, y_validate = train_test_split(X_img_train_normalize, y_label_train_OneHot, test_size=0.2, shuffle=True)

# Train the model. This call must run: the plotting cells read `train_history`,
# and the evaluation / submission cells would otherwise use an untrained model.
# The early-stopping and learning-rate callbacks defined earlier are passed in,
# so `epochs` acts as an upper bound on the number of epochs.
train_history = model_cnn.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_validate, y_validate),
    verbose=1,
    callbacks=[early_stopping, reduce_lr],
)


In [None]:
import matplotlib.pyplot as plt
def show_train_history(train_acc,test_acc, yAxisName, history=None):
  """Plot a training curve and its validation counterpart against the epoch.

  Args:
    train_acc: key of the training series in ``history.history``
      (for example ``'accuracy'`` or ``'loss'``).
    test_acc: key of the validation series in ``history.history``
      (for example ``'val_accuracy'`` or ``'val_loss'``).
    yAxisName: label for the y axis.
    history: object exposing a ``history`` dict of per-epoch series. When
      omitted, the notebook-level ``train_history`` variable is used, which
      keeps existing three-argument calls working.

  Raises:
    KeyError: if either key is missing from ``history.history``.
  """
  if history is None:
    history = train_history
  series = history.history
  plt.plot(series[train_acc])
  plt.plot(series[test_acc])
  plt.title('Train History')
  plt.ylabel(yAxisName)
  plt.xlabel('Epoch')
  plt.legend(['train','val'], loc='upper left')
  # Call show(); a bare `plt.show` only references the function and does nothing.
  plt.show()


In [None]:
# Plot the 'accuracy' and 'val_accuracy' series from the training history.
show_train_history('accuracy','val_accuracy','Accuracy')


In [None]:
# Plot the 'loss' and 'val_loss' series from the training history.
show_train_history('loss','val_loss','Loss')


In [None]:
# Evaluate the model on the held-out validation split.
# NOTE(review): the model is compiled with metrics=['accuracy'], so evaluate
# presumably returns [loss, accuracy] and scores[1] is the accuracy — confirm.
scores = model_cnn.evaluate(x_validate, y_validate, verbose=0)
scores[1]

In [None]:
# Predicted class = index of the highest score per validation sample.
validate_scores = model_cnn.predict(x_validate)
prediction = validate_scores.argmax(axis=-1)

# True class = index of the 1 in each one-hot validation label.
validate_label = y_validate.argmax(axis=-1)

# Cross-tabulate true labels (rows) against predicted labels (columns).
pd.crosstab(validate_label, prediction, rownames=['label'], colnames=['predict'])

In [None]:
# Use the final model to predict on the test data.
load_test_img = pd.read_csv('project3_test.csv')
# Test table as a NumPy array.
# NOTE(review): assumes the file holds only the pixel columns — confirm.
img_test = load_test_img.values

In [None]:
# Fold each flat test row into a 28 x 28 x 3 image, then scale pixels by 1/255
# exactly as done for the training images.
n_test_images = len(img_test)
x_test = img_test.reshape((n_test_images, 28, 28, 3))
x_test_normalize = x_test.astype(np.float32) / 255.0

In [None]:
# Submission table: a zero-padded four-digit Id per test image and the
# predicted class index (argmax over the model's output scores).
test_scores = model_cnn.predict(x_test_normalize)
df_submit = pd.DataFrame({
    'Id': [f'{i:04d}' for i in range(len(x_test_normalize))],
    'Label': np.argmax(test_scores, axis=-1),
})

In [None]:
# Write the submission file without the DataFrame index column.
# index=False is the documented boolean form; index=None only worked because None is falsy.
df_submit.to_csv('submission_CNN7.csv', index=False)


In [None]:
# Second 80/20 split of the normalized images and one-hot labels, with a fixed
# random_state so the partition is the same on every run.
x_img_train, x_img_test, y_label_train, y_label_test = train_test_split(
    X_img_train_normalize,
    y_label_train_OneHot,
    test_size=0.2,
    random_state=2,
)


In [None]:
# x_img_train / x_img_test are split from X_img_train_normalize, which has
# already been divided by 255. Dividing again would shrink the pixels to
# [0, 1/255], so only the float32 cast is applied here.
x_img_train_normalize = x_img_train.astype('float32')
x_img_test_normalize = x_img_test.astype('float32')