Import thư viện

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [None]:
# Mount Google Drive into the Colab runtime; the dataset directory and the
# checkpoint directory used by this notebook both live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


Load và chia dữ liệu cho quá trình huấn luyện

In [None]:
# --- Input pipeline settings -------------------------------------------------
# Spatial size every image is resized to, and the number of images per batch.
IMG_SIZE = (224, 224)
BATCH_SIZE = 16

# --- Dataset locations on the mounted Drive ----------------------------------
# Root folder of the ISIC 2018 data.
data_dir = '/content/drive/MyDrive/ISIC_2018/'

# Paths relative to `data_dir`: the image folder (nested one level inside a
# folder of the same name) and the ground-truth CSV.
images_subdir = 'ISIC2018_Task3_Training_Input/ISIC2018_Task3_Training_Input'
labels_filename = 'ISIC2018_Task3_Training_GroundTruth.csv'

images_dir = os.path.join(data_dir, images_subdir)
labels_path = os.path.join(data_dir, labels_filename)


In [None]:
# Count the rows of the ground-truth CSV (presumably one row per image).
# The file is read directly here because the labels DataFrame `df` is only
# created in a later cell; referencing `df` at this point raises NameError on
# a fresh kernel (Restart & Run All).
total_images = pd.read_csv(labels_path).shape[0]
print(f"Tổng số ảnh: {total_images}")


Tổng số ảnh: 10015


In [None]:
# Load the ground-truth table from the labels CSV.
df = pd.read_csv(labels_path)

# Every column after the first one ('image') is treated as a diagnosis column.
diagnosis_columns = df.columns[1:]

# Collapse the diagnosis columns into a single 'label' column holding, for each
# row, the name of the diagnosis column with the largest value.
df['label'] = df[diagnosis_columns].idxmax(axis=1)

# Stratified split: 75% train, 25% held out (`temp_df`). A fixed random_state
# keeps the split reproducible across runs.
train_df, temp_df = train_test_split(df, test_size=0.25, stratify=df['label'], random_state=42)

# Split the held-out 25% into 60% / 40%, i.e. validation (15% of all rows) and
# test (10% of all rows), again stratified by label.
val_df, test_df = train_test_split(temp_df, test_size=0.4, stratify=temp_df['label'], random_state=42)

# Sanity check: the three splits must partition the full table.
assert len(train_df) + len(val_df) + len(test_df) == len(df), "split sizes do not add up to the dataset size"


In [None]:
def _with_jpg_extension(filename):
    """Return `filename` unchanged if it ends with '.jpg', otherwise with '.jpg' appended."""
    return filename if filename.endswith('.jpg') else filename + '.jpg'


# The 'image' column is later used as the file-name column when building the
# generators, so make sure every entry carries the '.jpg' extension.
for split_df in (train_df, val_df, test_df):
    split_df['image'] = split_df['image'].apply(_with_jpg_extension)


In [None]:
# Report how many rows ended up in the train / validation / test splits,
# one line per split.
print(
    f"Tổng số ảnh trong tập huấn luyện (train): {len(train_df)}",
    f"Tổng số ảnh trong tập xác thực (validation): {len(val_df)}",
    f"Tổng số ảnh trong tập kiểm tra (test): {len(test_df)}",
    sep="\n",
)


Tổng số ảnh trong tập huấn luyện (train): 7511
Tổng số ảnh trong tập xác thực (validation): 1502
Tổng số ảnh trong tập kiểm tra (test): 1002


In [None]:
# Show the column index of the labels table (image id, diagnosis columns and
# the derived 'label' column) as a quick schema check.
label_table_columns = df.columns
print(label_table_columns)


Index(['image', 'MEL', 'NV', 'BCC', 'AKIEC', 'BKL', 'DF', 'VASC', 'label'], dtype='object')


Xử lý mất cân bằng dữ liệu bằng trọng số lớp (class weights)

In [None]:
# Class names of the training split in sorted order. When no explicit
# `classes` list is given, flow_from_dataframe also indexes the classes in
# sorted (alphanumeric) order, so position i in this array corresponds to
# class index i in the generators' one-hot labels.
class_names = np.unique(train_df['label'])

# 'balanced' weighting: each class gets a weight inversely proportional to its
# frequency in the training split, so rare classes count more in the loss.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_names,
    y=train_df['label']
)

# Keras' `class_weight` argument must map integer class indices to weights.
# Keying the dictionary by class-name strings (as before) does not match the
# integer indices Keras looks up, so the weighting would not be applied.
class_weights_dict = {
    class_index: float(weight)
    for class_index, weight in enumerate(class_weights)
}


Tăng cường dữ liệu

In [None]:
# Random geometric augmentation applied to training images only.
augmentation_options = dict(
    rotation_range=30,        # random rotation within a 30-degree range
    width_shift_range=0.1,    # horizontal shift, up to 10% of the width
    height_shift_range=0.1,   # vertical shift, up to 10% of the height
    shear_range=0.2,          # shear intensity
    zoom_range=0.2,           # random zoom in/out by up to 20%
    horizontal_flip=True,     # random left-right flip
    fill_mode='nearest',      # fill newly exposed pixels with the nearest value
)

# Training images: VGG16 preprocessing plus the augmentation above.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    **augmentation_options
)

# Validation and test images: VGG16 preprocessing only, no augmentation.
val_test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Arguments shared by all three generators: where the image files live, which
# DataFrame columns hold the file name and the label, the resize target, the
# batch size, and one-hot ('categorical') label encoding.
flow_options = dict(
    directory=images_dir,
    x_col='image',
    y_col='label',
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
)

train_generator = train_datagen.flow_from_dataframe(train_df, **flow_options)

val_generator = val_test_datagen.flow_from_dataframe(val_df, **flow_options)

# The test generator keeps the DataFrame order (shuffle=False) so predictions
# can be lined up with the rows of `test_df`.
test_generator = val_test_datagen.flow_from_dataframe(
    test_df,
    shuffle=False,
    **flow_options
)


Found 7511 validated image filenames belonging to 7 classes.
Found 1502 validated image filenames belonging to 7 classes.
Found 1002 validated image filenames belonging to 7 classes.


Xây dựng mô hình phân loại dựa trên VGG16 (transfer learning, đóng băng các lớp tích chập)

In [None]:
# Number of output classes = number of distinct labels in the labels table.
num_classes = df['label'].nunique()

# VGG16 backbone with ImageNet weights and without its original fully
# connected top; the input is an RGB image of size IMG_SIZE.
input_shape = (*IMG_SIZE, 3)
base_model = VGG16(include_top=False, weights='imagenet', input_shape=input_shape)

# Freeze every backbone layer so only the new classification head is trained.
for layer in base_model.layers:
    layer.trainable = False

# Classification head stacked on the backbone output:
# flatten -> 256-unit ReLU layer -> batch normalisation -> 50% dropout.
head_layers = [
    Flatten(),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
]
x = base_model.output
for head_layer in head_layers:
    x = head_layer(x)

# Softmax output with one unit per class.
predictions = Dense(num_classes, activation='softmax')(x)

# Full model: backbone input -> class probabilities.
model = Model(inputs=base_model.input, outputs=predictions)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Huấn luyện mô hình và lưu checkpoint vào drive

In [None]:
# Adam, EarlyStopping, ModelCheckpoint and os come from the import cell at the
# top of the notebook, and Drive is mounted in the setup cell that follows it.
# They are not re-imported / re-mounted here: re-importing the callbacks and
# optimizer from the standalone `keras` package shadowed the `tensorflow.keras`
# versions the model was built with.

# Compile with Adam at a small learning rate and categorical cross-entropy,
# matching the one-hot labels produced by the generators.
optimizer = Adam(learning_rate=1e-5)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Directory on Drive where the best checkpoint is written; create it if needed.
checkpoint_dir = '/content/drive/MyDrive/checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# Stop when validation loss has not improved for 5 epochs and roll back to the
# best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Keep only the best model on disk, judged on the same metric as early
# stopping (stated explicitly instead of relying on the default).
model_checkpoint = ModelCheckpoint(
    os.path.join(checkpoint_dir, 'best_model.h5'),
    monitor='val_loss',
    save_best_only=True
)

# Train for at most 50 epochs, validating after each epoch and weighting the
# loss per class with `class_weights_dict`.
history = model.fit(
    train_generator,
    epochs=50,
    validation_data=val_generator,
    class_weight=class_weights_dict,
    callbacks=[early_stop, model_checkpoint]
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Epoch 1/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - accuracy: 0.2310 - loss: 2.8082



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2696s[0m 6s/step - accuracy: 0.2311 - loss: 2.8078 - val_accuracy: 0.5087 - val_loss: 1.7214
Epoch 2/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 351ms/step - accuracy: 0.3763 - loss: 2.3475



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 398ms/step - accuracy: 0.3764 - loss: 2.3474 - val_accuracy: 0.5366 - val_loss: 1.6623
Epoch 3/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 338ms/step - accuracy: 0.4279 - loss: 2.0561



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 385ms/step - accuracy: 0.4279 - loss: 2.0561 - val_accuracy: 0.5965 - val_loss: 1.4550
Epoch 4/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 340ms/step - accuracy: 0.4573 - loss: 1.9937



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 377ms/step - accuracy: 0.4574 - loss: 1.9936 - val_accuracy: 0.6391 - val_loss: 1.3262
Epoch 5/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 343ms/step - accuracy: 0.4945 - loss: 1.8494



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 390ms/step - accuracy: 0.4945 - loss: 1.8493 - val_accuracy: 0.6571 - val_loss: 1.2711
Epoch 6/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 335ms/step - accuracy: 0.5234 - loss: 1.7521



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 371ms/step - accuracy: 0.5234 - loss: 1.7521 - val_accuracy: 0.6891 - val_loss: 1.1814
Epoch 7/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 371ms/step - accuracy: 0.5234 - loss: 1.7152 - val_accuracy: 0.6804 - val_loss: 1.2016
Epoch 8/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 367ms/step - accuracy: 0.5421 - loss: 1.6315 - val_accuracy: 0.6644 - val_loss: 1.2549
Epoch 9/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 333ms/step - accuracy: 0.5666 - loss: 1.5163



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 380ms/step - accuracy: 0.5666 - loss: 1.5163 - val_accuracy: 0.6964 - val_loss: 1.1071
Epoch 10/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 339ms/step - accuracy: 0.5800 - loss: 1.4577



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 375ms/step - accuracy: 0.5801 - loss: 1.4577 - val_accuracy: 0.7011 - val_loss: 1.0846
Epoch 11/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 338ms/step - accuracy: 0.6003 - loss: 1.4105



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 376ms/step - accuracy: 0.6003 - loss: 1.4104 - val_accuracy: 0.7197 - val_loss: 1.0164
Epoch 12/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 372ms/step - accuracy: 0.6189 - loss: 1.3222 - val_accuracy: 0.7091 - val_loss: 1.0591
Epoch 13/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 369ms/step - accuracy: 0.6103 - loss: 1.3332 - val_accuracy: 0.7190 - val_loss: 1.0438
Epoch 14/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 336ms/step - accuracy: 0.6226 - loss: 1.3406



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 372ms/step - accuracy: 0.6226 - loss: 1.3405 - val_accuracy: 0.7317 - val_loss: 1.0043
Epoch 15/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 333ms/step - accuracy: 0.6368 - loss: 1.2385



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 369ms/step - accuracy: 0.6369 - loss: 1.2384 - val_accuracy: 0.7277 - val_loss: 0.9870
Epoch 16/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 333ms/step - accuracy: 0.6540 - loss: 1.2044



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 369ms/step - accuracy: 0.6540 - loss: 1.2043 - val_accuracy: 0.7437 - val_loss: 0.9298
Epoch 17/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 338ms/step - accuracy: 0.6538 - loss: 1.1769



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 374ms/step - accuracy: 0.6538 - loss: 1.1769 - val_accuracy: 0.7417 - val_loss: 0.9112
Epoch 18/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 373ms/step - accuracy: 0.6661 - loss: 1.1703 - val_accuracy: 0.7497 - val_loss: 0.9177
Epoch 19/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 333ms/step - accuracy: 0.6728 - loss: 1.0957



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 369ms/step - accuracy: 0.6728 - loss: 1.0957 - val_accuracy: 0.7530 - val_loss: 0.8887
Epoch 20/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 338ms/step - accuracy: 0.6916 - loss: 1.0208



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 375ms/step - accuracy: 0.6916 - loss: 1.0208 - val_accuracy: 0.7557 - val_loss: 0.8734
Epoch 21/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 338ms/step - accuracy: 0.6855 - loss: 1.0392



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 374ms/step - accuracy: 0.6855 - loss: 1.0392 - val_accuracy: 0.7676 - val_loss: 0.8501
Epoch 22/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 333ms/step - accuracy: 0.7002 - loss: 1.0370



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 369ms/step - accuracy: 0.7002 - loss: 1.0369 - val_accuracy: 0.7683 - val_loss: 0.8136
Epoch 23/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 338ms/step - accuracy: 0.7122 - loss: 0.9770



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 385ms/step - accuracy: 0.7122 - loss: 0.9771 - val_accuracy: 0.7736 - val_loss: 0.7927
Epoch 24/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 370ms/step - accuracy: 0.7150 - loss: 0.9666 - val_accuracy: 0.7710 - val_loss: 0.7968
Epoch 25/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 365ms/step - accuracy: 0.7139 - loss: 0.9396 - val_accuracy: 0.7823 - val_loss: 0.8178
Epoch 26/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 365ms/step - accuracy: 0.7305 - loss: 0.9182 - val_accuracy: 0.7770 - val_loss: 0.7986
Epoch 27/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 332ms/step - accuracy: 0.7314 - loss: 0.8895



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 367ms/step - accuracy: 0.7313 - loss: 0.8895 - val_accuracy: 0.7843 - val_loss: 0.7831
Epoch 28/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 337ms/step - accuracy: 0.7436 - loss: 0.8584



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 373ms/step - accuracy: 0.7435 - loss: 0.8585 - val_accuracy: 0.7770 - val_loss: 0.7772
Epoch 29/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 336ms/step - accuracy: 0.7321 - loss: 0.9000



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 382ms/step - accuracy: 0.7321 - loss: 0.9000 - val_accuracy: 0.7756 - val_loss: 0.7768
Epoch 30/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 331ms/step - accuracy: 0.7399 - loss: 0.8728



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 367ms/step - accuracy: 0.7400 - loss: 0.8727 - val_accuracy: 0.7736 - val_loss: 0.7533
Epoch 31/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 365ms/step - accuracy: 0.7459 - loss: 0.8141 - val_accuracy: 0.7750 - val_loss: 0.7700
Epoch 32/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 332ms/step - accuracy: 0.7567 - loss: 0.7742



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 368ms/step - accuracy: 0.7567 - loss: 0.7742 - val_accuracy: 0.7816 - val_loss: 0.7290
Epoch 33/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 365ms/step - accuracy: 0.7613 - loss: 0.7607 - val_accuracy: 0.7790 - val_loss: 0.7414
Epoch 34/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 365ms/step - accuracy: 0.7549 - loss: 0.7601 - val_accuracy: 0.7810 - val_loss: 0.7329
Epoch 35/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 366ms/step - accuracy: 0.7662 - loss: 0.7603 - val_accuracy: 0.7776 - val_loss: 0.7293
Epoch 36/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 370ms/step - accuracy: 0.7698 - loss: 0.7345 - val_accuracy: 0.7909 - val_loss: 0.7310
Epoch 37/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 341ms/step - accuracy:



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 378ms/step - accuracy: 0.7787 - loss: 0.7448 - val_accuracy: 0.7816 - val_loss: 0.7210
Epoch 38/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 340ms/step - accuracy: 0.7745 - loss: 0.7207



[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 376ms/step - accuracy: 0.7745 - loss: 0.7207 - val_accuracy: 0.7843 - val_loss: 0.6980
Epoch 39/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 383ms/step - accuracy: 0.7787 - loss: 0.6931 - val_accuracy: 0.7870 - val_loss: 0.7262
Epoch 40/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 368ms/step - accuracy: 0.7708 - loss: 0.7008 - val_accuracy: 0.7916 - val_loss: 0.7029
Epoch 41/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 369ms/step - accuracy: 0.7721 - loss: 0.7072 - val_accuracy: 0.7896 - val_loss: 0.7155
Epoch 42/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 368ms/step - accuracy: 0.7858 - loss: 0.6698 - val_accuracy: 0.7916 - val_loss: 0.7030
Epoch 43/50
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 368ms/step - accurac

In [None]:
# Evaluate the trained model on the held-out test generator. With the model
# compiled with a single 'accuracy' metric, evaluate() yields the loss followed
# by the accuracy.
test_metrics = model.evaluate(test_generator)
test_loss, test_accuracy = test_metrics

print(f'Loss trên tập test: {test_loss}')
print(f'Độ chính xác trên tập test: {test_accuracy}')


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 5s/step - accuracy: 0.7934 - loss: 0.7469
Loss trên tập test: 0.6680651903152466
Độ chính xác trên tập test: 0.7984032034873962
