# Setup

## load google driver

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## load the dataset

In [2]:
import os
import json
import cv2
import numpy as np

dataset_dir = '/content/drive/MyDrive/P-MI/PJ1/skin_lesion_dataset'


def load_dataset(data_dir):
    images = []
    lesion_class = []
    symmetry = []
    masks = []

    for subdir in os.listdir(data_dir):
        subdir_path = os.path.join(data_dir, subdir)

        # check the subfile
        if os.path.isdir(subdir_path):
            # load the images
            image_folder = os.path.join(subdir_path, subdir + '_Dermoscopic_Image')
            image_path = os.path.join(image_folder, os.listdir(image_folder)[0])
            image = cv2.imread(image_path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image = cv2.resize(image, (224, 224))
            image = image / 255.0
            images.append(image)

            # load the label
            label_file = os.path.join(subdir_path, subdir + '_label.json')
            with open(label_file, 'r') as f:
                label_dict = json.load(f)

            # load the class information
            class_label = label_dict['Lesion Class']
            lesion_class.append(class_label)

            # load the Asymmetry information
            symmetry_label = label_dict['Asymmetry Label']
            symmetry.append(symmetry_label)

            # load the lesion mask
            mask_folder = os.path.join(subdir_path, subdir + '_lesion')
            mask_path = os.path.join(mask_folder, os.listdir(mask_folder)[0])
            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
            mask = cv2.resize(mask, (224, 224))
            mask = mask / 255.0
            masks.append(mask)

    return np.array(images), np.array(lesion_class), np.array(symmetry), np.array(masks)

# load the dataset
images, lesion_class, symmetry, masks = load_dataset(dataset_dir)

print('Number of images:', len(images))
print('Number of lesion_class labels:', len(lesion_class))
print('Number of symmetry labels:', len(symmetry))
print('Number of masks:', len(masks))

Number of images: 150
Number of lesion_class labels: 150
Number of symmetry labels: 150
Number of masks: 150


## prepare training set and test set

In [3]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
train_images, test_images, train_masks, test_masks, train_classes, test_classes, train_symmetry, test_symmetry = train_test_split(
    images, masks, lesion_class, symmetry, test_size=0.2, random_state=42)

# Print the sizes of the training and testing sets
print("Training set size:")
print("- Images:", len(train_images))
print("- Masks:", len(train_masks))
print("- Lesion Classes:", len(train_classes))
print("- Symmetry Labels:", len(train_symmetry))

print("\nTesting set size:")
print("- Images:", len(test_images))
print("- Masks:", len(test_masks))
print("- Lesion Classes:", len(test_classes))
print("- Symmetry Labels:", len(test_symmetry))

Training set size:
- Images: 120
- Masks: 120
- Lesion Classes: 120
- Symmetry Labels: 120

Testing set size:
- Images: 30
- Masks: 30
- Lesion Classes: 30
- Symmetry Labels: 30


# features extraction

In [4]:
import cv2
import numpy as np

symmetry_mapping = {
    'Fully Symmetric': 0,
    'Symmetric in 1 axes': 1,
    'Fully Asymmetric': 2
}

def extract_features(image, mask, symmetry):
    # Get the lesion area
    image = cv2.convertScaleAbs(image)
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    mask= cv2.convertScaleAbs(mask)
    lesion = cv2.bitwise_and(gray_image, gray_image, mask=mask)

    # Shape features
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    area = cv2.contourArea(contours[0])
    perimeter = cv2.arcLength(contours[0], True)

    # Edge features
    edges = cv2.Canny(lesion, 100, 200)
    edge_pixels = np.sum(edges) / 255

    # Texture features
    texture = cv2.Laplacian(lesion, cv2.CV_64F).var()

    # Symmetry features
    symmetry_feature = symmetry_mapping[symmetry]

    # Combine all features
    features = [area, perimeter, edge_pixels, texture] + [symmetry_feature]

    return features

# Extract features for each sample
train_features = []
for image, mask, symmetry in zip(train_images, train_masks, train_symmetry):
    features = extract_features(image, mask, symmetry)
    train_features.append(features)

test_features = []
for image, mask, symmetry in zip(test_images, test_masks, test_symmetry):
    features = extract_features(image, mask, symmetry)
    test_features.append(features)

train_features = np.array(train_features)
test_features = np.array(test_features)

lesion_class_mapping = {
    'Common Nevus': 0,
    'Atypical Nevus': 1,
    'Melanoma': 2
}
train_classes_encoded = [lesion_class_mapping[label] for label in train_classes]
test_classes_encoded = [lesion_class_mapping[label] for label in test_classes]


# Training SVM

In [6]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

lesion_class_mapping = {
    'Common Nevus': 0,
    'Atypical Nevus': 1,
    'Melanoma': 2
}
train_classes_encoded = [lesion_class_mapping[label] for label in train_classes]
test_classes_encoded = [lesion_class_mapping[label] for label in test_classes]

svm = SVC(kernel='rbf', C=1.0, random_state=42)
svm.fit(train_features, train_classes_encoded)

# test
predictions = svm.predict(test_features)
# evaluate
accuracy = accuracy_score(test_classes_encoded, predictions)
print("Accuracy:", accuracy)
cm = confusion_matrix(test_classes_encoded, predictions)
print("Confusion Matrix:")
print(cm)

# Random forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from joblib import dump


rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(train_features, train_classes_encoded)


best_rf = grid_search.best_estimator_

# 保存训练好的随机森林模型
dump(best_rf, '/content/drive/MyDrive/P-MI/PJ1/best_rf_model.joblib')

# test
predictions = best_rf.predict(test_features)

# evaluate
accuracy = accuracy_score(test_classes_encoded, predictions)
cm = confusion_matrix(test_classes_encoded, predictions)

print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(cm)



Accuracy: 0.7333333333333333
Confusion Matrix:
[[12  3  0]
 [ 4  7  0]
 [ 1  0  3]]


# test

## test data prepare

In [16]:
label_dir = '/content/drive/MyDrive/P-MI/PJ1/label'
test_data_dir = '/content/drive/MyDrive/P-MI/PJ1/test_dataset'
segmentation_mask_dir = '/content/drive/MyDrive/P-MI/PJ1/segmentation_mask'

In [15]:
print(os.listdir('/content/drive/MyDrive/P-MI/PJ1'))

['skin_lesion_dataset', 'segmentation_mask', 'label', 'test_dataset', 'MIA_PJ1_task3.ipynb', 'best_rf_model.joblib']


In [11]:
from joblib import load
best_rf = load('/content/drive/MyDrive/P-MI/PJ1/best_rf_model.joblib')

In [22]:
images = []
lesion_class = []
symmetry = []
masks = []

for subdir in os.listdir(test_data_dir):
    subdir_path = os.path.join(test_data_dir, subdir)

    # check the subfile
    if os.path.isdir(subdir_path):
        # load the images
        image_path = os.path.join(subdir_path, os.listdir(subdir_path)[0])
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (224, 224))
        image = image / 255.0
        images.append(image)


In [25]:
for subdir in os.listdir(test_data_dir):
    subdir_path = os.path.join(test_data_dir, subdir)

    if os.path.isdir(subdir_path):
        image_path = os.path.join(subdir_path, os.listdir(subdir_path)[0])
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (224, 224))
        image = image / 255.0

        mask_path = os.path.join(segmentation_mask_dir, subdir, os.listdir(os.path.join(segmentation_mask_dir, subdir))[0])
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        mask = cv2.resize(mask, (224, 224))
        mask = mask / 255.0

        label_file = os.path.join(label_dir, subdir, os.listdir(os.path.join(label_dir, subdir))[0])
        with open(label_file, 'r') as f:
            label_dict = json.load(f)

        symmetry_label = label_dict['Asymmetry Label']

        features = extract_features(image, mask, symmetry_label)

        prediction = best_rf.predict([features])[0]

        lesion_class_mapping_inv = {v: k for k, v in lesion_class_mapping.items()}
        predicted_class = lesion_class_mapping_inv[prediction]

        label_dict['Lesion Class'] = predicted_class

        # update label
        with open(label_file, 'w') as f:
            json.dump(label_dict, f)

print("Prediction completed. Results saved in the corresponding label files.")

Prediction completed. Results saved in the corresponding label files.


# txt output

In [4]:
import os
import json

label_dir = '/content/drive/MyDrive/P-MI/PJ1/label'
output_file = '/content/drive/MyDrive/P-MI/PJ1/classification_results.txt'

subdirs = sorted([subdir for subdir in os.listdir(label_dir) if os.path.isdir(os.path.join(label_dir, subdir))])

with open(output_file, 'w') as f:
    for subdir in subdirs:
        label_file = os.path.join(label_dir, subdir, os.listdir(os.path.join(label_dir, subdir))[0])

        with open(label_file, 'r') as lf:
            label_dict = json.load(lf)

        predicted_class = label_dict['Lesion Class']

        f.write(f"{subdir}: {predicted_class}\n")

print(f"Classification results saved to {output_file}.")

Classification results saved to /content/drive/MyDrive/P-MI/PJ1/classification_results.txt.
