#IMPORT NECESSARY LIBRARIES

In [1]:
import os
import random
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import time


In [2]:
import tensorflow as tf
from mpl_toolkits.mplot3d import Axes3D
from keras.preprocessing.image import load_img, img_to_array
from keras.applications import VGG19, ResNet50V2, InceptionV3, Xception
from keras.models import Model, Sequential
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Flatten, Dense, Dropout

In [3]:
from google.colab.patches import cv2_imshow
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from skimage.metrics import structural_similarity as ssim
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

#DATA PREPARATION

##Define runtime

In [4]:
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl (7.0 kB)
Collecting jedi>=0.16 (from ipython->ipython-autotime)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, ipython-autotime
Successfully installed ipython-autotime-0.3.2 jedi-0.19.1
time: 526 µs (started: 2024-07-15 16:40:54 +00:00)


##Load Dataset

In [5]:
!gdown "1fHPO2Q9PyMHjDcyeAhjuuv8IjJ9OWoVd"

Downloading...
From (original): https://drive.google.com/uc?id=1fHPO2Q9PyMHjDcyeAhjuuv8IjJ9OWoVd
From (redirected): https://drive.google.com/uc?id=1fHPO2Q9PyMHjDcyeAhjuuv8IjJ9OWoVd&confirm=t&uuid=00e73794-7a09-4f13-9f9e-a2faca5cfb5c
To: /content/HAM10000.zip
100% 5.58G/5.58G [01:45<00:00, 52.7MB/s]
time: 1min 51s (started: 2024-07-15 16:40:54 +00:00)


In [6]:
!unzip HAM10000.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: ham10000_images_part_2/ISIC_0029326.jpg  
  inflating: ham10000_images_part_2/ISIC_0029327.jpg  
  inflating: ham10000_images_part_2/ISIC_0029328.jpg  
  inflating: ham10000_images_part_2/ISIC_0029329.jpg  
  inflating: ham10000_images_part_2/ISIC_0029330.jpg  
  inflating: ham10000_images_part_2/ISIC_0029331.jpg  
  inflating: ham10000_images_part_2/ISIC_0029332.jpg  
  inflating: ham10000_images_part_2/ISIC_0029333.jpg  
  inflating: ham10000_images_part_2/ISIC_0029334.jpg  
  inflating: ham10000_images_part_2/ISIC_0029335.jpg  
  inflating: ham10000_images_part_2/ISIC_0029336.jpg  
  inflating: ham10000_images_part_2/ISIC_0029337.jpg  
  inflating: ham10000_images_part_2/ISIC_0029338.jpg  
  inflating: ham10000_images_part_2/ISIC_0029339.jpg  
  inflating: ham10000_images_part_2/ISIC_0029340.jpg  
  inflating: ham10000_images_part_2/ISIC_0029341.jpg  
  inflating: ham10000_images_part_2/ISIC_0029342.jpg  


##Visualize data

###Support functions

###Ploting data

#MODEL IMPLEMENTATION

In [7]:
metadata = pd.read_csv('/content/HAM10000_metadata.csv')
image_dir1 = '/content/HAM10000_images_part_1'
image_dir2 = '/content/HAM10000_images_part_2'

image_paths = [os.path.join(image_dir1, fname) for fname in os.listdir(image_dir1)]
image_paths += [os.path.join(image_dir2, fname) for fname in os.listdir(image_dir2)]

image_path_dict = {os.path.splitext(os.path.basename(p))[0]: p for p in image_paths}

metadata['image_path'] = metadata['image_id'].map(image_path_dict)

label_to_id = {label: idx for idx, label in enumerate(metadata['dx'].unique())}

time: 113 ms (started: 2024-07-15 16:48:41 +00:00)


In [8]:
def load_and_preprocess_image(image_path):
    try:
        image = Image.open(image_path)
        image = image.resize((224, 224))
        image = np.array(image) / 255.0
        return image
    except Exception as e:
        print(f"Error loading image {image_path}: {e}")
        return None

def data_generator(metadata, label_to_id, batch_size=512):
    num_samples = len(metadata)
    while True:
        for offset in range(0, num_samples, batch_size):
            batch_samples = metadata.iloc[offset:offset+batch_size]

            images = []
            labels = []
            filenames = []  # To store filenames
            with ThreadPoolExecutor() as executor:
                future_to_image = {executor.submit(load_and_preprocess_image, row['image_path']): row for _, row in batch_samples.iterrows()}
                for future in as_completed(future_to_image):
                    row = future_to_image[future]
                    image = future.result()
                    if image is not None:
                        images.append(image)
                        labels.append(label_to_id[row['dx']])
                        filenames.append(row['image_path'])  # Store filename

            yield np.array(images), np.array(labels), filenames  # Yield filenames as well

# Get a batch of data
data_gen = data_generator(metadata, label_to_id, batch_size=512)
images, labels, filenames = next(data_gen)

# Print labels for specific images
def print_labels_for_images(filenames, labels, images_to_check):
    for image_name in images_to_check:
        try:
            index = filenames.index(image_name)
            print(f"Image: {image_name}, Label: {labels[index]}")
        except ValueError:
            print(f"Image: {image_name} not found in the current batch.")

NameError: name 'ThreadPoolExecutor' is not defined

time: 736 ms (started: 2024-07-15 16:48:55 +00:00)


##Models Initialization

###Load pre-trained models

In [None]:
vgg19 = VGG19(weights='imagenet', include_top=False, input_shape=(240, 426, 3))
resnet50v2 = ResNet50V2(weights='imagenet', include_top=False, input_shape=(240, 426, 3))
inceptionv3 = InceptionV3(weights='imagenet', include_top=False, input_shape=(240, 426, 3))
xception = Xception(weights='imagenet', include_top=False, input_shape=(240, 426, 3))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50v2_weights_tf_dim_ordering_tf_kernels_notop.h5
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5
time: 10.8 s (started: 2024-04-07 16:16:34 +00:00)


###Define a custom CNN autoencoder model

In [None]:
from tensorflow.keras import regularizers
def custom_cnn_model():
    # Define the Convolutional Autoencoder architecture

    input_img = Input(shape=(240, 426, 3))

    x = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_regularizer=regularizers.l2(0.001), name='Encoding_Conv2D_1')(input_img)
    x = MaxPooling2D((2, 2), padding='same', name='Encoding_MaxPooling2D_1')(x)

    x = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_regularizer=regularizers.l2(0.001), name='Encoding_Conv2D_2')(x)
    x = MaxPooling2D((2, 2), padding='same', name='Encoding_MaxPooling2D_2')(x)

    x = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_regularizer=regularizers.l2(0.001), name='Encoding_Conv2D_3')(x)
    x = MaxPooling2D((2, 2), padding='same', name='Encoding_MaxPooling2D_3')(x)

    x = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_regularizer=regularizers.l2(0.001), name='Encoding_Conv2D_4')(x)
    encoded = MaxPooling2D((2, 2), padding='valid', name='Encoding_MaxPooling2D_4')(x)

    # This is the encoded representation of the input
    autoencoder = Model(input_img, encoded)
    autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

    return autoencoder

customCNN = custom_cnn_model()

time: 196 ms (started: 2024-04-07 16:26:21 +00:00)


In [None]:
customCNN.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 240, 426, 3)]     0         
                                                                 
 Encoding_Conv2D_1 (Conv2D)  (None, 240, 426, 64)      1792      
                                                                 
 Encoding_MaxPooling2D_1 (M  (None, 120, 213, 64)      0         
 axPooling2D)                                                    
                                                                 
 Encoding_Conv2D_2 (Conv2D)  (None, 120, 213, 128)     73856     
                                                                 
 Encoding_MaxPooling2D_2 (M  (None, 60, 107, 128)      0         
 axPooling2D)                                                    
                                                                 
 Encoding_Conv2D_3 (Conv2D)  (None, 60, 107, 256)      2951

## Set models' name

In [None]:
vgg19_name = 'vgg19'
resnet50v2_name = 'resnet50'
inceptionv3_name = 'inceptionv3'
xception_name = 'xception'
customCNN_name = 'customCNN'

time: 683 µs (started: 2024-04-07 16:16:45 +00:00)


##Model works

In [None]:
def extract_features(input_array, model_name):
    if model_name == vgg19_name:
        return vgg19.predict(input_array)

    elif model_name == resnet50v2_name:
        return resnet50v2.predict(input_array)

    elif model_name == inceptionv3_name:
        return inceptionv3.predict(input_array)

    elif model_name == xception_name:
        return xception.predict(input_array)

    elif model_name == customCNN_name:
        return customCNN.predict(input_array)

    else: return

time: 626 µs (started: 2024-04-07 16:16:45 +00:00)


In [None]:
def kNN_model(train_features_flat, n_neighbors):
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    knn.fit(train_features_flat)

time: 436 µs (started: 2024-04-07 16:16:45 +00:00)


In [None]:
def tSNE_features(model_name, train_features_flat):
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(train_features_flat)
    plt.figure(figsize=(10, 7))
    plt.title(f't-SNE Visualization for {model_name}')
    plt.scatter(tsne_results[1:, 0], tsne_results[1:, 1], label='Training Images')
    plt.scatter(tsne_results[0, 0], tsne_results[0, 1], label='Input Image', c='red')
    plt.legend()
    plt.show()

time: 699 µs (started: 2024-04-07 16:16:45 +00:00)


#DEFINE FUNCTIONS

##Compute similarity

In [None]:
# cossine similarity
def compute_cosine_similarity(image1, image2):

    # Convert images to grayscale
    gray_image1 = cv2.cvtColor(image1, cv2.COLOR_RGB2GRAY)
    gray_image2 = cv2.cvtColor(image2, cv2.COLOR_RGB2GRAY)

    # Flatten images into 1D arrays
    vector1 = gray_image1.flatten().astype(float)
    vector2 = gray_image2.flatten().astype(float)

    vector1 = vector1.reshape(1, -1)
    vector2 = vector2.reshape(1, -1)
    # Compute cosine similarity
    cossim = cosine_similarity(vector1,vector2)

    return cossim[0][0]

time: 823 µs (started: 2024-04-07 16:17:19 +00:00)


In [None]:
# structural similarity
def compute_ssim(image1, image2):
    # Convert the images to grayscale
    image1_gray = cv2.cvtColor(image1, cv2.COLOR_RGB2GRAY)
    image2_gray = cv2.cvtColor(image2, cv2.COLOR_RGB2GRAY)

    # Compute SSIM between two images
    return ssim(image1_gray, image2_gray)

time: 612 µs (started: 2024-04-07 16:17:19 +00:00)


In [None]:
# histogram similarity
def compute_histogram_similarity(image1, image2):
    # Compute the color histograms of the two images
    hist1 = cv2.calcHist([image1], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    hist2 = cv2.calcHist([image2], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])

    # Normalize the histograms
    cv2.normalize(hist1, hist1)
    cv2.normalize(hist2, hist2)

    # Compute the correlation between the two histograms
    correlation = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)

    return correlation

time: 785 µs (started: 2024-04-07 16:17:19 +00:00)


In [None]:
def compute_all_score(input_image_path, similar_image_indexs):
  image1 = preprocessing_input_image(input_image_path)
  results = []
  for i in similar_image_indexs:
    image2 = images[i]
    cossim = compute_cosine_similarity(image1, image2)
    ssim = compute_ssim(image1, image2)
    hissim = compute_histogram_similarity(image1, image2)
    dic = {"filename":f"ws_{i}.jpg","cossim":cossim,"ssim":ssim,"hissim":hissim}
    results.append(dic)
  return results


time: 634 µs (started: 2024-04-07 16:17:19 +00:00)


## Preprocessing


In [None]:
def get_features_flatten(model_name,images):
  start_time = time.time()
  train_features = extract_features(images, model_name)
  end_time = time.time()
  print(f"Time taken for feature extraction: {end_time - start_time} seconds")
  print(f"Using {model_name}, we get {train_features.shape}")
  train_features_flat = train_features.reshape(train_features.shape[0], -1)
  return train_features_flat

time: 590 µs (started: 2024-04-07 16:17:19 +00:00)


In [None]:
def preprocessing_input_image(input_image_path):
  img = cv2.imread(input_image_path)
  img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
  img = cv2.resize(img, (426, 240))
  return img


time: 517 µs (started: 2024-04-07 16:17:19 +00:00)


##Visualization

In [None]:
def visualize_the_images(input_image_path, similar_image_indexs, similarity_scores):
  original_image = preprocessing_input_image(input_image_path)
  plt.figure(figsize=(16, 8))
  # Plot original image
  plt.subplot(2, 6, 1)
  plt.imshow(original_image)
  plt.title(f'Original Image\n{input_image_path}')
  plt.axis('off')

  # Plot similar images
  for i, idx in enumerate(similar_image_indexs):
        similar_image = images[idx]
        plt.subplot(2, 6, i + 7)
        plt.imshow(similar_image)
        plt.title(f'Cosine similarity: {similarity_scores[i]["cossim"]*100:.2f}%\nStructural similarity: {similarity_scores[i]["ssim"]*100:.2f}%\nHistogram similarity: {similarity_scores[i]["hissim"]*100:.2f}%\n{similarity_scores[i]["filename"]}')
        plt.axis('off')

  plt.tight_layout()
  plt.show()




time: 873 µs (started: 2024-04-07 16:17:19 +00:00)


In [None]:
def visualize_the_histogram(input_image_path, similar_image_indexs,similarity_scores):
  original_image = preprocessing_input_image(input_image_path)
  plt.figure(figsize=(45, 10))
  plt.subplot(2, 7, 1)
  plt.imshow(original_image)
  plt.title(f'Original Image\n{input_image_path}')
  plt.axis('off')
  plt.subplot(2, 7, 8)
  for i, color in enumerate(['r', 'g', 'b']):
        hist = cv2.calcHist([original_image], [i], None, [256], [0, 256])
        plt.plot(hist, color=color)
  plt.title('Original Image RGB Histogram')
  plt.xlabel('Pixel Intensity')
  plt.ylabel('Frequency')


  for i, idx in enumerate(similar_image_indexs):
    plt.subplot(2, 7,i+2)
    plt.title(f'Cosine similarity: {similarity_scores[i]["cossim"]*100:.2f}%\nStructural similarity: {similarity_scores[i]["ssim"]*100:.2f}%\nHistogram similarity: {similarity_scores[i]["hissim"]*100:.2f}%\n{similarity_scores[i]["filename"]}')
    plt.imshow(images[idx])
    plt.axis('off')
    plt.subplot(2, 7, i+9)
    plt.title(f'Similar Image {i} Histogram')
    for i, color in enumerate(['r', 'g', 'b']):
        hist = cv2.calcHist([images[idx]], [i], None, [256], [0, 256])
        plt.plot(hist, color=color)
  ######## TO DO ADD Ve histogram rieng
  plt.show()

time: 1.72 ms (started: 2024-04-07 16:17:19 +00:00)


In [None]:
def highlight_features(image1, image2):
    # Initialize the ORB detector
    orb = cv2.ORB_create()

    # Detect keypoints and compute descriptors
    kp1, des1 = orb.detectAndCompute(image1, None)
    kp2, des2 = orb.detectAndCompute(image2, None)

    # Create a BFMatcher object
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)

    # Match the descriptors
    matches = bf.match(des1, des2)

    # Sort the matches based on distance
    matches = sorted(matches, key=lambda x: x.distance)

    # Draw the first 50 matches
    result = cv2.drawMatches(image1, kp1, image2, kp2, matches[:50], None, flags=2)

    return result

time: 719 µs (started: 2024-04-07 16:17:19 +00:00)


In [None]:
def visualize_highlight(input_image_path, similar_image_indexs, similarity_scores):
  original_image = preprocessing_input_image(input_image_path)
  plt.figure(figsize=(25, 5))
  # Plot original image
  plt.subplot(2, 6, 1)
  plt.imshow(original_image)
  plt.title(f'Original Image\n{input_image_path}')
  plt.axis('off')

  # Plot similar images
  for i, idx in enumerate(similar_image_indexs):
        similar_image = images[idx]
        highlighted_image = highlight_features(original_image, similar_image)
        plt.subplot(2, 6, i + 7)
        plt.imshow(highlighted_image)
        plt.title(f'Cosine similarity: {similarity_scores[i]["cossim"]*100:.2f}%\nStructural similarity: {similarity_scores[i]["ssim"]*100:.2f}%\nHistogram similarity: {similarity_scores[i]["hissim"]*100:.2f}%\n{similarity_scores[i]["filename"]}')
        plt.axis('off')

  plt.tight_layout()
  plt.show()

time: 992 µs (started: 2024-04-07 16:17:19 +00:00)


##Input image

In [None]:
def image_similarity(input_image_path,model_name,all_features, n_neighbors):
  input_image = preprocessing_input_image(input_image_path)

  input_image = np.expand_dims(input_image, axis=0)
  input_features = extract_features(input_image, model_name)
  input_features_flat = input_features.reshape(input_features.shape[0], -1)
  # Train a k-NN model
  knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
  knn.fit(all_features)

  start_time = time.time()

  distances, indices = knn.kneighbors(input_features_flat, n_neighbors=6)
  end_time = time.time()

  time_taken = end_time - start_time
  print(f"Time taken for prediction: {time_taken} seconds")
  #silhouette = silhouette_score(all_features, input_features_flat)
  #davies_bouldin = davies_bouldin_score(all_features, input_features_flat)
  #calinski_harabasz = calinski_harabasz_score(all_features, input_features_flat)

  #print(f"Silhouette Score: {silhouette}")
  #print(f"Davies-Bouldin Index: {davies_bouldin}")
  #print(f"Calinski-Harabasz Index: {calinski_harabasz}")
  return indices[0]


time: 787 µs (started: 2024-04-07 16:17:19 +00:00)


#TESING MODELS

##Feature extraction

In [None]:
import pickle
def store_features(input_array):
    model_names = ['vgg19','resnet50','inceptionv3','xception','customCNN']
    features_dict = {}

    for model_name in model_names:
        features = get_features_flatten( model_name,input_array)
        if features is not None:
            with open(f"/content/drive/MyDrive/Pickle/{model_name}_all_features.pkl", 'wb') as f:
                pickle.dump(features, f)

store_features(images)

Time taken for feature extraction: 21.307188987731934 seconds
Using vgg19, we get (1209, 7, 13, 512)
Time taken for feature extraction: 11.092413902282715 seconds
Using resnet50, we get (1209, 8, 14, 2048)
Time taken for feature extraction: 6.2546234130859375 seconds
Using inceptionv3, we get (1209, 6, 11, 2048)
Time taken for feature extraction: 21.35185742378235 seconds
Using xception, we get (1209, 8, 14, 2048)
Time taken for feature extraction: 11.286916494369507 seconds
Using customCNN, we get (1209, 15, 27, 512)
time: 1min 41s (started: 2024-04-07 16:26:43 +00:00)


In [None]:
model_name = customCNN_name
number_neighbors = 9

features = get_features_flatten(model_name,images)


## wait for gpu

Time taken for feature extraction: 22.59525775909424 seconds
Using customCNN, we get (1209, 240, 432, 3)
time: 22.6 s (started: 2024-04-05 16:12:22 +00:00)


##Get test images

In [None]:
!gdown "1Aol-xFrXTZ0SlF4nFwsyWqHdhaHKdEqn"
!unzip test_data.zip
# Get all image file paths
test_dir = "/content/test_data"
all_images_test = [os.path.join(data_ws_dir, filename) for filename in os.listdir(data_ws_dir)]
test_images = []
for img_path in all_images_test:
        test_img = cv2.imread(img_path)
        test_img = cv2.cvtColor(test_img, cv2.COLOR_BGR2RGB)
        test_images.append(test_img)
test_images = np.array(test_images)


##Test the model