In [1]:
# Install required libraries
!pip install kagglehub opencv-python-headless tensorflow

# Import necessary libraries
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Model
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import cv2

# Download the dataset using KaggleHub
import kagglehub

# Download dataset
path = kagglehub.dataset_download("quandang/vietnamese-foods")
print("Path to dataset files:", path)

# Set dataset paths
train_path = os.path.join(path, "train")
val_path = os.path.join(path, "validation")
test_path = os.path.join(path, "test")

# Load image paths and labels
def load_image_paths_and_labels(base_path):
    image_paths = []
    labels = []
    classes = sorted(os.listdir(base_path))  # Ensure consistent class order
    for label, class_name in enumerate(classes):
        class_path = os.path.join(base_path, class_name)
        for img_name in os.listdir(class_path):
            image_paths.append(os.path.join(class_path, img_name))
            labels.append(label)
    return image_paths, labels, classes

train_images, train_labels, class_names = load_image_paths_and_labels(train_path)
val_images, val_labels, _ = load_image_paths_and_labels(val_path)
test_images, test_labels, _ = load_image_paths_and_labels(test_path)

print(f"Number of training images: {len(train_images)}")
print(f"Number of validation images: {len(val_images)}")
print(f"Number of test images: {len(test_images)}")
print(f"Classes: {class_names}")

# Define image preprocessing function
IMG_SIZE = (224, 224)

def preprocess_image(image_path):
    img = load_img(image_path, target_size=IMG_SIZE)
    img_array = img_to_array(img) / 255.0
    return np.expand_dims(img_array, axis=0)

# Load pre-trained model for feature extraction
base_model = ResNet50(weights="imagenet", include_top=False, input_shape=(224, 224, 3))
model = Model(inputs=base_model.input, outputs=tf.keras.layers.GlobalAveragePooling2D()(base_model.output))

# Extract features from images
def extract_features(image_paths, model):
    features = []
    for image_path in image_paths:
        img = preprocess_image(image_path)
        feature = model.predict(img, verbose=0)
        features.append(feature[0])
    return np.array(features)

# Extract features for train, validation, and test sets
print("Extracting features...")
train_features = extract_features(train_images, model)
val_features = extract_features(val_images, model)
test_features = extract_features(test_images, model)
print("Feature extraction complete.")

# Normalize the features for similarity calculation
train_features = normalize(train_features, axis=1)

# Image retrieval function
def retrieve_similar_images(query_image_path, model, train_features, train_images, top_k=5):
    query_feature = model.predict(preprocess_image(query_image_path))[0]
    query_feature = normalize(query_feature.reshape(1, -1), axis=1)
    similarities = cosine_similarity(query_feature, train_features)
    top_indices = np.argsort(similarities[0])[::-1][:top_k]
    return [train_images[i] for i in top_indices], similarities[0][top_indices]

# Visualization function
def visualize_results(query_image_path, similar_image_paths, similarities):
    plt.figure(figsize=(15, 5))

    # Query image
    plt.subplot(1, len(similar_image_paths) + 1, 1)
    query_img = cv2.cvtColor(cv2.imread(query_image_path), cv2.COLOR_BGR2RGB)
    plt.imshow(query_img)
    plt.title("Query Image")
    plt.axis("off")

    # Retrieved images
    for i, (img_path, sim) in enumerate(zip(similar_image_paths, similarities)):
        plt.subplot(1, len(similar_image_paths) + 1, i + 2)
        retrieved_img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
        plt.imshow(retrieved_img)
        plt.title(f"Sim: {sim:.2f}")
        plt.axis("off")
    plt.show()

# Test the image retrieval system
query_image = test_images[0]  # Use the first test image as query
similar_images, similarity_scores = retrieve_similar_images(query_image, model, train_features, train_images)

# Visualize results
visualize_results(query_image, similar_images, similarity_scores)


Downloading from https://www.kaggle.com/api/v1/datasets/download/quandang/vietnamese-foods?dataset_version_number=11...


100%|██████████| 4.17G/4.17G [00:41<00:00, 109MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/quandang/vietnamese-foods/versions/11


FileNotFoundError: [Errno 2] No such file or directory: '/root/.cache/kagglehub/datasets/quandang/vietnamese-foods/versions/11/train'