<a href="https://colab.research.google.com/github/Goutamkumar08/GenAI/blob/main/Semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import the libraries used for scraping, image loading, and feature extraction.

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import torch
import torchvision.transforms as transforms


Install FAISS, then import it together with the torchvision model zoo and NumPy.

In [None]:
!pip install faiss-cpu
import faiss
from torchvision import models
import numpy as np



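Define the pipeline helpers: scrape images from a web page, filter out likely advertisements, extract ResNet-50 embeddings, build a FAISS index, and search that index for images similar to a query.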

In [None]:
def scrape_images(url, output_folder="images", timeout=10):
    """Download every ``<img>`` referenced by a web page into a local folder.

    Args:
        url: Address of the page to scrape.
        output_folder: Directory that receives the downloads; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        List of local file paths, one per successfully downloaded image, in
        page order. Files are named ``image_<i>.jpg`` where ``i`` is the
        position of the ``<img>`` tag on the page, so skipped tags leave gaps.

    Raises:
        requests.RequestException: If the page itself cannot be fetched or
            answers with an HTTP error status.
    """
    from urllib.parse import urljoin  # local import: only this function needs it

    os.makedirs(output_folder, exist_ok=True)

    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page and returning nothing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    images = []
    for i, img_tag in enumerate(soup.find_all('img')):
        src = img_tag.get('src')
        if not src:
            continue

        # Resolve relative ("/a.png") and protocol-relative ("//cdn/a.png")
        # sources against the page URL; absolute URLs pass through unchanged.
        img_url = urljoin(url, src)
        if not img_url.startswith(('http://', 'https://')):
            continue  # e.g. inline "data:" URIs

        img_path = os.path.join(output_folder, f"image_{i}.jpg")
        try:
            img_response = requests.get(img_url, timeout=timeout)
            # Without this check an HTML error body would be saved as a ".jpg".
            img_response.raise_for_status()
            with open(img_path, 'wb') as f:
                f.write(img_response.content)
        except (requests.RequestException, OSError) as e:
            # Best effort: one bad image must not abort the whole scrape.
            print(f"Skipping {img_url}: {e}")
            continue
        images.append(img_path)
    return images

def is_advertisement(image_path, min_area=5000):
    """Placeholder ad detector: flags an image whose pixel area is very small.

    Args:
        image_path: Path of the image file to inspect.
        min_area: Area threshold in pixels (width * height). Images strictly
            below it are treated as advertisements.

    Returns:
        True if the image area is below ``min_area``, otherwise False.
    """
    # The context manager closes the underlying file handle; only the
    # dimensions are needed, so the pixel data is never decoded.
    with Image.open(image_path) as img:
        width, height = img.size
    return width * height < min_area

def filter_images(image_paths):
    """Return the paths that ``is_advertisement`` does not flag, in input order."""
    kept = []
    for path in image_paths:
        if is_advertisement(path):
            continue  # drop anything classified as an advertisement
        kept.append(path)
    return kept

def extract_features(image_paths):
    """Extract one embedding per image using a pretrained ResNet-50.

    The classification (FC) layer is removed, so each image is represented by
    the flattened output of the network's final pooling layer.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        float32 array of shape ``(len(image_paths), 2048)``. For empty input
        the result is an empty ``(0, 2048)`` array, so callers can still read
        ``shape[1]``.
    """
    feature_dim = 2048  # width of ResNet-50's pooled output (input size of its FC layer)

    image_paths = list(image_paths)
    if not image_paths:
        # np.array([]) would be 1-D; also avoids loading the model for nothing.
        return np.empty((0, feature_dim), dtype=np.float32)

    # Newer torchvision deprecates ``pretrained=True`` in favour of ``weights=``;
    # IMAGENET1K_V1 is the checkpoint that ``pretrained=True`` selected.
    weights_enum = getattr(models, "ResNet50_Weights", None)
    if weights_enum is not None:
        backbone = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
    else:
        backbone = models.resnet50(pretrained=True)
    model = torch.nn.Sequential(*list(backbone.children())[:-1])  # drop the FC layer
    model.eval()

    # Preprocessing: fixed 224x224 input, tensor conversion, per-channel
    # normalization. The same pipeline must be applied to query images.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    features = []
    for image_path in image_paths:
        # convert() returns a loaded copy, so the file can be closed right away.
        with Image.open(image_path) as raw:
            img = raw.convert('RGB')
        batch = transform(img).unsqueeze(0)  # add the batch dimension
        with torch.no_grad():  # inference only
            feature = model(batch).flatten().numpy()
        features.append(feature)

    return np.stack(features).astype(np.float32, copy=False)

def create_faiss_index(features):
    """Build a flat L2 (Euclidean) FAISS index over the given feature vectors.

    Args:
        features: 2-D array-like of shape ``(n_vectors, dim)``. It is converted
            to a C-contiguous float32 array, the layout FAISS requires.

    Returns:
        A ``faiss.IndexFlatL2`` of dimension ``dim`` containing all vectors.

    Raises:
        ValueError: If ``features`` is not 2-D or has a zero-length feature
            dimension (for example, an empty list of features).
    """
    features = np.ascontiguousarray(features, dtype=np.float32)
    if features.ndim != 2 or features.shape[1] == 0:
        raise ValueError(
            "features must be a 2-D array of shape (n_vectors, dim) with dim > 0, "
            f"got shape {features.shape}"
        )

    d = features.shape[1]  # feature dimension
    index = faiss.IndexFlatL2(d)
    index.add(features)
    return index

def search_images(query_image, image_paths, index, top_k=5):
    """Find the indexed images most similar to a query image.

    Args:
        query_image: Path of the query image file.
        image_paths: Paths of the indexed images, in the same order their
            features were added to ``index``.
        index: FAISS index built from the features of ``image_paths``.
        top_k: Maximum number of matches to return.

    Returns:
        Up to ``top_k`` paths from ``image_paths``, nearest first. Fewer are
        returned when the index holds fewer vectors; an empty list when the
        index is empty or ``top_k`` is not positive.
    """
    # Never ask FAISS for more neighbours than it stores: it pads the result
    # with -1, and image_paths[-1] would silently return the last image.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    # Newer torchvision deprecates ``pretrained=True`` in favour of ``weights=``;
    # IMAGENET1K_V1 is the checkpoint that ``pretrained=True`` selected.
    weights_enum = getattr(models, "ResNet50_Weights", None)
    if weights_enum is not None:
        backbone = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
    else:
        backbone = models.resnet50(pretrained=True)
    model = torch.nn.Sequential(*list(backbone.children())[:-1])  # Remove FC layer
    model.eval()

    # Must match the preprocessing used when the indexed features were extracted.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # convert() returns a loaded copy, so the file can be closed right away.
    with Image.open(query_image) as raw:
        img = raw.convert('RGB')
    batch = transform(img).unsqueeze(0)  # add the batch dimension

    with torch.no_grad():
        query_feature = model(batch).flatten().numpy()

    # FAISS expects a (n_queries, dim) float32 matrix.
    query_feature = np.expand_dims(query_feature, axis=0).astype('float32')

    assert query_feature.shape[1] == index.d, f"Query feature dimension {query_feature.shape[1]} does not match FAISS index dimension {index.d}"

    _, indices = index.search(query_feature, k)
    # Defensive: skip any -1 padding that an index could still return.
    return [image_paths[i] for i in indices[0] if i >= 0]



In [None]:
    # Run the pipeline end to end: scrape -> filter -> embed -> index.
    # The names defined here (notably filtered_images and index) are reused by later cells.
    website_url = "https://www.amazon.in/"
    # Download every <img> found on the page into the default "images" folder
    images = scrape_images(website_url)
    # Drop images that is_advertisement flags (very small images)
    filtered_images = filter_images(images)
    # One ResNet-50 embedding per remaining image
    features = extract_features(filtered_images)
    # Flat L2 index over the embeddings; row order matches filtered_images
    index = create_faiss_index(features)





In [None]:
# Show the image paths that survived the advertisement filter
filtered_images


['images/image_0.jpg',
 'images/image_1.jpg',
 'images/image_2.jpg',
 'images/image_3.jpg',
 'images/image_4.jpg',
 'images/image_5.jpg',
 'images/image_6.jpg',
 'images/image_7.jpg',
 'images/image_8.jpg',
 'images/image_9.jpg',
 'images/image_10.jpg',
 'images/image_11.jpg',
 'images/image_12.jpg',
 'images/image_13.jpg',
 'images/image_14.jpg',
 'images/image_15.jpg',
 'images/image_16.jpg']

In [None]:
# Path of the query image. NOTE(review): the /content prefix looks like a Colab
# runtime path — confirm the file has been uploaded there before running this cell.
query_img ="/content/query_images/query_image.jpeg"
# search_images embeds the query and asks the index for its 5 nearest neighbours;
# filtered_images maps the returned row numbers back to file paths.
similar_images = search_images(query_img, filtered_images, index)
print("Similar images:", similar_images)

Similar images: ['images/image_15.jpg', 'images/image_0.jpg', 'images/image_6.jpg', 'images/image_11.jpg', 'images/image_8.jpg']
