In [102]:
import torch
import clip
from PIL import Image
import requests
from io import BytesIO
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import numpy as np
from torchvision.transforms import RandomHorizontalFlip, ColorJitter
import torch.nn.functional as F


# Prefer the GPU whenever torch reports one, otherwise run on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the CLIP ViT-L/14 checkpoint; the second return value is kept as clip_preprocess
model, clip_preprocess = clip.load("ViT-L/14", jit=False, device=device)
print("CLIP model loaded successfully")

# Preprocessing function
def preprocess(image):
    """Convert a PIL image into a single-item batch placed on ``device``.

    Delegates to ``clip_preprocess`` (the transform returned by ``clip.load``)
    instead of a hand-built pipeline containing ``ColorJitter``: a random
    augmentation at inference time makes the embedding of the same image
    differ from call to call, so similarity scores were not reproducible.

    Args:
        image: PIL image to preprocess.

    Returns:
        The preprocessed image with a leading batch dimension, moved to ``device``.
    """
    return clip_preprocess(image).unsqueeze(0).to(device)

# Function to encode image
def encode_image(image_path):
    """Return the L2-normalised CLIP embedding of the image at ``image_path``."""
    rgb_image = Image.open(image_path).convert("RGB")
    batch = preprocess(rgb_image)
    # Inference only: no gradients are needed for the forward pass
    with torch.no_grad():
        embedding = model.encode_image(batch)
    length = embedding.norm(dim=-1, keepdim=True)
    return embedding / length

# Function to encode text
def encode_text(text):
    """Return the L2-normalised CLIP embedding of a single text string."""
    tokens = clip.tokenize([text]).to(device)
    # Inference only: no gradients are needed for the forward pass
    with torch.no_grad():
        embedding = model.encode_text(tokens)
    length = embedding.norm(dim=-1, keepdim=True)
    return embedding / length
"""def encode_multiple_texts(prompts):
    text_inputs = clip.tokenize(prompts).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    return text_features.mean(dim=0, keepdim=True)"""

# Function to compute similarity
"""def compute_similarity(image_features, text_features):
    return (100.0 * image_features @ text_features.T).item()"""
def compute_similarity(image_features, text_features):
    """Return the cosine similarity of the two feature tensors, scaled by 100.

    ``.item()`` is called on the result, so the inputs must yield exactly one
    similarity value (e.g. one row each).
    """
    cosine = F.cosine_similarity(image_features, text_features)
    return cosine.item() * 100

# Function to perform web search
def perform_web_search(query):
    """Query the DuckDuckGo API and return the related topics of the answer.

    Args:
        query: Free-text search string.

    Returns:
        The list stored under ``'RelatedTopics'`` in the JSON response, or an
        empty list when that key is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = requests.get(
        "https://api.duckduckgo.com/",
        # Let requests percent-encode the query so characters such as '&' or
        # '#' cannot corrupt the URL.
        params={"q": query, "format": "json", "pretty": 1},
        timeout=10,  # do not let a stalled connection hang the notebook
    )
    response.raise_for_status()
    results = response.json()

    print("Full search response:", results)  # Added for debugging

    return results.get('RelatedTopics', [])


# Function to filter results
def filter_results(search_results, original_image_features, threshold=0.1):
    """Keep the search results whose icon image resembles the query image.

    Each result with an http(s) ``Icon.URL`` has its image downloaded, encoded
    with the CLIP ``model`` and scored against ``original_image_features``
    through ``compute_similarity``.

    Args:
        search_results: Iterable of result dicts from the web search.
        original_image_features: Normalised features of the query image.
        threshold: Minimum score a result needs to be kept. It is compared
            directly with the value returned by ``compute_similarity``.

    Returns:
        List of ``{'url', 'title', 'similarity'}`` dicts, highest score first.
    """
    filtered_results = []
    for result in search_results:
        has_icon = 'Icon' in result and 'URL' in result['Icon']
        result_image_url = result['Icon']['URL'] if has_icon else None

        # Empty and non-http(s) URLs are reported too instead of being
        # dropped silently ('http' already covers the 'https' prefix).
        if not result_image_url or not result_image_url.startswith('http'):
            print("Skipping result due to missing or invalid image URL.")
            continue

        try:
            response = requests.get(result_image_url, timeout=10)
            # Fail here on 4xx/5xx rather than handing an error page to PIL
            response.raise_for_status()
            result_image = Image.open(BytesIO(response.content)).convert("RGB")
            result_image_input = preprocess(result_image)

            with torch.no_grad():
                result_features = model.encode_image(result_image_input)
                result_features /= result_features.norm(dim=-1, keepdim=True)

            similarity = compute_similarity(original_image_features, result_features)

            if similarity > threshold:
                filtered_results.append({
                    'url': result['FirstURL'],
                    'title': result['Text'],
                    'similarity': similarity
                })
        except Exception as e:
            # Best effort: one broken result must not abort the whole ranking
            print(f"Error processing result image from URL '{result_image_url}': {e}")

    return sorted(filtered_results, key=lambda x: x['similarity'], reverse=True)


# Main function
def main(image_path, text_prompt):
    """Score an image against a prompt, search the web and list similar hits.

    Every intermediate step is printed; nothing is returned.
    """
    print(f"Processing image: {image_path}")
    print(f"Text prompt: {text_prompt}")

    # Embed both inputs and report how well they agree
    img_vec = encode_image(image_path)
    txt_vec = encode_text(text_prompt)
    score = compute_similarity(img_vec, txt_vec)
    print(f"Similarity between image and text: {score:.2f}%")

    # The text prompt doubles as the search query
    query = text_prompt
    print(f"Search query: {query}")

    hits = perform_web_search(query)
    print(hits)
    print(f"Found {len(hits)} search results")

    # Rank the hits by how closely their image matches the query image
    ranked = filter_results(hits, img_vec)

    print("\nTop filtered results:")
    for rank, hit in enumerate(ranked[:5], start=1):
        print(f"{rank}. Title: {hit['title']}")
        print(f"   URL: {hit['url']}")
        print(f"   Similarity: {hit['similarity']:.2f}%")
        print("---")

if __name__ == "__main__":
    # Example run: swap in your own image path and prompt
    image_path, text_prompt = "red_hoodie.jpeg", "A bright red hoodie"
    main(image_path, text_prompt)

Using device: cuda
CLIP model loaded successfully
Processing image: red_hoodie.jpeg
Text prompt: A bright red hoodie
Similarity between image and text: 28.10%
Search query: A bright red hoodie
Full search response: {'Abstract': '', 'AbstractSource': '', 'AbstractText': '', 'AbstractURL': '', 'Answer': '', 'AnswerType': '', 'Definition': '', 'DefinitionSource': '', 'DefinitionURL': '', 'Entity': '', 'Heading': '', 'Image': '', 'ImageHeight': '', 'ImageIsLogo': '', 'ImageWidth': '', 'Infobox': '', 'Redirect': '', 'RelatedTopics': [], 'Results': [], 'Type': '', 'meta': {'attribution': None, 'blockgroup': None, 'created_date': '2021-03-24', 'description': 'testing', 'designer': None, 'dev_date': '2021-03-24', 'dev_milestone': 'development', 'developer': [{'name': 'zt', 'type': 'duck.co', 'url': 'https://duck.co/user/zt'}], 'example_query': '', 'id': 'just_another_test', 'is_stackexchange': 0, 'js_callback_name': 'another_test', 'live_date': None, 'maintainer': {'github': ''}, 'name': 'Just

In [8]:
import torch
import clip
from PIL import Image
import requests
from io import BytesIO

# Prefer the GPU whenever torch reports one, otherwise run on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the CLIP ViT-B/32 checkpoint; the second return value is kept as clip_preprocess
model, clip_preprocess = clip.load("ViT-B/32", jit=False, device=device)
print("CLIP model loaded successfully")

# Custom preprocessing function
def preprocess(image):
    """Run ``clip_preprocess`` on a PIL image and return a one-item batch on ``device``."""
    tensor = clip_preprocess(image)
    batch = tensor.unsqueeze(0)  # add the leading batch dimension
    return batch.to(device)

# Function to encode image
def encode_image(image_path):
    """Return the L2-normalised CLIP embedding of the image at ``image_path``."""
    rgb_image = Image.open(image_path).convert("RGB")
    batch = preprocess(rgb_image)
    # Inference only: no gradients are needed for the forward pass
    with torch.no_grad():
        embedding = model.encode_image(batch)
    length = embedding.norm(dim=-1, keepdim=True)
    return embedding / length

# Function to encode text
def encode_text(text):
    """Return the L2-normalised CLIP embedding of a single text string."""
    tokens = clip.tokenize([text]).to(device)
    # Inference only: no gradients are needed for the forward pass
    with torch.no_grad():
        embedding = model.encode_text(tokens)
    length = embedding.norm(dim=-1, keepdim=True)
    return embedding / length

# Function to compute similarity
def compute_similarity(image_features, text_features):
    """Return 100 times the dot product of the two feature rows as a float.

    ``.item()`` is called on the product, so it must hold a single value.
    """
    # Scale first, then multiply, matching the original evaluation order
    scaled = 100.0 * image_features
    product = scaled @ text_features.T
    return product.item()

# Function to perform web search (DuckDuckGo API)
def perform_web_search(query):
    """Query the DuckDuckGo API and return the related topics of the answer.

    Args:
        query: Free-text search string.

    Returns:
        The list stored under ``'RelatedTopics'`` in the JSON response, or an
        empty list when that key is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = requests.get(
        "https://api.duckduckgo.com/",
        # Let requests percent-encode the query so characters such as '&' or
        # '#' cannot corrupt the URL.
        params={"q": query, "format": "json", "pretty": 1},
        timeout=10,  # do not let a stalled connection hang the notebook
    )
    response.raise_for_status()
    results = response.json()
    return results.get('RelatedTopics', [])

# Function to filter results by comparing the query image with each result's image
def filter_results(search_results, original_image_features, threshold=0.1):
    """Keep the search results whose icon image resembles the query image.

    Each result with an http(s) ``Icon.URL`` has its image downloaded, encoded
    with the CLIP ``model`` and scored against ``original_image_features``
    through ``compute_similarity``.

    Args:
        search_results: Iterable of result dicts from the web search.
        original_image_features: Normalised features of the query image.
        threshold: Minimum score a result needs to be kept. It is compared
            directly with the value returned by ``compute_similarity``.

    Returns:
        List of ``{'url', 'title', 'image_similarity'}`` dicts, highest score first.
    """
    filtered_results = []
    for result in search_results:
        has_icon = 'Icon' in result and 'URL' in result['Icon']
        result_image_url = result['Icon']['URL'] if has_icon else None

        # Empty and non-http(s) URLs are reported too instead of being
        # dropped silently ('http' already covers the 'https' prefix).
        if not result_image_url or not result_image_url.startswith('http'):
            print("Skipping result due to missing or invalid image URL.")
            continue

        try:
            # Download and preprocess the image from the search result
            response = requests.get(result_image_url, timeout=10)
            # Fail here on 4xx/5xx rather than handing an error page to PIL
            response.raise_for_status()
            result_image = Image.open(BytesIO(response.content)).convert("RGB")
            result_image_input = preprocess(result_image)

            with torch.no_grad():
                # Encode the image from the search result
                result_features = model.encode_image(result_image_input)
                result_features /= result_features.norm(dim=-1, keepdim=True)

            image_similarity = compute_similarity(original_image_features, result_features)

            if image_similarity > threshold:
                filtered_results.append({
                    'url': result['FirstURL'],
                    'title': result['Text'],
                    'image_similarity': image_similarity
                })
        except Exception as e:
            # Best effort: one broken result must not abort the whole ranking
            print(f"Error processing result image from URL '{result_image_url}': {e}")

    return sorted(filtered_results, key=lambda x: x['image_similarity'], reverse=True)

# Main function to process both image and text for searching
def main(image_path, text_prompt):
    """Score an image against a prompt, search the web and list similar hits.

    Every intermediate step is printed; nothing is returned.
    """
    print(f"Processing image: {image_path}")
    print(f"Text prompt: {text_prompt}")

    # Embed both inputs; their similarity is printed for reference only
    img_vec = encode_image(image_path)
    txt_vec = encode_text(text_prompt)
    score = compute_similarity(img_vec, txt_vec)
    print(f"Similarity between image and text: {score:.2f}%")

    # The query is just the prompt for now; an image caption could be appended here
    query = f"{text_prompt}"
    print(f"Search query: {query}")

    hits = perform_web_search(query)
    print(f"Found {len(hits)} search results")

    # Rank the hits by how closely their image matches the query image
    ranked = filter_results(hits, img_vec)

    print("\nTop filtered results:")
    for rank, hit in enumerate(ranked[:5], start=1):
        print(f"{rank}. Title: {hit['title']}")
        print(f"   URL: {hit['url']}")
        print(f"   Image Similarity: {hit['image_similarity']:.2f}%")
        print("---")

# Example usage
if __name__ == "__main__":
    # Example run: swap in your own image path and text query
    image_path, text_prompt = "red_hoodie.jpeg", "red hoodie"
    main(image_path, text_prompt)


Using device: cuda
CLIP model loaded successfully
Processing image: red_hoodie.jpeg
Text prompt: red hoodie
Similarity between image and text: 32.16%
Search query: red hoodie
Found 0 search results

Top filtered results:


In [1]:
import torch
import clip
from PIL import Image
import requests
from io import BytesIO


# Prefer the GPU whenever torch reports one, otherwise run on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the CLIP ViT-B/32 checkpoint; the second return value is kept as clip_preprocess
model, clip_preprocess = clip.load("ViT-B/32", jit=False, device=device)
print("CLIP model loaded successfully")

# Custom preprocessing function
def preprocess(image):
    """Run ``clip_preprocess`` on a PIL image and return a one-item batch on ``device``."""
    single = clip_preprocess(image)
    batched = single.unsqueeze(0)  # add the leading batch dimension
    return batched.to(device)

# Function to encode image
def encode_image(image_path):
    """Return the L2-normalised CLIP embedding of the image at ``image_path``."""
    picture = Image.open(image_path).convert("RGB")
    model_input = preprocess(picture)
    # Inference only: no gradients are needed for the forward pass
    with torch.no_grad():
        raw = model.encode_image(model_input)
    magnitude = raw.norm(dim=-1, keepdim=True)
    return raw / magnitude

# Function to encode text
def encode_text(text):
    """Return the L2-normalised CLIP embedding of a single text string."""
    model_input = clip.tokenize([text]).to(device)
    # Inference only: no gradients are needed for the forward pass
    with torch.no_grad():
        raw = model.encode_text(model_input)
    magnitude = raw.norm(dim=-1, keepdim=True)
    return raw / magnitude

# Function to compute similarity
def compute_similarity(image_features, text_features):
    """Return 100 times the dot product of the two feature rows as a float.

    ``.item()`` is called on the product, so it must hold a single value.
    """
    # Scale the left operand before the matrix product, as the original did
    weighted = 100.0 * image_features
    return (weighted @ text_features.T).item()

# Function to perform web search (DuckDuckGo API)
def perform_web_search(query):
    """Query the DuckDuckGo API and return the related topics of the answer.

    Args:
        query: Free-text search string.

    Returns:
        The list stored under ``'RelatedTopics'`` in the JSON response, or an
        empty list when that key is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = requests.get(
        "https://api.duckduckgo.com/",
        # Let requests percent-encode the query so characters such as '&' or
        # '#' cannot corrupt the URL.
        params={"q": query, "format": "json", "pretty": 1},
        timeout=10,  # do not let a stalled connection hang the notebook
    )
    response.raise_for_status()
    results = response.json()
    return results.get('RelatedTopics', [])

# Function to filter results by comparing both image and text features
def filter_results(search_results, original_image_features, original_text_features, threshold=0.0):
    """Rank search results by how well their icon matches the image and the prompt.

    Each result with an http(s) ``Icon.URL`` has its image downloaded and
    encoded with the CLIP ``model``. Two scores are computed with
    ``compute_similarity``: result image vs. query image, and result image vs.
    query text. Their mean is the combined score.

    Args:
        search_results: Iterable of result dicts from the web search.
        original_image_features: Normalised features of the query image.
        original_text_features: Normalised features of the text prompt.
        threshold: Minimum combined score a result needs to be kept.

    Returns:
        List of dicts with ``url``, ``title``, ``image_similarity``,
        ``text_similarity`` and ``combined_similarity``, best combined score first.
    """
    filtered_results = []
    for result in search_results:
        has_icon = 'Icon' in result and 'URL' in result['Icon']
        result_image_url = result['Icon']['URL'] if has_icon else None

        # Empty and non-http(s) URLs are reported too instead of being
        # dropped silently ('http' already covers the 'https' prefix).
        if not result_image_url or not result_image_url.startswith('http'):
            print("Skipping result due to missing or invalid image URL.")
            continue

        try:
            # Download and preprocess the image from the search result
            response = requests.get(result_image_url, timeout=10)
            # Fail here on 4xx/5xx rather than handing an error page to PIL
            response.raise_for_status()
            result_image = Image.open(BytesIO(response.content)).convert("RGB")
            result_image_input = preprocess(result_image)

            with torch.no_grad():
                # Encode the image from the search result
                result_features = model.encode_image(result_image_input)
                result_features /= result_features.norm(dim=-1, keepdim=True)

            image_similarity = compute_similarity(original_image_features, result_features)

            # Score the result image against the text prompt. The previous code
            # compared the prompt features with themselves, which yields the
            # same value for every result and cannot influence the ranking.
            text_similarity = compute_similarity(result_features, original_text_features)
            combined_similarity = (image_similarity + text_similarity) / 2

            if combined_similarity > threshold:
                filtered_results.append({
                    'url': result['FirstURL'],
                    'title': result['Text'],
                    'image_similarity': image_similarity,
                    'text_similarity': text_similarity,
                    'combined_similarity': combined_similarity
                })
        except Exception as e:
            # Best effort: one broken result must not abort the whole ranking
            print(f"Error processing result image from URL '{result_image_url}': {e}")

    return sorted(filtered_results, key=lambda x: x['combined_similarity'], reverse=True)

# Main function to process both image and text for searching
def main(image_path, text_prompt):
    """Score an image against a prompt, search the web and list the best hits.

    Every intermediate step is printed; nothing is returned.
    """
    print(f"Processing image: {image_path}")
    print(f"Text prompt: {text_prompt}")

    # Embed both inputs; their similarity is printed for reference only
    img_vec = encode_image(image_path)
    txt_vec = encode_text(text_prompt)
    score = compute_similarity(img_vec, txt_vec)
    print(f"Similarity between image and text: {score:.2f}%")

    # The text prompt doubles as the search query
    query = text_prompt
    print(f"Search query: {query}")

    hits = perform_web_search(query)
    print(f"Found {len(hits)} search results")

    # Rank the hits using both the image and the text features
    ranked = filter_results(hits, img_vec, txt_vec)

    print("\nTop filtered results:")
    for rank, hit in enumerate(ranked[:5], start=1):
        print(f"{rank}. Title: {hit['title']}")
        print(f"   URL: {hit['url']}")
        print(f"   Image Similarity: {hit['image_similarity']:.2f}%")
        print(f"   Text Similarity: {hit['text_similarity']:.2f}%")
        print(f"   Combined Similarity: {hit['combined_similarity']:.2f}%")
        print("---")

# Example usage
if __name__ == "__main__":
    # Example run: swap in your own image path and text query
    image_path, text_prompt = "red_hoodie.jpeg", "red hoodie"
    main(image_path, text_prompt)


Using device: cuda
CLIP model loaded successfully
Processing image: red_hoodie.jpeg
Text prompt: red hoodie


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Similarity between image and text: 32.16%
Search query: red hoodie
Found 0 search results

Top filtered results:


# contextual

In [8]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests

def image_to_text(image_path):
    """Generate a caption for the image at ``image_path`` with BLIP.

    The ``image_path`` argument is now honoured; it used to be overwritten by
    a hard-coded file name inside the function. The bare caption is returned
    without a "Generated Caption:" label, so it can be used directly as a
    search query.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption string.
    """
    # Load the BLIP model and processor
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

    # Load the image the caller asked for
    image = Image.open(image_path)

    # Preprocess the image and generate caption
    inputs = processor(image, return_tensors="pt").to(device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs)

    # Decode the generated caption
    caption = processor.decode(generated_ids[0], skip_special_tokens=True)
    return caption

"""def perform_web_search(query):
    url = f"https://api.duckduckgo.com/?q={query}&format=json&pretty=1"
    response = requests.get(url)
    results = response.json()
    return results.get('RelatedTopics', [])"""

import requests

# Define the function for web search using ContextualWebSearch API
def perform_web_search(query, num_results=5):
    """Search the web through the ContextualWebSearch API on RapidAPI.

    Each hit is printed and the list of hits is also returned; the function
    used to return nothing, so callers could not use the results.

    Args:
        query: The search query.
        num_results: Number of results to request (sent as ``pageSize``).

    Returns:
        The list stored under ``'value'`` in the JSON response, or an empty
        list when the request did not return status 200.
    """
    import os  # local import: only needed to read the API key

    url = "https://contextualwebsearch-websearch-v1.p.rapidapi.com/api/Search/WebSearchAPI"

    # Parameters for the search
    params = {
        "q": query,              # The search query
        "pageNumber": "1",       # Page number of the results
        "pageSize": num_results, # Number of results to return
        "autoCorrect": "true"    # Enable autocorrect in the search
    }

    # Read the key from the RAPIDAPI_KEY environment variable so it need not
    # be written into the notebook; the placeholder remains the fallback.
    headers = {
        "X-RapidAPI-Key": os.environ.get("RAPIDAPI_KEY", "YOUR_API_KEY"),
        "X-RapidAPI-Host": "contextualwebsearch-websearch-v1.p.rapidapi.com"
    }

    # Send the request; the timeout keeps a stalled connection from hanging the cell
    response = requests.get(url, headers=headers, params=params, timeout=10)

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return []

    data = response.json()
    results = data.get('value', [])

    # Print title, description and URL of every hit
    for idx, result in enumerate(results, 1):
        title = result.get('title')
        description = result.get('description')
        link = result.get('url')  # separate name: do not shadow the endpoint `url`
        print(f"{idx}. {title}\nDescription: {description}\nURL: {link}\n")

    return results

if __name__ == "__main__":
    # Example usage
    image_path = "red_hoodie.jpeg"  # Replace with your image path
    text_prompt = "search for this types"
    # Put a space between caption and prompt; plain concatenation glued the
    # last caption word onto the first prompt word.
    search_query = image_to_text(image_path) + " " + text_prompt

    perform_web_search(search_query, num_results=5)

# mojeek

In [11]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests

# Function to generate a text caption from an image using BLIP
def image_to_text(image_path):
    """Caption the image at ``image_path`` with BLIP and return the caption string."""
    checkpoint = "Salesforce/blip-image-captioning-base"

    # Run on the GPU when torch reports one
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = BlipProcessor.from_pretrained(checkpoint)
    model = BlipForConditionalGeneration.from_pretrained(checkpoint).to(device)

    # Turn the image into model inputs on the chosen device
    picture = Image.open(image_path)
    model_inputs = processor(picture, return_tensors="pt").to(device)

    # Inference only: no gradients are needed for generation
    with torch.no_grad():
        token_ids = model.generate(**model_inputs)

    # Only the first generated sequence is decoded
    return processor.decode(token_ids[0], skip_special_tokens=True)

# Function to perform web search using Mojeek API
def perform_web_search(query):
    """Search the web through Mojeek, print the hits and return them.

    The function used to return nothing, so callers could not use the results.

    Args:
        query: The search query.

    Returns:
        The list stored under ``'results'`` in the JSON response, or an empty
        list when the request did not return status 200.
    """
    import os  # local import: only needed to read the API key

    # NOTE(review): the recorded run of this cell printed a 404 error; confirm
    # the endpoint, parameter names and auth scheme against Mojeek's API docs.
    url = "https://www.mojeek.com/search"

    params = {
        "q": query,              # The search query
        "count": "5",            # Number of results to return
        "output": "json"         # Specify that the response should be in JSON format
    }

    # Read the key from the MOJEEK_API_KEY environment variable so it need not
    # be written into the notebook; the placeholder remains the fallback.
    api_key = os.environ.get("MOJEEK_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {api_key}"
    }

    # The timeout keeps a stalled connection from hanging the cell
    response = requests.get(url, headers=headers, params=params, timeout=10)

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return []

    data = response.json()
    results = data.get('results', [])

    # Print out the search results
    for idx, result in enumerate(results, 1):
        title = result.get('title')
        description = result.get('snippet')
        link = result.get('url')
        print(f"{idx}. {title}\nDescription: {description}\nURL: {link}\n")

    return results

if __name__ == "__main__":
    # Example run: caption the image, then search for caption + prompt
    image_path = "red_hoodie.jpeg"  # Replace with your image path
    caption = image_to_text(image_path)
    text_prompt = "search for this type of product"

    # The query is the caption and the prompt joined by a single space
    search_query = " ".join([caption, text_prompt])

    perform_web_search(search_query)


Error: 404, {"message":"API doesn't exists"}


# duckduckgo

In [15]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests

def image_to_text(image_path):
    """Generate a caption for the image at ``image_path`` with BLIP.

    The ``image_path`` argument is now honoured; it used to be overwritten by
    a hard-coded file name inside the function.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption string (it is also printed).
    """
    # Load the BLIP model and processor
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

    # Load the image the caller asked for
    image = Image.open(image_path)

    # Preprocess the image and generate caption
    inputs = processor(image, return_tensors="pt").to(device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs)

    # Decode the generated caption
    caption = processor.decode(generated_ids[0], skip_special_tokens=True)
    print(caption)
    return caption

# Define the function for web search using the DuckDuckGo API
def perform_web_search(query):
    """Query the DuckDuckGo API and return the related topics of the answer.

    Args:
        query: Free-text search string.

    Returns:
        The list stored under ``'RelatedTopics'`` in the JSON response, or an
        empty list when that key is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = requests.get(
        "https://api.duckduckgo.com/",
        # Let requests percent-encode the query so characters such as '&' or
        # '#' cannot corrupt the URL.
        params={"q": query, "format": "json", "pretty": 1},
        timeout=10,  # do not let a stalled connection hang the notebook
    )
    response.raise_for_status()
    results = response.json()
    return results.get('RelatedTopics', [])

if __name__ == "__main__":
    # Example usage
    image_path = "red_hoodie.jpeg"  # Replace with your image path
    text_prompt = "search for this types"
    # Put a space between caption and prompt; plain concatenation glued the
    # last caption word onto the first prompt word.
    search_query = image_to_text(image_path) + " " + text_prompt

    search_results = perform_web_search(search_query)
    print(f"Found {len(search_results)} search results")



a red hoodie sweatshirt with a white hoodie
Found 0 search results
