In [1]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

In [2]:
# Load the pretrained CLIP model and its processor.
# Both are loaded from one checkpoint constant so they cannot drift apart
# if the checkpoint is ever changed.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

In [3]:
# Candidate text prompts, one per language, that the image is scored against.
language_prompts = [
    f"This image contains text in {language}"
    for language in ("English", "Spanish", "French", "Korean", "Chinese", "Arabic")
]

In [4]:
def preprocess_image(image_path):
    """Load an image from disk and turn it into inputs for the CLIP model.

    Args:
        image_path: Path to an image file that PIL can open.

    Returns:
        Whatever the module-level ``processor`` returns for a single RGB
        image with ``return_tensors="pt"``; it is unpacked with ``**`` into
        ``model.get_image_features`` by the caller.
    """
    # The context manager closes the underlying file as soon as the pixel
    # data has been copied out; convert() returns an independent image, so
    # it stays usable after the file is closed.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    # `padding` is a tokenizer (text) option and has no meaning for an
    # image-only call, so it is not passed here.
    return processor(images=image, return_tensors="pt")


In [5]:
def detect_language(image_path, language_prompts):
    """Pick the prompt whose CLIP text embedding best matches the image.

    Args:
        image_path: Path to the image to classify.
        language_prompts: Non-empty sequence of text prompts, one per
            candidate language.

    Returns:
        A ``(prompt, score)`` tuple: the best-matching entry of
        ``language_prompts`` and its cosine similarity to the image as a
        Python float. The score is a raw similarity, not a probability.

    Raises:
        ValueError: If ``language_prompts`` is empty.
    """
    if not language_prompts:
        raise ValueError("language_prompts must contain at least one prompt")

    inputs = preprocess_image(image_path)
    text_inputs = processor(text=language_prompts, return_tensors="pt", padding=True)

    # Pure inference: disabling autograd avoids building a graph and keeping
    # activations alive for a backward pass that never happens.
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
        text_features = model.get_text_features(**text_inputs)

    # L2-normalise both sides so the dot product below is a cosine similarity.
    image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

    # Drop only the leading single-image axis. A bare squeeze() would also
    # collapse the prompt axis when there is exactly one prompt, leaving a
    # 0-d tensor that cannot be indexed below.
    similarities = (image_features @ text_features.T).squeeze(0)

    # Identify the most similar prompt.
    best_match_idx = similarities.argmax().item()
    return language_prompts[best_match_idx], similarities[best_match_idx].item()


In [6]:
# Run the CLIP-based detector on one sample image and report the best prompt.
image_path = "sign.jpg"  # Replace with your image path
detected_language, confidence = detect_language(image_path, language_prompts)
print(f"Detected Language: {detected_language}")
# NOTE: "confidence" is the raw cosine similarity returned by detect_language,
# not a probability, so it should only be compared between prompts.
print(f"Confidence: {confidence}")


Detected Language: This image contains text in English
Confidence: 0.20702892541885376


In [8]:
import cv2
import pytesseract
# Locate the tesseract executable without tying the notebook to one machine:
#   1. an explicit TESSERACT_CMD environment variable, if set;
#   2. a `tesseract` binary found on PATH;
#   3. the default Windows install location used originally.
import os
import shutil

pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get("TESSERACT_CMD")
    or shutil.which("tesseract")
    or r'C:/Program Files/Tesseract-OCR/tesseract.exe'
)

def extract_text_regions(image_path):
    """Run Tesseract on an image and return its character bounding boxes.

    Args:
        image_path: Path to the image file to analyse.

    Returns:
        The string produced by ``pytesseract.image_to_boxes`` for the
        grayscale image: one line per recognised character, followed by four
        box coordinates and a trailing number. No cropping is done here.
        NOTE(review): coordinates presumably follow Tesseract's box-file
        convention (origin at the bottom-left) -- confirm before drawing.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise show up as an obscure error inside cvtColor.
    if image is None:
        raise FileNotFoundError(
            f"Could not read image (missing or not a readable image): {image_path}"
        )
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Get per-character bounding boxes as text.
    return pytesseract.image_to_boxes(gray)
# TODO (open question): use these boxes in the pipeline to crop out the parts
# of the image that contain text, pass the crops in (presumably to
# detect_language), and use the coordinates to draw the detected language
# onto the image.
print(extract_text_regions(image_path))

‘ 19 91 77 183 0
o 99 108 129 137 0
e 58 86 275 183 0
i 99 2 200 137 0
T 46 46 69 90 0
t 64 43 75 90 0
0 83 56 109 95 0
4 118 39 124 60 0
3 127 42 137 65 0
0 139 48 150 71 0
0 153 54 164 77 0

