In [12]:
import clip
import torch
from PIL import Image
import spacy


In [13]:
# Load the pre-trained spaCy model
nlp = spacy.load("en_core_web_sm")


In [25]:
# Load the CLIP model onto the CPU
model, preprocess = clip.load("ViT-B/32", device='cpu')

# Load the NLP model (spaCy)
nlp = spacy.load("en_core_web_sm")

# Setting up a base directory for the images 
BASE_IMG_DIR = "Example Data-20240208T214429Z-001\Example Data\exported"
image_name = "IMG_3155.JPG"
# Prepare the image
image_path = f"{BASE_IMG_DIR}/{image_name}"

image = preprocess(Image.open(image_path)).unsqueeze(0).to('cpu')

# Accept user input for text prompt and smart story description
user_text_prompt = "a photo of people standing outside"
smart_story_description = "Three individuals outdoors, smiling at the camera and pointing at a park bench."

# Process the texts with spaCy for named entity recognition (NER)
user_doc = nlp(user_text_prompt)
story_doc = nlp(smart_story_description)

# Extract entities, or use the original text if no entities are found
user_entities = ' '.join([ent.text for ent in user_doc.ents]) if user_doc.ents else user_text_prompt
story_entities = ' '.join([ent.text for ent in story_doc.ents]) if story_doc.ents else smart_story_description

# Tokenize the refined texts
text_inputs = clip.tokenize([user_entities, story_entities]).to('cpu')

# Calculate the features with CLIP
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_inputs)

    # Normalize the features to unit vectors
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

# Compute cosine similarity
cosine_similarities = torch.matmul(image_features, text_features.T).cpu().numpy().flatten()

# Print the cosine similarity scores
print("Cosine  Similarity Score -1 to 1")
print("1 indicates identical directionality (very similar),")
print("0 indicates orthogonality (not similar),")
print("and -1 indicates opposite directionality (very dissimilar)."  )
print(f"Cosine similarity score for user text prompt: {cosine_similarities[0]}")
print(f"Cosine similarity score for smart story description: {cosine_similarities[1]}")

  BASE_IMG_DIR = "Example Data-20240208T214429Z-001\Example Data\exported"


Cosine  Similarity Score -1 to 1
1 indicates identical directionality (very similar),
0 indicates orthogonality (not similar),
and -1 indicates opposite directionality (very dissimilar).
Cosine similarity score for user text prompt: 0.24764074385166168
Cosine similarity score for smart story description: 0.18213334679603577


In [24]:
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import os
import numpy as np

class LoadCocoDataset(Dataset):
    def __init__(self, image_dir, label_dir, transform=None):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.transform = transform

        # List all image files in the directory
        self.image_files = [f for f in os.listdir(image_dir) if f.endswith('.jpg') or f.endswith('.png')]
    
    def __len__(self):
        return len(self.image_files)
    
    def __getitem__(self, idx):
        # Load image
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        image = Image.open(img_path).convert("RGB")
        
        # Load label
        label_path = os.path.join(self.label_dir, self.image_files[idx].replace('.jpg', '.txt').replace('.png', '.txt'))
        
        # Assuming the labels are space-separated values: class x_center y_center width height
        labels = []
        if os.path.exists(label_path):
            with open(label_path, 'r') as file:
                for line in file:
                    labels.append(np.fromstring(line, sep=' '))

        labels = np.array(labels)  # Convert to a NumPy array

        if self.transform:
            image = self.transform(image)
        
        return image, labels

# Define your transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Initialize your dataset
coco_dataset = LoadCocoDataset(
    image_dir='datasets/coco128/images',
    label_dir='datasets/coco128/labels/train2017',
    transform=transform
)

# Initialize DataLoader
data_loader = DataLoader(coco_dataset, batch_size=4, shuffle=True)


0


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [None]:
for images, labels in data_loader:
    print(images, labels)# - images: a batch of images
    # - labels: the corresponding labels for each image
    
