In [23]:
import clip
import torch
from PIL import Image
import spacy
import pandas as  pd

In [17]:
# Load the pre-trained spaCy model
nlp = spacy.load("en_core_web_sm")


In [18]:
# Load the CLIP model onto the CPU
model, preprocess = clip.load("ViT-B/32", device='cpu')

# Load the NLP model (spaCy)
nlp = spacy.load("en_core_web_sm")

# Setting up a base directory for the images 
BASE_IMG_DIR = "Example Data-20240208T214429Z-001\Example Data\exported"
image_name = "IMG_3155.JPG"
# Prepare the image
image_path = f"{BASE_IMG_DIR}/{image_name}"

image = preprocess(Image.open(image_path)).unsqueeze(0).to('cpu')

# Accept user input for text prompt and smart story description
user_text_prompt = "a photo of people standing outside"
smart_story_description = "Three individuals outdoors, smiling at the camera and pointing at a park bench."

# Process the texts with spaCy for named entity recognition (NER)
user_doc = nlp(user_text_prompt)
story_doc = nlp(smart_story_description)

# Extract entities, or use the original text if no entities are found
user_entities = ' '.join([ent.text for ent in user_doc.ents]) if user_doc.ents else user_text_prompt
story_entities = ' '.join([ent.text for ent in story_doc.ents]) if story_doc.ents else smart_story_description

# Tokenize the refined texts
text_inputs = clip.tokenize([user_entities, story_entities]).to('cpu')

# Calculate the features with CLIP
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_inputs)

    # Normalize the features to unit vectors
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

# Compute cosine similarity
cosine_similarities = torch.matmul(image_features, text_features.T).cpu().numpy().flatten()

# Print the cosine similarity scores
print("Cosine  Similarity Score -1 to 1")
print("1 indicates identical directionality (very similar),")
print("0 indicates orthogonality (not similar),")
print("and -1 indicates opposite directionality (very dissimilar)."  )
print(f"Cosine similarity score for user text prompt: {cosine_similarities[0]}")
print(f"Cosine similarity score for smart story description: {cosine_similarities[1]}")

  BASE_IMG_DIR = "Example Data-20240208T214429Z-001\Example Data\exported"


Cosine  Similarity Score -1 to 1
1 indicates identical directionality (very similar),
0 indicates orthogonality (not similar),
and -1 indicates opposite directionality (very dissimilar).
Cosine similarity score for user text prompt: 0.2476407289505005
Cosine similarity score for smart story description: 0.18213330209255219


In [19]:
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import os
import numpy as np

class LoadCocoDataset(Dataset):
    def __init__(self, image_dir, label_dir, transform=None):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.transform = transform

        # List all image files in the directory
        self.image_files = [f for f in os.listdir(image_dir) if f.endswith('.jpg') or f.endswith('.png')]
    
    def __len__(self):
        return len(self.image_files)
    
    def __getitem__(self, idx):
        # Load image
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        image = Image.open(img_path).convert("RGB")
        
        # Load label
        label_path = os.path.join(self.label_dir, self.image_files[idx].replace('.jpg', '.txt').replace('.png', '.txt'))
        
        # Assuming the labels are space-separated values: class x_center y_center width height
        labels = []
        if os.path.exists(label_path):
            with open(label_path, 'r') as file:
                for line in file:
                    labels.append(np.fromstring(line, sep=' '))

        labels = np.array(labels)  # Convert to a NumPy array

        if self.transform:
            image = self.transform(image)
        
        return image, labels

# Define your transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Initialize your dataset
coco_dataset = LoadCocoDataset(
    image_dir='datasets/coco128/images',
    label_dir='datasets/coco128/labels/train2017',
    transform=transform
)

# Initialize DataLoader
data_loader = DataLoader(coco_dataset, batch_size=4, shuffle=True)


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [20]:
for images, labels in data_loader:
    print(images, labels)# - images: a batch of images
    # - labels: the corresponding labels for each image
    


NameError: name 'data_loader' is not defined

In [26]:
df = pd.read_excel('Example Data-20240208T214429Z-001\Example Data\ImageLabels.xlsx', usecols=['Image Name', 'Confidence', 'Instance Count', 'Label'])

# Append the instance counts >=1 to the label, e.g: '1 Human'

# Filter based on the criteria
filtered_df = df[(df['Confidence'] >= 80) & (df['Instance Count'] > 0)]

# Append instance count to label
filtered_df['Label'] = filtered_df['Instance Count'].astype(str) + ' ' + filtered_df['Label']

# Group by 'Image Name' and aggregate labels into a list
aggregated_df = filtered_df.groupby('Image Name')['Label'].apply(list).reset_index()

# Renaming 'Label' column to 'Labels' to reflect the aggregated data
aggregated_df.columns = ['Image Name', 'Labels']

# Display the aggregated dataframe
aggregated_df.head()

  df = pd.read_excel('Example Data-20240208T214429Z-001\Example Data\ImageLabels.xlsx', usecols=['Image Name', 'Confidence', 'Instance Count', 'Label'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Label'] = filtered_df['Instance Count'].astype(str) + ' ' + filtered_df['Label']


Unnamed: 0,Image Name,Labels
0,(1) @GreyCupFestival - 109th Grey Cup.jpeg,"[2 Adult, 1 Female, 7 Person, 1 Woman, 1 Male,..."
1,(10) 17887803224903630.jpeg,"[1 Horse, 1 Person, 1 Adult, 1 Female, 1 Woman]"
2,(12) 17985809330117499.jpeg,"[1 Person, 1 Helmet, 1 Motorcycle]"
3,(13) 18013990822817757.jpeg,[3 Passport]
4,(14) 17993584322154200.jpeg,"[2 Person, 6 Bird, 1 Glove]"


In [27]:
labels_dict = aggregated_df.set_index('Image Name').to_dict()['Labels']

# Show the first few elements of the dictionary to verify
list(labels_dict.items())[:5]

[('(1) @GreyCupFestival - 109th Grey Cup.jpeg',
  ['2 Adult',
   '1 Female',
   '7 Person',
   '1 Woman',
   '1 Male',
   '1 Man',
   '1 Helmet',
   '1 Coat',
   '1 Shoe']),
 ('(10) 17887803224903630.jpeg',
  ['1 Horse', '1 Person', '1 Adult', '1 Female', '1 Woman']),
 ('(12) 17985809330117499.jpeg', ['1 Person', '1 Helmet', '1 Motorcycle']),
 ('(13) 18013990822817757.jpeg', ['3 Passport']),
 ('(14) 17993584322154200.jpeg', ['2 Person', '6 Bird', '1 Glove'])]

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

# Assuming 'user_text_prompt' and 'smart_story_descriptions' are provided as examples
user_text_prompt = "My new motor bike"
smart_story_descriptions = [
    "Motor Bike club"
]

# Combine the text inputs with image labels for tokenization
# First, convert the labels dictionary into a single string per image
labels_texts = ['; '.join(labels) for labels in labels_dict.values()]

# Combine all texts into a single list for vectorization
all_texts = [user_text_prompt] + smart_story_descriptions + labels_texts

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Tokenize and vectorize the texts
X = vectorizer.fit_transform(all_texts)

# Convert the result to an array to see the tokenized form
tokenized_array = X.toarray()

# Show the shape of the tokenized array as a simple verification step
tokenized_array.shape

(29, 49)

In [32]:
from transformers import AutoTokenizer, AutoModel
import torch

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

def encode_texts(texts):
    """
    Encode a list of texts into embeddings using a pre-trained BERT model.
    """
    # Tokenize the texts
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Generate embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Use the pooled output for sentence embeddings
    embeddings = model_output.pooler_output
    return embeddings

# Assuming 'all_texts' contains our user_text_prompt, smart_story_descriptions, and image labels combined
embeddings = encode_texts(all_texts)

# Compute cosine similarity between embeddings
# Assuming we have the embeddings for the user_text_prompt and all image labels
cosine_similarities = cosine_similarity(embeddings[0].reshape(1, -1), embeddings[1:].detach().numpy())

# Find the most similar image
max_similarity_index = cosine_similarities.argmax()
most_similar_image_name = list(labels_dict.keys())[max_similarity_index]

# Display the most similar image name and its similarity score
most_similar_image_name, cosine_similarities[0, max_similarity_index]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


('(14) 17993584322154200.jpeg', 0.96890765)