In [13]:
import openai
import clip
import torch
from PIL import Image
import os
import spacy


In [10]:
# Install the small English spaCy pipeline only when it is missing, so that
# re-running this cell does not fetch the ~13 MB wheel again. The install goes
# through spaCy's own Python API instead of `!python`, because the `python` on
# the shell PATH is not guaranteed to be the interpreter backing this kernel.
SPACY_MODEL = "en_core_web_sm"
if not spacy.util.is_package(SPACY_MODEL):
    spacy.cli.download(SPACY_MODEL)

# Load the pre-trained spaCy model
nlp = spacy.load(SPACY_MODEL)


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 435.7 kB/s eta 0:00:30
     --------------------------------------- 0.0/12.8 MB 326.8 kB/s eta 0:00:40
     --------------------------------------- 0.1/12.8 MB 655.4 kB/s eta 0:00:20
      --------------------------------------- 0.2/12.8 MB 1.1 MB/s eta 0:00:11
     - -------------------------------------- 0.5/12.8 MB 2.0 MB/s eta 0:00:07
     --- ------------------------------------ 1.0/12.8 MB 3.4 MB/s eta 0:00:04
     ---- ----------------------------------- 1.5/12.8 MB 4.5 MB/s eta 0:00:03
     ------ --------------------------------- 2.0/12.8 MB 5.4 MB/s eta 0:00:03
     ------- -------------------------------- 2.5/12.8 MB 5.8 MB/s eta 0:00:02
     --------- -----------------------

In [40]:
# Score how well one image matches two pieces of text (a user prompt and a
# story description) using CLIP embeddings and cosine similarity.

# Single place to choose the compute device for the model and every tensor.
device = 'cpu'

# Load the CLIP model and its matching image preprocessing transform
model, preprocess = clip.load("ViT-B/32", device=device)

# Load the NLP model (spaCy)
nlp = spacy.load("en_core_web_sm")

# Prepare the image. The context manager closes the underlying file as soon
# as preprocessing has produced the tensor; unsqueeze(0) adds the batch axis.
image_path = "Example Data-20240208T214429Z-001/Example Data/exported/(1) 18380579401063495.png"
with Image.open(image_path) as raw_image:
    image = preprocess(raw_image).unsqueeze(0).to(device)

# Accept user input for text prompt and smart story description
user_text_prompt = "I love my motor bike"
smart_story_description = "Motor Bike Club"

# Process the texts with spaCy for named entity recognition (NER)
user_doc = nlp(user_text_prompt)
story_doc = nlp(smart_story_description)

# Extract entities, or use the original text if no entities are found
user_entities = ' '.join(ent.text for ent in user_doc.ents) if user_doc.ents else user_text_prompt
story_entities = ' '.join(ent.text for ent in story_doc.ents) if story_doc.ents else smart_story_description

# Tokenize the refined texts. truncate=True clips text that exceeds CLIP's
# context window instead of raising, since the prompt is free-form user input.
text_inputs = clip.tokenize([user_entities, story_entities], truncate=True).to(device)

# Calculate the features with CLIP (inference only, so no gradients are needed)
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_inputs)

    # Normalize the features to unit vectors so the dot product below is a cosine
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

# Compute cosine similarity: one score per text, in the order they were tokenized
cosine_similarities = torch.matmul(image_features, text_features.T).cpu().numpy().flatten()


# Print the cosine similarity scores
print("Cosine  Similarity Score -1 to 1")
print("1 indicates identical directionality (very similar),")
print("0 indicates orthogonality (not similar),")
print("and -1 indicates opposite directionality (very dissimilar).")
print(f"Cosine similarity score for user text prompt: {cosine_similarities[0]}")
print(f"Cosine similarity score for smart story description: {cosine_similarities[1]}")

Cosine  Similarity Score -1 to 1
1 indicates identical directionality (very similar),
0 indicates orthogonality (not similar),
and -1 indicates opposite directionality (very dissimilar).
Cosine similarity score for user text prompt: 0.26629164814949036
Cosine similarity score for smart story description: 0.25851428508758545
