In [11]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from scipy.spatial.distance import cosine

In [12]:
# Coarse key prompts: the same mountain scene at four successive times of day.
prompts = [
    "Beautiful mountain landscape",
    "Beautiful mountain landscape with a setting sun",
    "Beautiful mountain landscape at dusk with a few stars appearing",
    "Beautiful mountain landscape under a full starry night sky",
]

In [13]:
# Fine-grained prompts stepping from a clear daytime sky through sunset to a
# moonlit, starry night. Every entry shares the same opening phrase, so only
# the varying tail of each prompt is spelled out below.
subprompts = [
    "A beautiful mountain landscape" + tail
    for tail in (
        # daylight -> sunset
        " with a bright, clear sky overhead",
        ", the sun beginning its descent from the peak",
        ", the sun inching closer to the horizon, casting longer shadows",
        ", the sun's rays turning a soft golden hue",
        ", the sun halfway set, painting the sky with warm colors",
        ", the sun dipping lower, the sky blushing with oranges and pinks",
        ", the sun barely visible, the sky a canvas of twilight hues",
        ", the sun almost set, the first stars twinkling faintly",
        ", the sun disappearing, the sky a deepening blue",
        ", the sun just set, the afterglow illuminating the horizon",
        # dusk -> night
        ", the sky transitioning from dusk to night, stars becoming more visible",
        ", the night sky taking over, the mountain silhouetted against the stars",
        " under a starry night, the moon beginning to rise",
        # moonlit night
        ", the moon casting a gentle glow over the peaks",
        " at night, the moonlight illuminating the mountain's contours",
        ", the moon high in the sky, the air cool and still",
        " under the moon's soft light, the stars shining brightly",
        ", the moon's glow highlighting the mountain's rugged beauty",
        " at night, the moon casting long, dramatic shadows",
        ", the moon reaching its zenith, the night at its darkest",
        " under the moon's watchful eye, the night calm and serene",
        ", the moon's glow painting the mountain a silvery blue",
        " at night, the moon's light reflecting off the mountain's snowy peaks",
        ", the moon's glow creating a tranquil atmosphere",
        " at night, the moon's light casting a magical aura",
        ", the moon's glow illuminating the mountain's majestic form",
        " at night, the moon's light creating a peaceful ambiance",
        ", the moon's glow highlighting the mountain's natural beauty",
        " at night, the moon's light casting a serene glow",
        ", the moon's glow illuminating the mountain's grandeur",
        " at night, the moon's light creating a picturesque scene",
        ", the moon's glow highlighting the mountain's splendor under the starry night sky",
    )
]


In [14]:
# Load the tokenizer and the encoder from the same pretrained checkpoint so
# the vocabulary and the model weights are guaranteed to match.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [15]:
def get_bert_embeddings(sentence):
    """Return a mean-pooled BERT embedding for ``sentence``.

    The text is tokenized with the module-level ``tokenizer``, run through the
    module-level ``model`` without gradient tracking, and the final hidden
    states are averaged over the token axis.

    Args:
        sentence: A string, or a list of strings to embed as one padded batch.

    Returns:
        A NumPy array holding the pooled embedding. Because of the final
        ``squeeze()``, a single sentence yields a 1-D vector while a batch of
        several sentences yields one row per sentence.

    Notes:
        * Input longer than 512 tokens triggers a printed warning and is then
          truncated, instead of being passed to the model as-is (which fails
          because BERT only has 512 position embeddings).
        * Pooling is weighted by the attention mask so padding tokens in a
          batch do not dilute the average. For a single sentence the mask is
          all ones, so this is the same as a plain mean over tokens.
    """
    max_tokens = 512  # sequence-length limit of the bert-base-uncased checkpoint

    # First pass without truncation so over-long input can be detected and reported.
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=False)

    if inputs['input_ids'].shape[1] > max_tokens:
        print(f"Warning: The sentence exceeds the maximum token limit of {max_tokens} tokens; truncating.")
        # Re-tokenize with truncation so the special tokens stay in place and
        # the model receives a sequence it can actually process.
        inputs = tokenizer(sentence, return_tensors='pt', padding=True,
                           truncation=True, max_length=max_tokens)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Attention-mask-weighted mean over the token axis.
    hidden = outputs.last_hidden_state
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # avoid dividing by zero
    pooled = (hidden * mask).sum(dim=1) / token_counts

    embeddings = pooled.squeeze().numpy()
    return embeddings


In [16]:
# Cosine similarity between each pair of consecutive key prompts.
# Embed every prompt exactly once; the pairwise loop below then reuses the
# cached vectors instead of running BERT twice for each interior prompt.
prompt_embeddings = [get_bert_embeddings(text) for text in prompts]

# scipy's `cosine` is a distance, so similarity = 1 - distance.
prompt_similarities = [
    1 - cosine(left, right)
    for left, right in zip(prompt_embeddings, prompt_embeddings[1:])
]

for pair_no, similarity in enumerate(prompt_similarities, start=1):
  print(f"Cosine Similarity {pair_no}: {similarity:.4f}")

total = sum(prompt_similarities)

# With fewer than two prompts there are no pairs, so there is nothing to average.
if prompt_similarities:
  print(f"Avg cosine Similarity: {(total/(len(prompts)-1)):.4f}")
else:
  print("Avg cosine Similarity: n/a (need at least two prompts)")

Cosine Similarity 1: 0.8813
Cosine Similarity 2: 0.9003
Cosine Similarity 3: 0.8853
Avg cosine Similarity: 0.8890


In [17]:
# Cosine similarity between each pair of consecutive fine-grained subprompts.
# Embed every subprompt exactly once; the pairwise loop below then reuses the
# cached vectors instead of running BERT twice for each interior subprompt.
subprompt_embeddings = [get_bert_embeddings(text) for text in subprompts]

# scipy's `cosine` is a distance, so similarity = 1 - distance.
subprompt_similarities = [
    1 - cosine(left, right)
    for left, right in zip(subprompt_embeddings, subprompt_embeddings[1:])
]

for pair_no, similarity in enumerate(subprompt_similarities, start=1):
  print(f"Cosine Similarity {pair_no}: {similarity:.4f}")

total = sum(subprompt_similarities)

# With fewer than two subprompts there are no pairs, so there is nothing to average.
if subprompt_similarities:
  print(f"Avg cosine Similarity: {(total/(len(subprompts)-1)):.4f}")
else:
  print("Avg cosine Similarity: n/a (need at least two prompts)")

Cosine Similarity 1: 0.8491
Cosine Similarity 2: 0.8934
Cosine Similarity 3: 0.8944
Cosine Similarity 4: 0.9297
Cosine Similarity 5: 0.9201
Cosine Similarity 6: 0.8982
Cosine Similarity 7: 0.8934
Cosine Similarity 8: 0.9156
Cosine Similarity 9: 0.9012
Cosine Similarity 10: 0.8484
Cosine Similarity 11: 0.8999
Cosine Similarity 12: 0.8844
Cosine Similarity 13: 0.8817
Cosine Similarity 14: 0.9229
Cosine Similarity 15: 0.9007
Cosine Similarity 16: 0.9260
Cosine Similarity 17: 0.8971
Cosine Similarity 18: 0.8790
Cosine Similarity 19: 0.9092
Cosine Similarity 20: 0.9039
Cosine Similarity 21: 0.8682
Cosine Similarity 22: 0.9354
Cosine Similarity 23: 0.9043
Cosine Similarity 24: 0.9537
Cosine Similarity 25: 0.9288
Cosine Similarity 26: 0.9081
Cosine Similarity 27: 0.9220
Cosine Similarity 28: 0.9272
Cosine Similarity 29: 0.9343
Cosine Similarity 30: 0.9251
Cosine Similarity 31: 0.9138
Avg cosine Similarity: 0.9055
