In [36]:
import pandas as pd

# Step 1: Load the ImageLabels.xlsx file into a pandas dataframe.
LABELS_DIR_PATH = 'Example Data-20240208T214429Z-001\Example Data\ImageLabels.xlsx'
df = pd.read_excel(LABELS_DIR_PATH, usecols=['Image Name', 'Confidence', 'Instance Count', 'Label'])

# Step 2: Process the dataframe.
# Remove labels with a confidence value < 80 and instance count of 0.
filtered_df = df[(df['Confidence'] >= 80) & (df['Instance Count'] > 0)]

# Append the instance counts to the label.
filtered_df['Labels'] = filtered_df['Instance Count'].astype(str) + ' ' + filtered_df['Label']

# Aggregate labels for each image into a single row.
aggregated_df = filtered_df.groupby('Image Name')['Labels'].apply(lambda x: ', '.join(x)).reset_index()

aggregated_df.shape # Checking the size of the labels file. 
aggregated_df.head(5)

  LABELS_DIR_PATH = 'Example Data-20240208T214429Z-001\Example Data\ImageLabels.xlsx'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Labels'] = filtered_df['Instance Count'].astype(str) + ' ' + filtered_df['Label']


Unnamed: 0,Image Name,Labels
0,(1) @GreyCupFestival - 109th Grey Cup.jpeg,"2 Adult, 1 Female, 7 Person, 1 Woman, 1 Male, ..."
1,(10) 17887803224903630.jpeg,"1 Horse, 1 Person, 1 Adult, 1 Female, 1 Woman"
2,(12) 17985809330117499.jpeg,"1 Person, 1 Helmet, 1 Motorcycle"
3,(13) 18013990822817757.jpeg,3 Passport
4,(14) 17993584322154200.jpeg,"2 Person, 6 Bird, 1 Glove"


In [37]:
# Convert to numpy array
labels_array = aggregated_df.to_numpy()

# Seems to make help the model make better predictions.
LABELS_CONTEXT_STRING = 'A picture containing: '
# Convert to dictionary
labels_dict = dict(zip(aggregated_df['Image Name'], LABELS_CONTEXT_STRING + aggregated_df['Labels']))

print(labels_dict)

{'(1) @GreyCupFestival - 109th Grey Cup.jpeg': 'A picture containing: 2 Adult, 1 Female, 7 Person, 1 Woman, 1 Male, 1 Man, 1 Helmet, 1 Coat, 1 Shoe', '(10) 17887803224903630.jpeg': 'A picture containing: 1 Horse, 1 Person, 1 Adult, 1 Female, 1 Woman', '(12) 17985809330117499.jpeg': 'A picture containing: 1 Person, 1 Helmet, 1 Motorcycle', '(13) 18013990822817757.jpeg': 'A picture containing: 3 Passport', '(14) 17993584322154200.jpeg': 'A picture containing: 2 Person, 6 Bird, 1 Glove', '(16) 18379894042056715.jpeg': 'A picture containing: 2 Person, 2 Shaker, 1 Adult, 1 Female, 1 Woman, 1 Baseball Cap', '(17) 17876557646942156.jpeg': 'A picture containing: 3 Person, 2 Adult, 2 Male, 2 Man, 1 Glove, 1 Helmet, 4 Shoe', '(18) 17956303754673865.jpeg': 'A picture containing: 1 Helmet, 1 Person, 1 Adult, 1 Male, 1 Man, 2 Glove', '(19) 18027245935589987.jpeg': 'A picture containing: 1 Helmet, 1 Adult, 1 Male, 1 Man, 1 Person, 2 Glove, 2 Shoe', '(2) 18040499875507660.jpeg': 'A picture containing

In [38]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# Example user input and smart story description for a given image
user_text_input = "A picture of a person on a motorcycle."
smart_story_description = "Adam enjoying the beautiful day on his motorcycle."

# Assuming you're working with a single image and its labels
image_name = "(12) 17985809330117499.jpeg" # motorcycle image, just for testing
image_labels = labels_dict.get(image_name, "")

# Preprocess and tokenize
texts = [user_text_input, smart_story_description, image_labels]

In [40]:
def get_sentence_embeddings(texts, model, tokenizer):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states  # Get hidden states
        # Use the embeddings of the last layer's `[CLS]` token for sentence representation
        sentence_embedding = hidden_states[-1][:, 0, :].squeeze().detach().numpy()
        embeddings.append(sentence_embedding)
    return embeddings

In [41]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Assuming you've obtained embeddings
embeddings = get_sentence_embeddings(texts, model, tokenizer)

# Compute cosine similarity
# Note: embeddings[0] is user_text_input, embeddings[1] is smart_story_description, embeddings[2] is image_labels
similarity_input_label = cosine_similarity([embeddings[0]], [embeddings[2]])[0][0]
similarity_story_label = cosine_similarity([embeddings[1]], [embeddings[2]])[0][0]

print(f"Similarity between User Text Input and Labels: {similarity_input_label}")
print(f"Similarity between Smart Story Description and Labels: {similarity_story_label}")


Similarity between User Text Input and Labels: 0.8478407859802246
Similarity between Smart Story Description and Labels: 0.726824164390564
