In [2]:
import numpy as np
import pandas as pd
from openai import OpenAI
import spacy
import string

In [4]:
df = pd.read_csv('Descriptions.csv')
df

Unnamed: 0,Image,Description,Class
0,n07718472_1000.JPEG,Pickles submerged in brine with fresh dill.,cucumber
1,n07718472_1001.JPEG,"Shrimp salad with cucumbers, radishes, and herbs.",cucumber
2,n07718472_1002.JPEG,Close-up of seeds within a fruit.,cucumber
3,n07718472_10050.JPEG,Whole cucumber with sliced pieces arranged nea...,cucumber
4,n07718472_1006.JPEG,Cucumber and cherry tomatoes on a countertop.,cucumber
...,...,...,...
2355,n07716358_9983.JPEG,Hand holding a large green zucchini outside.,zucchini
2356,n07716358_9984.JPEG,Large green zucchini among garden foliage.,zucchini
2357,n07716358_9995.JPEG,Fresh green zucchinis stacked on a table.,zucchini
2358,n07716358_9996.JPEG,Stuffed zucchini with ground meat and basil.,zucchini


In [None]:
nlp = spacy.load('en_core_web_sm')
def preprocess_text_spacy(text):
    doc = nlp(text)
    lemmatized_tokens = []
    for token in doc:
        # Use token.lemma_ but handle pronouns and punctuation properly
        if token.pos_ == 'PRON':
            lemmatized_tokens.append(token.text)
        else:
            lemmatized_tokens.append(token.lemma_)
    return ' '.join(lemmatized_tokens)

df['Description_processed'] = df['Description'].apply(preprocess_text_spacy)
df['Description_processed'] = df['Description_processed'].str.replace(f"[{string.punctuation}]", "", regex=True)
df['Description_processed'] = df['Description_processed'].str.replace(r'\s{2,}', ' ', regex=True)

In [None]:
client = OpenAI(api_key='API_KEY')

def get_embeddings(sentence, model="text-embedding-3-small"):
    words = sentence.split()
    embeddings = []

    for word in words:
        try:
            response = client.embeddings.create(input=[word], model=model)
            word_embedding = np.array(response.data[0].embedding)
            embeddings.append(word_embedding)
        except Exception as e:
            print(f"Error processing word '{word}': {e}")

    return embeddings

def average_embeddings(embeddings):
    if embeddings:
        return np.mean(embeddings, axis=0)

df['Embeddings'] = df['Description_processed'].apply(lambda x: get_embeddings(x))
# Then, compute the average embedding for each sentence
df['Sentence_Average_Embeddings'] = df['Embeddings'].apply(average_embeddings)


In [None]:
df.to_csv('Embeddings.csv', index=False)

In [4]:
df_embeddings = pd.read_csv('Embeddings.csv')
df_embeddings

Unnamed: 0,Image,Description,Class,Description_processed,Sentence_Average_Embeddings,Embeddings
0,n07718472_1000.JPEG,Pickles submerged in brine with fresh dill.,cucumber,pickle submerge in brine with fresh dill .,[-2.05546785e-02 2.83203125e-02 8.56410414e-...,"[array([ 1.38092041e-03, 9.32617188e-02, 3.3..."
1,n07718472_1001.JPEG,"Shrimp salad with cucumbers, radishes, and herbs.",cucumber,"shrimp salad with cucumber , radish , and herb .",[-0.0916748 0.12235514 0.00669352 0.258626...,"[array([-0.27539062, 0.04907227, -0.13671875,..."
2,n07718472_1002.JPEG,Close-up of seeds within a fruit.,cucumber,close - up of seed within a fruit .,[-4.71740738e-02 1.14013674e-02 1.08154297e-...,"[array([-0.00320435, -0.07080078, 0.08984375,..."
3,n07718472_10050.JPEG,Whole cucumber with sliced pieces arranged nea...,cucumber,whole cucumber with sliced piece arrange neatly .,[-4.93512815e-03 2.09437776e-02 -6.33283332e-...,"[array([ 0.07519531, -0.0189209 , -0.00537109,..."
4,n07718472_1006.JPEG,Cucumber and cherry tomatoes on a countertop.,cucumber,Cucumber and cherry tomato on a countertop .,[-1.42114252e-01 1.18408201e-03 -1.26953120e-...,"[array([-1.93359375e-01, -1.76757812e-01, -1.5..."
...,...,...,...,...,...,...
263,n07718472_13944.JPEG,Grilled chicken with mixed vegetables on plate.,cucumber,grill chicken with mixed vegetable on plate .,[-7.32596293e-02 4.72760871e-02 7.58579792e-...,"[array([-1.37329102e-02, 2.89062500e-01, 1.0..."
264,n07718472_13950.JPEG,"Two fresh, green cucumbers with water droplets.",cucumber,"two fresh , green cucumber with water droplet .",[ 1.35323657e-02 5.25599904e-02 8.89500231e-...,"[array([ 0.03173828, -0.10644531, 0.00241089,..."
265,n07718472_13958.JPEG,Grilled chicken breast with cucumber and carrots.,cucumber,grill chicken breast with cucumber and carrot .,[-0.04220581 0.1983846 0.05832927 0.146972...,"[array([-1.37329102e-02, 2.89062500e-01, 1.0..."
266,n07718472_13974.JPEG,Fresh green cucumbers arranged artistically on...,cucumber,fresh green cucumber arrange artistically on d...,[-2.24958151e-03 3.20870541e-02 -5.35365520e-...,"[array([-0.04223633, 0.01806641, 0.22070312,..."
