In [2]:
import numpy as np
import pandas as pd
from openai import OpenAI
import spacy
import string

In [4]:
# Load the image descriptions from the CSV in the working directory.
df = pd.read_csv('Descriptions.csv')
# NOTE(review): the "Unnamed: 0" column in the displayed frame looks like a
# previously saved index — confirm, and consider reading with index_col=0.
df

Unnamed: 0,Image,Description,Class
0,n07718472_1000.JPEG,Pickles submerged in brine with fresh dill.,cucumber
1,n07718472_1001.JPEG,"Shrimp salad with cucumbers, radishes, and herbs.",cucumber
2,n07718472_1002.JPEG,Close-up of seeds within a fruit.,cucumber
3,n07718472_10050.JPEG,Whole cucumber with sliced pieces arranged nea...,cucumber
4,n07718472_1006.JPEG,Cucumber and cherry tomatoes on a countertop.,cucumber
...,...,...,...
2355,n07716358_9983.JPEG,Hand holding a large green zucchini outside.,zucchini
2356,n07716358_9984.JPEG,Large green zucchini among garden foliage.,zucchini
2357,n07716358_9995.JPEG,Fresh green zucchinis stacked on a table.,zucchini
2358,n07716358_9996.JPEG,Stuffed zucchini with ground meat and basil.,zucchini


In [None]:
# Load the spaCy pipeline once at module level; preprocess_text_spacy reads it
# through the global name `nlp`.
nlp = spacy.load('en_core_web_sm')
def preprocess_text_spacy(text):
    """Lemmatize ``text`` with the module-level spaCy pipeline ``nlp``.

    Tokens tagged as pronouns (``pos_ == 'PRON'``) keep their surface form;
    every other token, punctuation included, is replaced by its lemma. The
    resulting pieces are joined with single spaces.
    """
    return ' '.join(
        tok.text if tok.pos_ == 'PRON' else tok.lemma_
        for tok in nlp(text)
    )

# Lemmatize each description, then normalize it for word-level embedding.
df['Description_processed'] = (
    df['Description']
    .apply(preprocess_text_spacy)
    # Remove every ASCII punctuation character with a translation table.
    # Building a regex class from the raw punctuation string is fragile: the
    # sequence `\]` inside `[...]` is read as an escaped bracket, so the
    # backslash itself would never be removed.
    .str.translate(str.maketrans('', '', string.punctuation))
    # Collapse the whitespace runs left behind by dropped punctuation tokens
    # and trim the ends so no leading/trailing blanks remain.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [None]:
# Do not embed a credential in the notebook: when no api_key argument is
# given, the OpenAI client reads the key from the OPENAI_API_KEY environment
# variable and raises if it is missing.
client = OpenAI()

def get_embeddings(sentence, model="text-embedding-3-small"):
    """Embed every whitespace-separated word of ``sentence`` individually.

    All words are sent in a single embeddings request (the endpoint accepts a
    list of inputs) instead of issuing one request per word.

    Args:
        sentence: Text that is split on whitespace into words.
        model: Embedding model name passed to the API.

    Returns:
        A list with one ``np.ndarray`` per word, in word order. The list is
        empty when ``sentence`` contains no words or when the request fails
        (the error is printed).
    """
    words = sentence.split()
    if not words:
        # Nothing to embed; avoid sending an empty input list to the API.
        return []

    try:
        response = client.embeddings.create(input=words, model=model)
    except Exception as e:
        print(f"Error processing sentence '{sentence}': {e}")
        return []

    # Each returned item carries the index of the input it belongs to; sort
    # on it so the vectors line up with `words`.
    ordered = sorted(response.data, key=lambda item: item.index)
    return [np.array(item.embedding) for item in ordered]

def average_embeddings(embeddings):
    """Return the element-wise mean of a sequence of vectors.

    Gives ``None`` when ``embeddings`` is empty (or otherwise falsy), so
    callers can tell that there was nothing to average.
    """
    if not embeddings:
        return None
    return np.mean(embeddings, axis=0)

# Per-word embedding vectors for every processed description.
df['Embeddings'] = df['Description_processed'].apply(get_embeddings)
# Reduce each description's word vectors to a single mean vector.
df['Sentence_Average_Embeddings'] = df['Embeddings'].apply(average_embeddings)


In [None]:
# The array-valued columns are written to CSV as their string form. By
# default NumPy abbreviates arrays with more than 1000 elements to
# "[a b c ... x y z]", which would silently discard most of each embedding,
# so lift the print threshold for the duration of the write.
with np.printoptions(threshold=10**9):
    df.to_csv('Embeddings.csv', index=False)

In [7]:
# Reload the saved table. No converters are passed, so the embedding columns
# come back as text rather than arrays and need parsing before numeric use.
df_embeddings = pd.read_csv('Embeddings.csv')
df_embeddings

Unnamed: 0,Image,Description,Class,Description_processed,Sentence_Average_Embeddings,Embeddings
0,n07718472_21669.JPEG,Sliced cucumber arranged in a layered pattern.,cucumber,slice cucumber arrange in a layered pattern,[-8.43302440e-03 2.75065098e-02 -4.47387695e-...,"[array([-0.00299072, -0.11816406, 0.09179688,..."
1,n07718472_1877.JPEG,Three fresh cucumbers on brown soil.,cucumber,three fresh cucumber on brown soil,[-2.41800938e-02 9.73307267e-02 1.02244057e-...,"[array([ 0.04931641, -0.10009766, 0.00665283,..."
2,n07716358_6548.JPEG,Fresh green beans and zucchini in basket.,zucchini,fresh green bean and zucchini in basket,[-4.38639335e-02 1.42496750e-01 3.52579765e-...,"[array([-0.04223633, 0.01806641, 0.22070312,..."
3,n07716358_2652.JPEG,Green zucchini growing among lush green leaves.,zucchini,green zucchini grow among lush green leave,[ 0.06480189 0.17897251 0.0271868 0.109357...,"[array([ 5.90820312e-02, 2.02148438e-01, 1.7..."
4,n07716358_16709.JPEG,Fresh zucchini and yellow squash in crate.,zucchini,fresh zucchini and yellow squash in crate,[-0.08504232 0.14347331 0.03249105 0.142832...,"[array([-0.04223633, 0.01806641, 0.22070312,..."
...,...,...,...,...,...,...
2355,n07716358_17583.JPEG,Zucchini plant with green fruits and flowers.,zucchini,zucchini plant with green fruit and flower,[ 0.00215658 0.15433757 0.07944743 0.181355...,"[array([-0.08447266, 0.33203125, -0.02038574,..."
2356,n07718472_757.JPEG,Child holding a large cucumber outdoors.,cucumber,child hold a large cucumber outdoors,[ 0.03974609 0.04775391 -0.02401428 0.141748...,"[array([ 1.65039062e-01, -6.39648438e-02, -1.7..."
2357,n07718472_8074.JPEG,"Fresh vegetables: lettuce, tomatoes, cucumbers...",cucumber,fresh vegetable lettuce tomato cucumber vibran...,[-0.10674613 0.12381417 0.08051409 0.198582...,"[array([-0.04223633, 0.01806641, 0.22070312,..."
2358,n07716358_105.JPEG,Three fresh zucchinis resting on soil.,zucchini,three fresh zucchinis rest on soil,[ 0.00836182 0.07828776 0.10930379 0.104410...,"[array([ 0.04931641, -0.10009766, 0.00665283,..."
