# Uso de modelos de embeddings de OpenAI

## Instalación y carga de librerías

In [46]:
!pip install openai
!pip install tiktoken



In [2]:
from openai import OpenAI
from getpass import getpass
import os
import pandas as pd

In [63]:
client = OpenAI(api_key=OPENAI_API_KEY)

## Cargar dataset

In [3]:
df = pd.read_csv('../Corpus/generic-food.csv')

In [50]:
df.head()

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables


## Evaluar cantidad de tokens a procesar

In [51]:
import tiktoken



In [52]:
def calcular_tokens(text,embedding_model):
    modelo = tiktoken.get_encoding(embedding_model)
    token = len(modelo.encode(text))
    return token

In [53]:
df['tokens totales'] = df['FOOD NAME'].apply(lambda x: calcular_tokens(x, 'cl100k_base'))

In [54]:
df

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP,tokens totales
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs,2
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages,4
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs,3
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits,2
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables,6
...,...,...,...,...,...
901,White cabbage,Brassica oleracea L. var. capitata L. f. alba DC.,Vegetables,Cabbages,2
902,Romaine lettuce,Lactuca sativa L. var. longifolia,Vegetables,Leaf vegetables,4
903,dumplings,,,,3
904,muesli,,,,3


In [55]:
sum(df['tokens totales'])

2951

## Generando emebeddings

In [64]:
def get_embedding(text, model = "text-embedding-ada-002"):
  text = text.replace('\n', '')
  response = client.embeddings.create(
      input=text,
      model=model
  )
  return response.data[0].embedding



In [66]:
df["ada_embedding"] = df["FOOD NAME"].apply(lambda x: get_embedding(x))



In [67]:
df

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP,tokens totales,ada_embedding
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs,2,"[0.00622481619939208, -0.010139460675418377, 0..."
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages,4,"[0.005467566661536694, -0.004983342252671719, ..."
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs,3,"[-0.0044629997573792934, 0.020091881975531578,..."
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits,2,"[-0.004675016738474369, -0.010036393068730831,..."
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables,6,"[0.013209687545895576, -0.019948789849877357, ..."
...,...,...,...,...,...,...
901,White cabbage,Brassica oleracea L. var. capitata L. f. alba DC.,Vegetables,Cabbages,2,"[0.010564504191279411, -0.01918477937579155, 0..."
902,Romaine lettuce,Lactuca sativa L. var. longifolia,Vegetables,Leaf vegetables,4,"[-0.0033427441958338022, -0.012237627059221268..."
903,dumplings,,,,3,"[0.00377720152027905, -0.013168713077902794, 0..."
904,muesli,,,,3,"[-0.01571560464799404, -0.015782881528139114, ..."


In [68]:
# exportar el dataframe
df.to_csv('generic-food-embeddings.csv', index=False)

## Creando datasets para visualizar

In [10]:
df = pd.read_csv('generic-food-embeddings_ada.csv')
df.head()

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP,tokens totales,ada_embedding
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs,2,"[0.00622481619939208, -0.010139460675418377, 0..."
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages,4,"[0.005467566661536694, -0.004983342252671719, ..."
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs,3,"[-0.0044629997573792934, 0.020091881975531578,..."
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits,2,"[-0.004675016738474369, -0.010036393068730831,..."
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables,6,"[0.013209687545895576, -0.019948789849877357, ..."


In [9]:
df_embeddgins = pd.DataFrame(list(df['ada_embedding']))
df_embeddgins.head()

Unnamed: 0,0
0,"[0.00622481619939208, -0.010139460675418377, 0..."
1,"[0.005467566661536694, -0.004983342252671719, ..."
2,"[-0.0044629997573792934, 0.020091881975531578,..."
3,"[-0.004675016738474369, -0.010036393068730831,..."
4,"[0.013209687545895576, -0.019948789849877357, ..."


In [11]:
df_embeddgins.to_csv('embeddings_food.tsv',sep='\t',index=False,header=False)

In [None]:
df[['FOOD NAME', 'GROUP', 'SUB GROUP']].to_csv('metadata_food.tsv',sep='\t',index=False,header=False)