# Uso de modelos de embeddings de OpenAI

## Instalación y carga de librerías

In [9]:
# !pip install openai
# !pip install tiktoken

In [10]:
from openai import OpenAI
from getpass import getpass
import os
import pandas as pd

In [11]:
# OPENAI_API_KEY = getpass('Enter the secret value: ')
# openai.api_key = OPENAI_API_KEY

## Cargar dataset

In [12]:
df = pd.read_csv('../Platzi_codes/emydb_v11-notebook_5_embeddings_openai-generic-food.csv')

In [13]:
df.head(2)

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages


In [20]:
df


Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP,total_tokens,ada_embeddings
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs,2,"[0.01930958963930607, -0.012144645676016808, -..."
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages,4,"[0.005006470251828432, -0.009188915602862835, ..."
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs,3,"[-0.00378024997189641, -0.026939626783132553, ..."
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits,2,"[0.0194796584546566, 0.004785444121807814, -0...."
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables,6,"[0.015319375321269035, -0.029613390564918518, ..."
...,...,...,...,...,...,...
901,White cabbage,Brassica oleracea L. var. capitata L. f. alba DC.,Vegetables,Cabbages,2,"[-0.019671302288770676, 0.005599504336714745, ..."
902,Romaine lettuce,Lactuca sativa L. var. longifolia,Vegetables,Leaf vegetables,4,"[-0.006982708349823952, 0.01654770039021969, 0..."
903,dumplings,,,,3,"[-0.006745264865458012, -0.021715238690376282,..."
904,muesli,,,,3,"[-0.02160456031560898, 0.0009357034577988088, ..."


In [31]:
display(type(df["ada_embeddings"][0]), len(df["ada_embeddings"][0]), df["ada_embeddings"][0])

list

1536

[0.01930958963930607,
 -0.012144645676016808,
 -0.0031023810151964426,
 -0.00016730756033211946,
 -0.0194685161113739,
 -0.022964904084801674,
 0.007297382690012455,
 0.018104396760463715,
 0.013435925357043743,
 -0.02259407378733158,
 -0.0325005017220974,
 0.014157717116177082,
 0.019230127334594727,
 -0.0525517500936985,
 0.008105260320007801,
 0.02570638805627823,
 -0.04908185079693794,
 -0.004933348391205072,
 -0.003380502574145794,
 -0.030460944399237633,
 -0.016144299879670143,
 0.02850084938108921,
 0.0018293121829628944,
 -0.02064722217619419,
 0.025838827714323997,
 -0.007628479972481728,
 0.017428956925868988,
 -0.011204330250620842,
 0.027785679325461388,
 -0.009807099588215351,
 0.013084962032735348,
 -0.04004951938986778,
 0.007866869680583477,
 0.002400454832240939,
 0.008535685949027538,
 0.04314858838915825,
 -0.03774508461356163,
 0.019428784027695656,
 0.02181268483400345,
 0.016660811379551888,
 0.021799441426992416,
 0.0001641000562813133,
 0.02606397308409214,
 0.0

## Evaluar cantidad de tokens a procesar

In [14]:
# contabilizar la cantidad de tokens que se enviarían a OpenAI

import tiktoken

def num_tokens_from_string(text, encodig_name):
    encoding = tiktoken.get_encoding(encodig_name)
    num_tokens = len(encoding.encode(text))
    return num_tokens


In [15]:
df['total_tokens'] = df['FOOD NAME'].apply(lambda x : num_tokens_from_string(x,'cl100k_base') )

In [16]:
sum(df['total_tokens'])

2947

## Generando emebeddings

__Esta es la estructura respuesta del modelo de embedding__

```markdown
{
  "object": "list",
  "data": [
    {
      "object": "embedding",
      "index": 0,
      "embedding": [
        -0.006929283495992422,
        -0.005336422007530928,
        -4.547132266452536e-05,
        -0.024047505110502243
      ],
    }
  ],
  "model": "text-embedding-3-small",
  "usage": {
    "prompt_tokens": 5,
    "total_tokens": 5
  }
}
```

Estas lineas fue la versión extendida

```python
client = OpenAI()
def get_embedding(in_text, in_model="text-embedding-3-small"):
    temp_text = in_text.replace("\n", " ") # no tengo claro por qué esta linea
    response = client.embeddings.create(input=[temp_text], model=in_model)
    embedding_response = response.data[0].embedding
    return embedding_response
```

En el siguiente script hagola reducción

In [17]:
client = OpenAI()
def get_embedding(in_text, in_model="text-embedding-3-small"):
    in_text = in_text.replace("\n", " ")
    return client.embeddings.create(input=[in_text], model=in_model).data[0].embedding

In [18]:
df['ada_embeddings'] = df['FOOD NAME'].apply(lambda x :get_embedding(x,in_model="text-embedding-3-small"))

In [19]:
df.head(2)

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP,total_tokens,ada_embeddings
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs,2,"[0.01930958963930607, -0.012144645676016808, -..."
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages,4,"[0.005006470251828432, -0.009188915602862835, ..."


In [43]:
'''
INFORMATIVO
'''
# display(type(df['ada_embeddings'][0]))
# display(df['ada_embeddings'][0])
# display(len(df['ada_embeddings'][0]))

## Creando datasets para visualizar

Como la data se va a llevar a ~~XXXXembedding~~ se debe formatear

In [None]:
'''
INFORMATIVO
'''

# df['ada_embeddings'].head(5)

In [None]:
'''
INFORMATIVO
'''
# type(list(df['ada_embeddings']))

In [None]:
'''
INFORMATIVO
'''

#list(df['ada_embeddings'])

In [None]:
'''
INFORMATIVO
'''

# df['ada_embeddings'].apply(type).value_counts()

In [47]:
# Paso 1
df_embeddings = pd.DataFrame(list(df['ada_embeddings']))

In [None]:
'''
INFORMATIVO
'''

df_embeddings.head(5)

In [54]:
# Paso 2: Guardar los embeddings (df['ada_embeddings']) en formato tsv
df_embeddings.to_csv('embedding_food.tsv',sep='\t',index=False, header=False)

In [55]:
# Creación de un dataframe para la metadata y labels
df[['FOOD NAME','GROUP','SUB GROUP']].to_csv('labels_food.tsv',sep='\t',index=False, header=True)

\# Ir a 
[Embedding Projector](https://projector.tensorflow.org/)

Allí cargar los embedding emydb_v11-notebook_5_embeddings_openai_embedding_food y los lables emydb_v11-notebook_5_embeddings_openai_labels_food