# Embeddings

In [28]:
import os
import pickle

import numpy as np
import openai
import pandas as pd
import tiktoken
from dotenv import dotenv_values
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [2]:
# Read settings from the local .env file so the API key is not written into the notebook.
config = dotenv_values(".env")

# Hand the key to the openai module; a missing OPENAI_API_KEY entry would
# presumably surface as a KeyError on this lookup — verify against dotenv's return type.
openai.api_key = config["OPENAI_API_KEY"]

In [3]:
# Request an embedding for a single sample phrase.
res = openai.Embedding.create(model="text-embedding-ada-002", input="candy canes")

In [6]:
# Show the embedding stored on the first entry of the response's "data" list.
res["data"][0]["embedding"]

[-0.01746266707777977,
 -0.023939916864037514,
 0.004187542479485273,
 -0.03715350851416588,
 -0.025831274688243866,
 0.0006748485029675066,
 -0.018175164237618446,
 -0.007824518717825413,
 -0.00906167272478342,
 -0.029847169294953346,
 0.020662428811192513,
 0.018978342413902283,
 -0.03808623179793358,
 0.004411007277667522,
 -0.0008137045660987496,
 0.005767991300672293,
 0.03904486447572708,
 0.0031641367822885513,
 -0.008187244646251202,
 -0.019936976954340935,
 -0.01642630621790886,
 -0.005240095313638449,
 -0.005246572662144899,
 -0.016387443989515305,
 0.007662586867809296,
 -0.01607653498649597,
 0.021154699847102165,
 -0.006898271385580301,
 0.003568965010344982,
 0.006775203626602888,
 0.014975402504205704,
 -0.013291317969560623,
 -0.03782714158296585,
 -0.02530013956129551,
 -0.012021776288747787,
 -0.01764402911067009,
 -0.0070213391445577145,
 -0.01652994193136692,
 0.013835406862199306,
 0.0026459568180143833,
 -0.0011424249969422817,
 -0.00319814239628613,
 -0.006386568

## Embeddings - Movie Plotting w/ Atlas

In [5]:
# Location of the movie plots CSV, relative to the notebook's working directory.
dataset_path = "./wiki_movie_plots_deduped.csv"

# Read the CSV into a DataFrame.
df = pd.read_csv(dataset_path)

In [40]:
# Keep American films only, newest release year first, capped at 500 rows.
movies = (
    df.loc[df["Origin/Ethnicity"] == "American"]
    .sort_values("Release Year", ascending=False)
    .head(500)
)

In [55]:
# Extract the values of the "Plot" column for the selected movies.
movie_plots = movies["Plot"].values

In [56]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The "embedding" value of the first item in the response's "data" list.

    The ``retry`` decorator re-invokes the call on failure, waiting according to
    ``wait_random_exponential(min=1, max=20)`` and stopping after 6 attempts.
    """
    # Flatten newlines to spaces, since they can negatively impact performance.
    single_line = " ".join(text.split("\n"))

    response = openai.Embedding.create(input=[single_line], model=model)
    return response["data"][0]["embedding"]
    

In [57]:
# Look up the tokenizer tiktoken associates with the embedding model; it is used below to count tokens.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [58]:
# Add up the token count of every plot to size the embedding job.
total_tokens = sum(len(enc.encode(plot_text)) for plot_text in movie_plots)

total_tokens

393990

In [59]:
# NOTE(review): 0.0004 / 1000 is presumably the USD price per token
# (i.e. $0.0004 per 1,000 tokens) for the embedding model — confirm against current pricing.
cost = total_tokens * (0.0004 / 1000)

# Report the estimate rounded to two decimal places.
print(f"Estimated Cost: ${cost:.2f}")

Estimated Cost: $0.16


In [60]:
# Try the helper on a short string before embedding the full dataset.
get_embedding("Hello world!")

[0.006552584003657103,
 0.0037474751006811857,
 -0.01162879541516304,
 -0.02676319144666195,
 -0.012401535175740719,
 -0.001383707276545465,
 -0.013369030319154263,
 0.009360834024846554,
 -0.006332698743790388,
 -0.02950233407318592,
 0.02392352931201458,
 0.0030077178962528706,
 -0.02325759083032608,
 -0.009191208519041538,
 0.006715927738696337,
 0.001144189271144569,
 0.026059558615088463,
 -0.018319591879844666,
 0.008908499032258987,
 0.009656108915805817,
 -0.013004649430513382,
 -0.0010413143318146467,
 0.00727506447583437,
 0.008789132349193096,
 -0.012677961960434914,
 0.0036940742284059525,
 0.005415462423115969,
 -0.01722644828259945,
 0.036211978644132614,
 -0.02671293169260025,
 0.012621420435607433,
 -0.008569247089326382,
 -0.007645728532224894,
 -0.012608855962753296,
 0.007092874031513929,
 -0.014122923836112022,
 0.004887737799435854,
 -0.01365802250802517,
 0.018156249076128006,
 -0.014399350620806217,
 0.008324231952428818,
 0.005984023213386536,
 0.005412321537733

In [67]:
# Build an on-disk cache of embeddings so text that was already embedded is not
# sent to the API again.
# The cache is a dict mapping (text, model) tuples to embeddings, saved as a pickle file.

# Location of the pickled embedding cache
embedding_cache_path = "movie_embeddings_demo.pkl"

# Start from the cache on disk when the file exists, otherwise from an empty dict.
# NOTE: unpickling can execute arbitrary code; only load a cache file you created yourself.
try:
    with open(embedding_cache_path, "rb") as embedding_cache_file:
        embedding_cache = pickle.load(embedding_cache_file)
except FileNotFoundError:
    embedding_cache = {}

# Write the cache straight back so the file exists from here on
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)


# define a function to retrieve embeddings from the cache if present
# if not present, request via the API
def embedding_from_string(
    string,
    model="text-embedding-ada-002",
    embedding_cache=embedding_cache,
):
    """Return the embedding of ``string``, consulting the cache before the API.

    Args:
        string: Text to embed.
        model: Embedding model name; part of the cache key, so the same text
            embedded with different models is cached separately.
        embedding_cache: Dict mapping ``(string, model)`` to an embedding.
            Defaults to the module-level cache object that existed when this
            function was defined; it is mutated in place on a cache miss.

    Returns:
        The cached or freshly fetched embedding for ``(string, model)``.

    On a cache miss the embedding is fetched with ``get_embedding``, a message
    is printed, and the whole cache is re-pickled to ``embedding_cache_path``.
    """
    cache_key = (string, model)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = get_embedding(string, model)
        print(f"GOT EMBEDDING FROM OPENAI FOR {string[:20]}")
        # Write to a sibling temp file and swap it into place, so an interrupted
        # dump cannot leave a truncated pickle that breaks the next cache load.
        tmp_cache_path = f"{embedding_cache_path}.tmp"
        with open(tmp_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
        os.replace(tmp_cache_path, embedding_cache_path)
    return embedding_cache[cache_key]
            

In [68]:
# Call the cached helper with a string; on a cache miss it prints a message before returning the embedding.
embedding_from_string("Chicken is friend!!!")

GOT EMBEDDING FROM OPENAI FOR Chicken is friend!!!


[0.0003346744051668793,
 -0.016962656751275063,
 -0.012188734486699104,
 -0.026307353749871254,
 0.0056690312922000885,
 -0.014943896792829037,
 -0.013115586712956429,
 -0.0304972305893898,
 -0.008830484934151173,
 -0.03367137908935547,
 0.0065768384374678135,
 0.005484930705279112,
 -2.9658462153747678e-05,
 0.0025456680450588465,
 -0.018486248329281807,
 0.01352187804877758,
 0.038343727588653564,
 -0.003199542872607708,
 0.02265073172748089,
 -0.02681521698832512,
 -0.03643923997879028,
 -0.001972733996808529,
 -0.008709866553544998,
 -0.02287927083671093,
 -0.008087733760476112,
 0.03336666151881218,
 0.012975923717021942,
 -0.031360600143671036,
 0.014423335902392864,
 -0.00935104489326477,
 0.030522624030709267,
 0.0037169293500483036,
 0.001836245646700263,
 0.010849243961274624,
 -0.0029995713848620653,
 0.007287847809493542,
 0.0038311986718326807,
 -0.002044152468442917,
 0.015642208978533745,
 0.002377438126131892,
 0.03674395754933357,
 -0.013674236834049225,
 -0.0024012443

In [69]:
# Embed every plot; plots already in the cache are returned without an API call.
plot_embeddings = [
    embedding_from_string(plot_text, model="text-embedding-ada-002")
    for plot_text in movie_plots
]

GOT EMBEDDING FROM OPENAI FOR In 1954 London, reno
GOT EMBEDDING FROM OPENAI FOR Eighteen-year-old Ma
GOT EMBEDDING FROM OPENAI FOR In a prologue, busin
GOT EMBEDDING FROM OPENAI FOR Anne (Diane Lane) is
GOT EMBEDDING FROM OPENAI FOR During the Iraq War,
GOT EMBEDDING FROM OPENAI FOR A contemporary tale 
GOT EMBEDDING FROM OPENAI FOR Recently fired from 
GOT EMBEDDING FROM OPENAI FOR The movie opens with
GOT EMBEDDING FROM OPENAI FOR Mary (Debra Winger) 
GOT EMBEDDING FROM OPENAI FOR In 2014, Peter Quill
GOT EMBEDDING FROM OPENAI FOR A young street magic
GOT EMBEDDING FROM OPENAI FOR Having made a career
GOT EMBEDDING FROM OPENAI FOR When her car breaks 
GOT EMBEDDING FROM OPENAI FOR Mikael (Oscar Isaac)
GOT EMBEDDING FROM OPENAI FOR Julia Banks is being
GOT EMBEDDING FROM OPENAI FOR This documentary fol
GOT EMBEDDING FROM OPENAI FOR Best friends Mindy (
GOT EMBEDDING FROM OPENAI FOR Dash (voiced by Schw
GOT EMBEDDING FROM OPENAI FOR Ireland, 1905: Percy
GOT EMBEDDING FROM OPENAI FOR T

In [71]:
# Check how many embeddings were produced.
len(plot_embeddings)

500