In [11]:
import openai
from dotenv import dotenv_values
import pandas as pd
import numpy as np
from tenacity import retry, wait_random_exponential, stop_after_attempt
import pickle
from nomic import atlas

In [2]:
# Read key/value settings from the local ".env" file and hand the value stored
# under "API" to the openai client, so the key is never hardcoded in the notebook.
config = dotenv_values(".env")
openai.api_key = config["API"]

In [3]:
# Load the Wikipedia movie plots dataset, downloaded from
# https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots
# The CSV path is relative, so it is resolved against the current working directory.
dataset_path = "./wiki_movie_plots.csv"
source_df = pd.read_csv(dataset_path)

In [4]:
# Keep only rows whose origin is "American", order them newest release year
# first, and cap the selection at the first 5000 rows of that ordering.
movies = (
    source_df
    .loc[lambda frame: frame["Origin/Ethnicity"] == "American"]
    .sort_values("Release Year", ascending=False)
    .head(5000)
)

In [5]:
# Embedding helper: one OpenAI embeddings request, wrapped in tenacity retries
@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=10))
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding OpenAI produces for ``text``.

    Newlines in ``text`` are replaced with spaces before the request is sent.
    The call is retried by tenacity: at most 6 attempts, waiting a randomized,
    exponentially growing interval (configured with min=1, max=10) between them.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The "embedding" entry of the first item in the response's "data" list.
    """
    flattened = text.replace("\n", " ")
    response = openai.Embedding.create(input=flattened, model=model)
    return response["data"][0]["embedding"]

In [6]:
# Set up the on-disk embedding cache so repeated runs do not pay for (or wait on)
# API calls that were already made.
embedding_cache_path = "movie_embeddings_cache.pkl"
try:
    # NOTE(review): unpickling can execute arbitrary code -- only load a cache
    # file that this notebook itself produced.
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    # First run: start with an empty cache and create the file.
    # The file is written only in this branch. Re-dumping a cache that was just
    # loaded would rewrite the whole file for no gain, and an interrupted write
    # could truncate a previously valid cache.
    embedding_cache = {}
    with open(embedding_cache_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)

In [54]:
# Display the in-memory embedding cache (an empty dict when no cache file existed yet).
embedding_cache

{}

In [7]:
# Retrieve an embedding from the cache if present; otherwise request it via the
# API, store it, and persist the updated cache to disk.

def embedding_from_cache_or_API(
    string, 
    model="text-embedding-ada-002",
    embedding_cache=embedding_cache
):
    """Return the embedding for ``string``, calling the API only on a cache miss.

    Cache entries are keyed by the ``(string, model)`` tuple. On a miss the
    embedding is fetched with ``get_embedding``, stored in ``embedding_cache``
    and the whole cache is pickled to ``embedding_cache_path``.

    Args:
        string: Text to embed.
        model: Embedding model name; part of the cache key.
        embedding_cache: Dict used as the cache. The default is bound when this
            function is defined, so it is the notebook-level ``embedding_cache``
            object that existed at that moment; re-run this cell after
            re-creating the cache.

    Returns:
        The cached or freshly fetched embedding for ``(string, model)``.
    """
    import os  # local import keeps this cell runnable without editing the import cell

    cache_key = (string, model)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = get_embedding(string, model)
        print("I have just got embeddings from openai for you!")
        # Dump to a sibling temp file and swap it in with os.replace. If the
        # kernel is interrupted mid-write, the previous cache file stays intact
        # instead of being left as a truncated, unreadable pickle.
        temp_path = f"{embedding_cache_path}.tmp"
        with open(temp_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
        os.replace(temp_path, embedding_cache_path)
    return embedding_cache[cache_key]

In [8]:
# Embed every movie plot, one call per plot, going through the cache-aware helper
plot_embeddings = list(map(embedding_from_cache_or_API, movies["Plot"].values))

In [None]:
# Sanity-check the first embedding; slice it so the cell shows a short preview
# instead of printing the entire vector.
plot_embeddings[0][:10]

In [None]:
# Preview the Title/Genre records in list-of-dicts form. Only the first few rows
# are shown, so the cell output does not contain one entry per movie.
movies[["Title", "Genre"]].head().to_dict(orient="records")

In [None]:
# Atlas: build a map from the plot embeddings, attaching each movie's
# Title and Genre as the per-point data records.

atlas_map = atlas.map_embeddings(
    embeddings=np.asarray(plot_embeddings),
    data=movies.loc[:, ["Title", "Genre"]].to_dict(orient="records"),
)