In [None]:
import openai
from dotenv import dotenv_values

# Read the key/value pairs stored in the local ".env" file so the API key never
# has to be hardcoded in the notebook.
config = dotenv_values(".env")
# Hand the key to the openai module; the "OPENAI_API_KEY" entry must be present
# in the loaded config for this lookup to succeed.
openai.api_key = config["OPENAI_API_KEY"]

In [None]:
import pandas as pd
import numpy as np
from tenacity import retry, wait_random_exponential, stop_after_attempt
import pickle
import tiktoken

## Load The Movie Data

In [None]:
# Path is relative to the notebook's working directory.
dataset_path = "./movie_plots.csv"
# Load the full movie table; later cells read its "Origin/Ethnicity",
# "Release Year", "Plot", "Title" and "Genre" columns.
df = pd.read_csv(dataset_path)

In [None]:
# Keep only the 5000 most recently released American movies: fewer plots to
# embed means a smaller API bill.
movies = (
    df.loc[df["Origin/Ethnicity"].eq("American")]
    .sort_values(by="Release Year", ascending=False)
    .iloc[:5000]
)

In [None]:
# Extract the movie plots into an array-like, in the same row order as `movies`
movie_plots = movies["Plot"].values

## Generating The Embeddings

In [None]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):
    """Request the embedding vector for ``text`` from the OpenAI embeddings API.

    Newlines in ``text`` are flattened to spaces before the request, since they
    can negatively affect performance. The call is wrapped in a tenacity retry
    policy configured with ``wait_random_exponential(min=1, max=20)`` and
    ``stop_after_attempt(6)``.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    single_line_text = " ".join(text.split("\n"))
    response = openai.embeddings.create(input=single_line_text, model=model)
    return response.data[0].embedding

In [None]:
# Tokenizer for the embedding model; used below to count tokens so the cost can
# be estimated before any embedding request is made.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [None]:
# Total number of tokens across every plot (input to the cost estimate).
total_tokens = sum(len(enc.encode(plot)) for plot in movie_plots)

In [None]:
# A bare `total_tokens` expression is only rendered when it is the last line of
# a cell, so the token count is printed explicitly next to the cost estimate.
# The estimate assumes a price of $0.0004 per 1,000 tokens -- check the current
# pricing for the model before relying on it.
cost = total_tokens * (0.0004 / 1000)
print(f"Total tokens: {total_tokens}")
print(f"Estimated cost ${cost:.2f}")

In [None]:
# establish a cache of embeddings to avoid recomputing
# cache is a dict of tuples (text, model) -> embedding, saved as a pickle file

# set path to embedding cache (relative to the notebook's working directory)
embedding_cache_path = "movie_embeddings_cache2.pkl"

# load the cache if it exists, otherwise start from an empty dict.
# NOTE(review): unpickling can execute arbitrary code -- only load a cache file
# that this notebook produced itself.
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
# write the cache straight back so the file exists on disk even when the cache
# was just created empty
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)


# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embedding_from_string(
    string, model="text-embedding-ada-002", embedding_cache=embedding_cache
):
    """Return embedding of given string, using a cache to avoid recomputing.

    On a cache miss the embedding is requested through ``get_embedding``, stored
    in ``embedding_cache`` under the key ``(string, model)``, and the whole cache
    is persisted to ``embedding_cache_path``.

    Args:
        string: Text to embed.
        model: Embedding model name; part of the cache key.
        embedding_cache: Dict mapping ``(string, model)`` to an embedding. It
            defaults to the notebook-level cache and is mutated in place.

    Returns:
        The cached (or freshly fetched) embedding for ``(string, model)``.
    """
    # Imported locally so this cell stays self-contained.
    import os

    cache_key = (string, model)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = get_embedding(string, model)
        print(f"GOT EMBEDDING FROM OPENAI FOR {string[:20]}")
        # Dump to a sibling temp file and swap it in afterwards. Writing over the
        # real cache directly would leave a truncated, unreadable pickle if the
        # kernel were interrupted mid-dump, throwing away every embedding that
        # has already been paid for.
        # NOTE: the full cache is re-serialized on every miss, so persisting
        # gets slower as the cache grows.
        temp_cache_path = f"{embedding_cache_path}.tmp"
        with open(temp_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
        os.replace(temp_cache_path, embedding_cache_path)
    return embedding_cache[cache_key]

In [None]:
# This comprehension actually generates the embeddings: one per plot, in the
# same order as movie_plots. Plots already present in the cache are returned
# from it instead of being requested again.
plot_embeddings = [
    embedding_from_string(plot, model="text-embedding-ada-002") for plot in movie_plots
]

## Plot The Embeddings Using Atlas

In [None]:
# One {"Title": ..., "Genre": ...} dict per movie, in the same row order as the
# plots that were embedded, so each record lines up with its embedding.
data = movies[["Title", "Genre"]].to_dict("records")

In [None]:
from nomic import atlas

In [None]:
# Build the Atlas map from the embeddings (stacked into a NumPy array) and the
# matching per-movie metadata records.
# NOTE(review): presumably this needs Nomic credentials to be configured -- confirm
# against the nomic documentation.
project = atlas.map_embeddings(embeddings=np.array(plot_embeddings), data=data)

## Recommending Movies By Plot

In [None]:
from typing import List
from scipy import spatial


def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric="cosine",
) -> List[float]:
    """Return the distances between a query embedding and a list of embeddings.

    Args:
        query_embedding: The vector every other embedding is compared against.
        embeddings: The vectors to measure; the result preserves their order.
        distance_metric: One of ``"cosine"``, ``"L1"``, ``"L2"`` or ``"Linf"``.

    Returns:
        A flat list with one distance per entry of ``embeddings``.

    Raises:
        KeyError: If ``distance_metric`` is not one of the supported names.
    """
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    # Resolve the metric once, up front: this avoids a dict lookup per embedding
    # and makes an unknown metric fail even when `embeddings` is empty.
    distance_fn = distance_metrics[distance_metric]
    return [distance_fn(query_embedding, embedding) for embedding in embeddings]


def indices_of_nearest_neighbors_from_distances(distances) -> np.ndarray:
    """Rank positions by distance.

    The result lists the indices of ``distances`` ordered from the smallest
    distance to the largest, so the first entry is the nearest neighbor.
    """
    ascending_order = np.argsort(distances)
    return ascending_order

In [None]:
def print_recommendations_from_strings(
    strings,
    index_of_source_string,
    k_nearest_neighbors=3,
    model="text-embedding-ada-002",
):
    """Print the strings whose embeddings are closest to a chosen source string.

    Every string is embedded (through the embedding cache), the distance from
    the source string's embedding to each of them is computed with the default
    metric of ``distances_from_embeddings``, and up to ``k_nearest_neighbors``
    of the closest strings are printed with their distances.

    Args:
        strings: Indexable collection of texts to search.
        index_of_source_string: Position in ``strings`` of the query text.
        k_nearest_neighbors: Maximum number of matches to print.
        model: Embedding model used for every string.

    Returns:
        None. The matches are written to standard output.
    """
    # Get all of the embeddings. The requested model is forwarded here; it used
    # to be dropped, so a non-default `model` silently had no effect.
    embeddings = [embedding_from_string(string, model=model) for string in strings]
    # get embedding for our specific query string
    query_embedding = embeddings[index_of_source_string]
    # get distances between our embedding and all other embeddings
    distances = distances_from_embeddings(query_embedding, embeddings)
    # get indices of the nearest neighbors
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(
        distances
    )

    query_string = strings[index_of_source_string]
    match_count = 0
    for i in indices_of_nearest_neighbors:
        # Skip the query itself (and any exact duplicates of its text).
        if query_string == strings[i]:
            continue
        if match_count >= k_nearest_neighbors:
            break
        match_count += 1
        print(f"Found {match_count} closest match: ")
        print(f"Distance of: {distances[i]} ")
        print(strings[i])

In [None]:
# Print the closest matches (default k of 3) for the plot at position 2 of movie_plots
print_recommendations_from_strings(movie_plots, 2)