In [42]:
import openai
from dotenv import dotenv_values
# Read key/value pairs from the local .env file so the API key is not hardcoded in the notebook.
config = dotenv_values(".env")
# Hand the key to the openai client; this lookup fails if OPENAI_API_KEY is missing from .env.
openai.api_key = config["OPENAI_API_KEY"]

In [43]:
import pandas as pd
import numpy as np
from tenacity import retry, wait_random_exponential, stop_after_attempt
import pickle
import tiktoken

In [44]:
# Location of the movie-plots CSV, relative to the notebook's working directory.
dataset_path = "./movie_plots.csv"
# Load the full CSV into a DataFrame; later cells filter it down.
df = pd.read_csv(dataset_path)

In [45]:
# Narrow the data set to the 200 most recent American movies (to save money):
# keep rows whose origin is "American", order them newest first, take the top 200.
movies = (
    df[df["Origin/Ethnicity"] == "American"]
    .sort_values("Release Year", ascending=False)
    .head(200)
)

In [46]:
# Extract the movie plots as an array (one entry per row of `movies`, in the same order)
movie_plots = movies["Plot"].values

In [47]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` from the OpenAI Embedding endpoint.

    The call is wrapped in a tenacity ``retry``: it stops after 6 attempts and
    waits with ``wait_random_exponential(min=1, max=20)`` between attempts.

    Args:
        text: The string to embed. Newlines are replaced by spaces before sending.
        model: Name of the embedding model to use.

    Returns:
        The ``"embedding"`` field of the first item in the response's ``"data"``.
    """
    # Newlines can negatively affect performance, so flatten them to spaces.
    flattened_text = text.replace("\n", " ")

    response = openai.Embedding.create(input=flattened_text, model=model)
    first_result = response["data"][0]
    return first_result["embedding"]

In [48]:
# Tokenizer for the embedding model; used to count tokens for the cost estimate.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [49]:
# Total number of tokens across all plots, counted with the model's tokenizer.
total_tokens = sum(len(enc.encode(plot)) for plot in movie_plots)

In [50]:
# Rough price estimate. The rate applied here is $0.0004 per 1,000 tokens
# (presumably the text-embedding-ada-002 price when this was written -- verify before relying on it).
price_per_token = .0004 / 1000
cost = total_tokens * price_per_token
print(f"Estimated cost ${cost:.2f}")

Estimated cost $0.06


In [51]:
# establish a cache of embeddings to avoid recomputing
# cache is a dict mapping (text, model) tuples -> embedding, saved as a pickle file

# set path to embedding cache
embedding_cache_path = "movie_embeddings_cache2.pkl"

# load the cache if it exists (otherwise start from an empty dict), then write it
# back to disk so the cache file exists after this cell has run
# NOTE(review): unpickling can execute arbitrary code -- only load cache files you created yourself
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

# Cached embedding lookup: serve from the in-memory cache when possible, otherwise call the API.
def embedding_from_string(
    string,
    model="text-embedding-ada-002",
    embedding_cache=embedding_cache
):
    """Return the embedding for ``string``, consulting the cache first.

    Args:
        string: Text to embed.
        model: Embedding model name; part of the cache key.
        embedding_cache: Mapping from ``(string, model)`` to embedding. Defaults to
            the notebook-level cache object bound when this function was defined.

    Returns:
        The cached or freshly requested embedding.

    On a cache miss the embedding is fetched with ``get_embedding``, stored in
    ``embedding_cache``, a progress line is printed, and the whole cache is
    re-pickled to ``embedding_cache_path``.
    """
    cache_key = (string, model)

    # Cache hit: nothing to fetch and nothing to persist.
    if cache_key in embedding_cache:
        return embedding_cache[cache_key]

    embedding_cache[cache_key] = get_embedding(string, model)
    print(f"GOT EMBEDDING FROM OPENAI FOR {string[:20]}")

    # Persist straight away so work already paid for survives an interrupted run.
    with open(embedding_cache_path, "wb") as cache_file:
        pickle.dump(embedding_cache, cache_file)

    return embedding_cache[cache_key]

In [52]:
# This line actually generates the embeddings (one per plot, in the same order as movie_plots);
# plots already present in the cache are not requested again.
plot_embeddings = [embedding_from_string(plot, model="text-embedding-ada-002") for plot in movie_plots]

GOT EMBEDDING FROM OPENAI FOR In 2029, no mutants 
GOT EMBEDDING FROM OPENAI FOR Nursing assistant Ru
GOT EMBEDDING FROM OPENAI FOR In the near future, 
GOT EMBEDDING FROM OPENAI FOR Casey, an American l
GOT EMBEDDING FROM OPENAI FOR Bodi (Luke Wilson) i
GOT EMBEDDING FROM OPENAI FOR African-American pho
GOT EMBEDDING FROM OPENAI FOR In the year 1347 in 
GOT EMBEDDING FROM OPENAI FOR Mordred, an iron-fis
GOT EMBEDDING FROM OPENAI FOR The movie starts wit
GOT EMBEDDING FROM OPENAI FOR In "9 months B.C.", 
GOT EMBEDDING FROM OPENAI FOR Derek Cho, the film'
GOT EMBEDDING FROM OPENAI FOR In the town of Ebbin
GOT EMBEDDING FROM OPENAI FOR The story centers ar
GOT EMBEDDING FROM OPENAI FOR After finally becomi
GOT EMBEDDING FROM OPENAI FOR In 1934, famous Belg
GOT EMBEDDING FROM OPENAI FOR Christine "Lady Bird
GOT EMBEDDING FROM OPENAI FOR In 2003, Larry "Doc"
GOT EMBEDDING FROM OPENAI FOR Amy (Mila Kunis) is 
GOT EMBEDDING FROM OPENAI FOR Two years after the 
GOT EMBEDDING FROM OPENAI FOR T

In [55]:
# Per-movie metadata (title and genre) as a list of one dict per row.
data = movies[["Title", "Genre"]].to_dict(orient="records")

In [56]:
from nomic import atlas

In [57]:
# Stack the per-plot embeddings into a single array and send them, together
# with the title/genre records, to Atlas to build the map.
embedding_matrix = np.array(plot_embeddings)
project = atlas.map_embeddings(embeddings=embedding_matrix, data=data)

[32m2023-07-01 16:31:44.006[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m1100[0m - [1mCreating project `lyrical-silly` in organization `markova.maryna`[0m
[32m2023-07-01 16:31:45.817[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m100[0m - [1mUploading embeddings to Atlas.[0m
1it [00:02,  2.17s/it]
[32m2023-07-01 16:31:48.037[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1722[0m - [1mUpload succeeded.[0m
[32m2023-07-01 16:31:48.042[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m119[0m - [1mEmbedding upload succeeded.[0m
[32m2023-07-01 16:31:49.537[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1427[0m - [1mCreated map `lyrical-silly` in project `lyrical-silly`: https://atlas.nomic.ai/map/96afad09-817d-4b00-b6b8-a5787d778246/05a622f9-4461-4e4f-8381-8470e726c2e7[0m
[32m2023-07-01 16:31:49.538[0m | [1mINFO    [0m | [36mnomic

In [59]:
pip install matplotlib


Defaulting to user installation because normal site-packages is not writeable
Collecting matplotlib
  Downloading matplotlib-3.7.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.4/300.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.40.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0

In [61]:
pip install plotly


Defaulting to user installation because normal site-packages is not writeable
Collecting plotly
  Downloading plotly-5.15.0-py2.py3-none-any.whl (15.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: plotly
Successfully installed plotly-5.15.0
Note: you may need to restart the kernel to use updated packages.


In [63]:
pip install scipy

Defaulting to user installation because normal site-packages is not writeable
Collecting scipy
  Downloading scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.5/34.5 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: scipy
Successfully installed scipy-1.10.1
Note: you may need to restart the kernel to use updated packages.


In [66]:
pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting joblib>=1.1.1 (from scikit-learn)
  Downloading joblib-1.3.1-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.3.1 scikit-learn-1.3.0 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.


In [69]:
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances

In [76]:
def print_recommendations_from_strings(
    strings,
    index_of_source_string,
    k_nearest_neighbors=3,
    model="text-embedding-ada-002"
):
    """Print the strings whose embeddings are closest to one source string.

    Every entry of ``strings`` is embedded with ``embedding_from_string`` using
    ``model``. Distances from the source embedding to all embeddings are computed
    with ``distances_from_embeddings`` and visited in the order returned by
    ``indices_of_nearest_neighbors_from_distances``.

    Args:
        strings: Indexable collection of texts to compare.
        index_of_source_string: Position in ``strings`` of the query text.
        k_nearest_neighbors: Maximum number of matches to print.
        model: Embedding model name, forwarded to ``embedding_from_string``.

    Returns:
        None. Matches are written to stdout.
    """
    # Forward `model` so a non-default model is actually honoured for every embedding.
    embeddings = [embedding_from_string(string, model=model) for string in strings]
    # Embedding of the specific query string
    query_embedding = embeddings[index_of_source_string]
    # Distances between the query embedding and all embeddings (the query itself included)
    distances = distances_from_embeddings(query_embedding, embeddings)
    # Indices ordered from nearest to farthest
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    query_string = strings[index_of_source_string]
    match_count = 0
    for i in indices_of_nearest_neighbors:
        # Skip the query itself (and any entry with exactly the same text).
        if query_string == strings[i]:
            continue
        if match_count >= k_nearest_neighbors:
            break
        match_count += 1
        print(f"Found {match_count} closest match: ")
        print(f"Distance of: {distances[i]}")
        print(strings[i])
            

In [77]:
# Show the nearest plots to the plot at index 2 of movie_plots (default k and model).
print_recommendations_from_strings(movie_plots, 2)

Found 1 closest match: 
Distance of: 0.13664061546949713
In the near future, the unmanned Pilgrim 7 space probe returns from Mars to Earth orbit with soil samples potentially containing evidence of extraterrestrial life. The probe is captured and its samples retrieved by the International Space Station and its six-member crew. Exobiologist Hugh Derry, who is paralyzed from the waist down, revives a dormant cell from the sample, which quickly grows into a multi-celled organism that American school children name "Calvin". Hugh realizes that Calvin's cells can change their specialisation, acting as muscle, sensor, and neuron cells all at once.
An accident in the lab causes Calvin to become dormant; Hugh attempts to revive Calvin with electric shocks, but Calvin immediately becomes hostile and attacks Hugh, crushing his hand. While Hugh lies unconscious from Calvin's attack, Calvin uses Hugh's electric shock tool to escape its enclosure; now free in the laboratory, Calvin devours a lab ra