In [None]:
pip install numpy

In [None]:
pip install tenacity

In [None]:
pip install openai[embeddings]

In [None]:
pip install openai

In [None]:
pip install python-dotenv

In [None]:
import openai

In [None]:
from dotenv import dotenv_values

In [None]:
# Read the settings declared in the local ".env" file into `config`.
config = dotenv_values(".env")

In [None]:
# Give the openai module the key stored under "OPENAI_API_KEY" in `config`.
# NOTE(review): the subscript fails if that entry is missing from .env.
openai.api_key = config["OPENAI_API_KEY"]

In [None]:
# Smoke test: request an embedding for a single sample string.
response = openai.embeddings.create(
    model="text-embedding-3-small",
    input="Your text string goes here",
)

In [None]:
# Print the embedding vector of the first item in the response.
print(response.data[0].embedding)

In [None]:
# Relative path of the CSV file that is loaded into `df` further down.
dataset_path = "./movie_plots.csv"

In [None]:
pip install tiktoken

In [None]:
import pandas as pd
import numpy as np
import pickle
import tiktoken

In [None]:
# Load the whole CSV at `dataset_path` into a DataFrame.
df = pd.read_csv(dataset_path)

In [None]:
# Bollywood rows only, ordered by "Release Year" descending, first 1000 kept.
movies = (
    df.loc[df["Origin/Ethnicity"] == "Bollywood"]
    .sort_values(by="Release Year", ascending=False)
    .head(1000)
)

In [None]:
# Display the selected subset as the cell output.
movies

In [None]:
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [None]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Newlines in ``text`` are replaced with spaces before the request is sent.
    The ``@retry`` decorator re-invokes the call when it raises, waiting a
    randomised exponential back-off (bounded by ``min=1`` and ``max=20``)
    between tries and stopping after 6 attempts.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` field of the first item in the API response.
    """
    # Fix: the original replaced the literal two characters "/n", which never
    # matches a newline; "\n" is the actual line-break character.
    text = text.replace("\n", " ")
    response = openai.embeddings.create(input=text, model=model)
    return response.data[0].embedding

In [None]:
# Path of the pickle file in which the embedding cache is persisted.
embedding_cache_path = "movie_embeddings.pkl"

In [None]:
# Load the persisted embedding cache, or start with an empty one on first run.
# NOTE: unpickling can execute arbitrary code; only load cache files that
# this notebook itself produced.
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
    # Create the cache file only when it was missing. The original wrote it
    # unconditionally, needlessly rewriting an existing cache on every run.
    with open(embedding_cache_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)
        

def embedding_from_string(string, model = "text-embedding-3-small", embedding_cache = embedding_cache):
    """Return the embedding of ``string``, fetching it only on a cache miss.

    The cache is keyed by the ``(string, model)`` pair. On a miss the
    embedding is obtained through ``get_embedding``, stored in
    ``embedding_cache``, and the whole cache is re-pickled to
    ``embedding_cache_path``.

    Args:
        string: Text to embed.
        model: Embedding model name; part of the cache key.
        embedding_cache: Mapping used as the cache. The default is the
            module-level cache object bound when this function was defined.

    Returns:
        The cached or freshly fetched embedding.
    """
    cache_key = (string, model)

    # Fast path: already known, nothing to fetch or persist.
    if cache_key in embedding_cache.keys():
        return embedding_cache[cache_key]

    embedding_cache[cache_key] = get_embedding(string, model)
    print(f"GOT EMBEDDING FROM OPEN AI FOR STRING {string[:20]}")

    # Persist right away so a later failure does not lose this result.
    with open(embedding_cache_path, "wb") as cache_file:
        pickle.dump(embedding_cache, cache_file)

    return embedding_cache[cache_key]
    

In [None]:
# Values of the "Plot" column of `movies`, in the same row order.
movie_plots = movies["Plot"].values

In [None]:
# Encoding that tiktoken associates with the embedding model.
enc = tiktoken.encoding_for_model("text-embedding-3-small")
# Sanity check: display the encoding of a short sample string.
enc.encode("Hello world")

In [None]:
 # Total number of tokens across all plots, as counted by `enc`.
 total_tokens = sum(len(enc.encode(plot)) for plot in movie_plots)
    

In [None]:
# Display the token total as the cell output.
total_tokens

In [None]:
# Estimated spend: a rate of 0.00002 per 1000 tokens applied to the token total.
# NOTE(review): presumably the USD price of text-embedding-3-small — confirm
# against current pricing.
cost = (0.00002 * total_tokens) / 1000

In [None]:
# print() separates its arguments with a space, so the amount follows "$ ".
print("Estimated cost is : $",cost)

In [None]:
# Embed every plot. embedding_from_string checks the cache first, so only
# plots that are not cached yet go through get_embedding.
plot_embeddings = [embedding_from_string(plot, model = "text-embedding-3-small") for plot in movie_plots]

In [None]:
# One embedding per plot, so this should equal len(movie_plots).
len(plot_embeddings)

In [None]:
pip install scipy

In [None]:
from scipy import spatial

def distances_from_embeddings(
    query_embedding: list,
    embeddings: list,
    distance_metric="cosine",
) -> list:
    """Measure how far each candidate embedding lies from the query embedding.

    Args:
        query_embedding: The reference vector.
        embeddings: Candidate vectors to compare against the reference.
        distance_metric: One of "cosine", "L1", "L2" or "Linf".

    Returns:
        A list with one distance per candidate, in the order of ``embeddings``.
    """
    metric_table = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }

    results = []
    for candidate in embeddings:
        # The lookup happens per candidate, so an unknown metric name only
        # fails when there is at least one embedding to measure.
        measure = metric_table[distance_metric]
        results.append(measure(query_embedding, candidate))
    return results

def indices_of_nearest_neighbors_from_distances(distances) -> np.ndarray:
    """Rank positions of ``distances`` from the smallest value to the largest.

    Args:
        distances: Sequence of distance values.

    Returns:
        The indices that would sort ``distances`` in ascending order.
    """
    ranked_positions = np.argsort(distances)
    return ranked_positions


In [None]:
def print_recommendatons_from_strings(strings, index_of_source_string, k_nearest_neighbours=3, model="text-embedding-3-small", movies=movies):
    """Print the strings whose embeddings are closest to a chosen source string.

    Every string is embedded via ``embedding_from_string``, cosine distances
    from the source string's embedding are computed, and the nearest strings
    are printed with their distance. Strings equal to the source string are
    skipped and do not count towards the limit.

    Args:
        strings: Indexable collection of texts to compare.
        index_of_source_string: Position in ``strings`` of the query text.
        k_nearest_neighbours: Maximum number of matches to print.
        model: Embedding model name forwarded to ``embedding_from_string``.
        movies: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        None. Results are written to stdout.
    """
    # Fix: forward `model`; the original ignored it and always used the
    # default model of embedding_from_string.
    embeddings = [embedding_from_string(string, model=model) for string in strings]
    query_embedding = embeddings[index_of_source_string]
    distances = distances_from_embeddings(query_embedding, embeddings)
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)
    query_string = strings[index_of_source_string]

    matches_printed = 0
    for i in indices_of_nearest_neighbors:
        # Skip the query itself and any exact duplicates of it.
        if strings[i] == query_string:
            continue
        # Fix: honour k_nearest_neighbours; the original hard-coded a limit
        # of 2 regardless of the argument.
        if matches_printed >= k_nearest_neighbours:
            break
        matches_printed += 1
        print("::::::::::::::NEW MATCH :::::::::::::::::::")
        print("Found closest match: ",)
        print("The distance is ", distances[i])
        print(strings[i])
    print(indices_of_nearest_neighbors)
    

In [None]:
# Print the plots most similar to the first plot in `movie_plots` (index 0).
print_recommendatons_from_strings(movie_plots, 0)