In [19]:
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
from scipy import spatial
import tiktoken

In [2]:
# Location of the pre-computed book embeddings CSV. The original absolute path
# is kept as the default so existing runs behave the same; set the
# BOOKS_EMBEDDINGS_CSV environment variable to point at the file on another machine.
BOOKS_EMBEDDINGS_CSV = os.getenv(
    "BOOKS_EMBEDDINGS_CSV",
    "/home/lizette/Documents/udacity_LLMs/project/books_data_embeddings/books_data_embeddings_0_1000.csv",
)
# The search helper further down reads the "Title", "description" and
# "embedding" columns of this frame.
df = pd.read_csv(BOOKS_EMBEDDINGS_CSV)

In [5]:
# Load API key from .env file
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Validate the key BEFORE building the client, so that a missing key always
# surfaces as this explicit message and the constructor only ever sees a
# non-empty key.
if not OPENAI_API_KEY:
    raise ValueError("OpenAI API key not found! Make sure it's in the .env file.")
client = OpenAI(api_key=OPENAI_API_KEY)

# Model used to embed search queries.
EMBEDDING_MODEL = "text-embedding-3-small"
# Default chat model used for answering and for token counting.
GPT_MODEL = "gpt-3.5-turbo"

In [7]:
# Basic example: one chat completion built only from a system prompt and the
# user's request (no book descriptions are added to the prompt here).
example_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "I would like to read a book about two friends who live in napoli and grow up together"},
]
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=example_messages,
    max_tokens=150,  # caps the length of the generated reply
)
response

ChatCompletion(id='chatcmpl-B0Ez29Y97VFCpYqGU5sojbeQeYZcj', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='A wonderful book that fits your criteria is "My Brilliant Friend" by Elena Ferrante. It is the first book in Ferrante\'s "Neapolitan Novels" series. The story is set in Naples and revolves around the childhood and adolescence of two girls, Elena Greco and Raffaella "Lila" Cerullo, as they navigate the complexities of friendship, family, and the socio-political landscape of Italy in the 1950s and beyond.\n\nThe book beautifully captures the nuances of their friendship, the challenges they face, and the way their lives intertwine over the years. The vivid depiction of Naples and its culture adds depth to the narrative, making it a rich and engaging read.\n\nIf you\'re looking for a compelling and insightful', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1739397296, model='gpt-4o-mi

In [1]:
## From openai cookbook

# search function from udacity course
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[tuple[str, ...], tuple[str, ...], tuple[float, ...]]:
    """Rank the rows of ``df`` by relatedness to ``query``, most related first.

    Args:
        query: Free-text search string; it is embedded with ``EMBEDDING_MODEL``
            through the module-level ``client``.
        df: Frame with the columns ``"Title"``, ``"description"`` and
            ``"embedding"``. Each embedding is either a string holding a Python
            list literal (as produced by a CSV round-trip) or an already-parsed
            sequence of numbers.
        relatedness_fn: Callable ``(query_embedding, row_embedding) -> score``;
            higher means more related. Defaults to cosine similarity.
        top_n: Maximum number of results to return.

    Returns:
        Three parallel tuples ``(titles, descriptions, relatednesses)`` sorted
        by descending relatedness. All three are empty when ``df`` has no rows
        or ``top_n`` selects nothing.

    Raises:
        ValueError / SyntaxError: if an embedding string is not a valid Python
            literal.
    """
    import ast  # local import: only this function needs it

    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding

    def _parse_embedding(raw):
        # literal_eval accepts only Python literals, so - unlike eval() - a
        # crafted cell in the CSV cannot execute arbitrary code.
        return ast.literal_eval(raw) if isinstance(raw, str) else raw

    ranked = sorted(
        (
            (title, description, relatedness_fn(query_embedding, _parse_embedding(raw)))
            for title, description, raw in zip(df["Title"], df["description"], df["embedding"])
        ),
        key=lambda item: item[2],
        reverse=True,
    )
    top = ranked[:top_n]
    if not top:
        # zip(*[]) yields nothing, so unpacking it into three names would raise.
        return (), (), ()
    titles, strings, relatednesses = zip(*top)
    return titles, strings, relatednesses

def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count how many tokens ``text`` encodes to under the tokenizer for ``model``."""
    return len(tiktoken.encoding_for_model(model).encode(text))


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the user prompt for GPT from the most related book descriptions.

    The prompt is the fixed instruction text, then as many of the top 5 ranked
    books as fit, then the question. Books are added in ranking order; the
    first one that would push the whole prompt (question included) over
    ``token_budget`` tokens for ``model`` stops the loop.
    """
    titles, descriptions, _scores = strings_ranked_by_relatedness(query, df, top_n = 5)
    header = 'Use the below articles to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."'
    footer = f"\n\nQuestion: {query}"
    parts = [header]
    for title, description in zip(titles, descriptions):
        candidate = f'\n\nBook title: {title}, Book description:\n"""\n{description}\n"""'
        # Measure the prompt as it would look with this book and the question appended.
        prospective = "".join(parts) + candidate + footer
        if num_tokens(prospective, model=model) > token_budget:
            break
        parts.append(candidate)
    parts.append(footer)
    return "".join(parts)


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer ``query`` with a chat model, grounding it in book descriptions from ``df``.

    Prints the token budget, optionally the full prompt (``print_message``),
    and the raw completion object, then returns the text of the first choice.
    The default ``df`` is whatever the module-level ``df`` was when this
    function was defined.
    """
    print("Token budget:", token_budget)
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0
    )
    print(completion)
    return completion.choices[0].message.content


NameError: name 'pd' is not defined

In [20]:
# Same request as the basic example, now answered from the book descriptions in df.
napoli_query = "I would like to read a book about two friends who live in napoli and grow up together"
ask(napoli_query, df=df)

Token budget: 3596
ChatCompletion(id='chatcmpl-B0F28PDixVFiv4EZzokzBOmN9jHGy', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='I could not find an answer in the provided articles.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1739397488, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=12, prompt_tokens=1124, total_tokens=1136, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))


'I could not find an answer in the provided articles.'

In [23]:
# Second request, answered from the book descriptions in df.
feminist_query = "I would like to read a feminist novel"
ask(feminist_query, df=df)

Token budget: 3596
ChatCompletion(id='chatcmpl-B0F3K6EZpukkqgeGtwgb4JM1VGy5S', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='You can consider reading "Herland" by Charlotte Perkins Gillman. It is a feminist novel that explores ideas about gender, motherhood, community, and sexuality in a science-fiction story.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1739397562, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=39, prompt_tokens=680, total_tokens=719, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))


'You can consider reading "Herland" by Charlotte Perkins Gillman. It is a feminist novel that explores ideas about gender, motherhood, community, and sexuality in a science-fiction story.'