In [1]:
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
from scipy import spatial
import tiktoken

In [2]:
# Project layout: the notebook's working directory sits one level below the
# project root, and the data lives in <root>/data.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_DIR = os.path.join(BASE_DIR, "data")
DATA_PATH = os.path.join(DATA_DIR, "books_data_sample_embeddings.csv")

# Read the books sample (with its embedding column) into a dataframe.
df = pd.read_csv(DATA_PATH)

In [3]:
# Load the API key from the .env file.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Validate the key BEFORE building the client, so a missing key surfaces as the
# explicit, actionable error below rather than whatever the client constructor
# does when handed an empty key.
if not OPENAI_API_KEY:
    raise ValueError("OpenAI API key not found! Make sure it's in the .env file.")
client = OpenAI(api_key=OPENAI_API_KEY)

# Model used to embed queries, and the default chat model for the helpers below.
EMBEDDING_MODEL = "text-embedding-3-small"
GPT_MODEL = "gpt-4o"

## Basic query completion

In [4]:
# Baseline: send the request straight to the chat model, with no book catalog
# supplied as context, and display the raw response object.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "I would like to read a book about two friends who live in napoli and grow up together",
        },
    ],
    max_tokens=150,
)
response

ChatCompletion(id='chatcmpl-B5KJxzsQXARqiqw6uaEOzA9C8HkhG', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='You might be interested in "My Brilliant Friend" by Elena Ferrante. This novel is the first book in the "Neapolitan Novels" series and follows the lives of two childhood friends, Elena and Lila, as they grow up in a poor neighborhood in Naples, Italy. The story explores their complex friendship, personal ambitions, and the social issues of their times. It\'s a beautifully written exploration of female friendship, identity, and the environment they grew up in. If you’re looking for a deep character study set in Naples, this book would be a great choice!', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1740609453, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier='default', system_fingerprint='fp_7fcd609668', usage=CompletionUsage(completion_tokens=117, prompt_toke

In [5]:
# Same baseline call with a second query; the reply is capped at 150 tokens.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "I would like to read a book about making money",
        },
    ],
    max_tokens=150,
)
response

ChatCompletion(id='chatcmpl-B5KJygayKeqriyn2vzZ1QddMjdJYk', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='There are many excellent books on making money, ranging from investment strategies to entrepreneurship. Here are some popular recommendations across different approaches:\n\n### Investment Strategies\n1. **"The Intelligent Investor" by Benjamin Graham** - A classic on value investing that teaches the fundamentals of sound investment principles.\n2. **"A Random Walk Down Wall Street" by Burton Malkiel** - Offers insights into various investment strategies and the importance of a diversified portfolio.\n3. **"Rich Dad Poor Dad" by Robert Kiyosaki** - This book contrasts two different perspectives on money management and investing, emphasizing financial education.\n\n### Personal Finance\n4. **"The Total Money Makeover" by Dave Ramsey** - Provides a step-by-step plan for financial fitness,', refusal=None, role='assistant', audio

## Custom query completion

In [6]:
## Reference: https://cookbook.openai.com/examples/question_answering_using_embeddings
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[tuple[str, ...], tuple[str, ...], tuple[float, ...]]:
    """Rank the rows of ``df`` by relatedness to ``query``, most related first.

    The query is embedded with ``EMBEDDING_MODEL`` through the module-level
    ``client`` and compared against each row's stored embedding.

    Args:
        query: Free-text search query to embed.
        df: Dataframe with ``Title``, ``description`` and ``embedding`` columns.
            Each ``embedding`` cell is parsed with ``ast.literal_eval``, so it
            must be a string holding a Python literal (presumably a list of
            floats as written to the CSV -- confirm against the data file).
        relatedness_fn: Callable ``(query_embedding, row_embedding) -> score``;
            higher scores rank first. Defaults to ``1 - cosine distance``.
        top_n: Maximum number of results to return.

    Returns:
        Three parallel tuples ``(titles, descriptions, relatednesses)`` sorted
        from most to least related and truncated to ``top_n`` entries. All
        three are empty when ``df`` has no rows.
    """
    # Function-scope import so this definition is self-contained.
    import ast

    # Nothing to rank: avoid a pointless embeddings request and the
    # "not enough values to unpack" error that zip(*[]) would cause below.
    if len(df) == 0:
        return (), (), ()

    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding

    # NOTE(security): this previously used eval() on the CSV cell contents,
    # which would execute arbitrary code embedded in the data file.
    # ast.literal_eval only accepts Python literals.
    strings_and_relatednesses = [
        (title, description, relatedness_fn(query_embedding, ast.literal_eval(embedding)))
        for title, description, embedding in zip(
            df["Title"], df["description"], df["embedding"]
        )
    ]
    strings_and_relatednesses.sort(key=lambda item: item[2], reverse=True)
    titles, strings, relatednesses = zip(*strings_and_relatednesses)
    return titles[:top_n], strings[:top_n], relatednesses[:top_n]

def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens ``text`` encodes to under ``model``'s tiktoken encoding."""
    return len(tiktoken.encoding_for_model(model).encode(text))


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int,
    top_n: int = 100,
) -> str:
    """Build the prompt for GPT: instructions, relevant book texts, then the question.

    Args:
        query: The user's question.
        df: Dataframe handed to ``strings_ranked_by_relatedness`` to find
            candidate books.
        model: Model name used by ``num_tokens`` to measure the prompt.
        token_budget: Maximum number of tokens the returned message may use.
        top_n: Maximum number of ranked candidates to consider (default 100).

    Returns:
        The introduction, followed by as many of the top-ranked book entries as
        fit within ``token_budget``, followed by the question.
    """
    # Only titles and descriptions are needed; the relatedness scores are discarded.
    titles, strings, _ = strings_ranked_by_relatedness(query, df, top_n=top_n)
    introduction = 'Use the below articles to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for title, string in zip(titles, strings):
        next_article = f'\n\nBook title: {title}, Book description:\n"""\n{string}\n"""'
        # Candidates arrive most-related first; stop at the first one that
        # would push the full prompt (including the question) over budget.
        if num_tokens(message + next_article + question, model=model) > token_budget:
            break
        message += next_article
    return message + question


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer ``query`` with GPT, grounded in book texts retrieved from ``df``.

    Args:
        query: The user's question.
        df: Dataframe of books and embeddings to search. The default is the
            module-level ``df`` as it existed when this function was defined.
        model: Chat model used both for token counting and for the completion.
        token_budget: Maximum number of tokens for the assembled prompt.
        print_message: When True, print the assembled prompt before sending it.

    Returns:
        The content of the first choice in the chat completion response.
    """
    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": message},
    ]
    # temperature=0 is passed to keep answers as repeatable as possible.
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0
    )
    # The leftover debug print of the raw response object was removed: it
    # flooded the cell output and callers only need the answer text.
    return response.choices[0].message.content


In [7]:
# Catalog-grounded version of the first query; print_message=True echoes the
# assembled prompt so the retrieved books are visible.
ask(
    "I would like to read a book about two friends who live in napoli and grow up together",
    df=df,
    print_message=True,
)

Use the below articles to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."

Book title: Gift Giver, Book description:
"""
Ten-year-old Doris's gradual friendship with the new boy in her inner-city neighborhood, Amir, brings her into a closer understanding with her family even though it divides her from her old friends.
"""

Book title: Black and White, Book description:
"""
Two star high school basketball players, one black and one white, experience the justice system differently after committing a crime together and getting caught. An ALA Best Book for Young Adults. Reprint.
"""

Book title: Ass Whippings, Book description:
"""
Ass Whippings is a story for youth and adults about six boys who, are placed in two orphanages. Based on true events, the story takes the reader back in time to an era that will seem unbelievable to our contemporary generation of youth and adults. The reader will find the journey these boys take 

'I could not find an answer.'

In this example, the custom query completion works as intended: it retrieves the most relevant book descriptions from the catalog, and the language model reasons that none of them matches the request.

In [8]:
# Catalog-grounded version of the second query, again echoing the prompt.
ask(
    "I would like to read a book about making money",
    df=df,
    print_message=True,
)

Use the below articles to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."

Book title: Building Wealth from the Ground Up, Book description:
"""
"Building Wealth from the Ground Up" is a strategic real-life approach to climbing out of the basement of debt to finding dollars to build your own financial empire. This book is able to catapult the reader to living their dreams of no financial worries, owning their own business, and retiring early. The consensus among the author's contemporaries is that Mikel Brown's counsel is like a check; all you have to do is cash it!
"""

Book title: The Law of Success - Vol I : Principles of Self-Mastery, Book description:
"""
For students of Napoleon Hill's philosophy for creating riches, "Think and Grow Rich" was only the beginning. This volume expands on the previous work's theme.
"""

Book title: From the Horse's Mouth, Book description:
"""
A professional animal communicator shares

'Based on the descriptions provided, "Building Wealth from the Ground Up" would be a suitable book for you. It offers a strategic approach to climbing out of debt and building a financial empire, which aligns with your interest in making money.'

In this example, the custom query completion again works as intended: it retrieves the relevant book descriptions, and the model identifies a book among them that matches the request.

## Conclusion

In this notebook, we learned how to do question answering with embeddings-based search over an Amazon Books dataset. This approach is useful when you have a catalog of books and want to search it by description.