In [135]:
import os

import numpy as np
import openai
import pandas as pd

# Model used to generate answers, and model used to embed text.
COMPLETIONS_MODEL = 'text-davinci-003'
EMBEDDING_MODEL = 'text-embedding-ada-002'
# Read the key from the environment so it never has to be typed into (or
# committed with) the notebook. Falls back to '' -- the value that used to be
# hard-coded here -- when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

# Load in the card database

In [136]:
# Read the card table and key it by card name in one chained expression.
df = pd.read_csv('../data/embedding.csv').set_index(['name'])
print(f'length: {len(df)}')
# Preview five randomly chosen rows as the cell's displayed value.
df.sample(5)

length: 5


Unnamed: 0_level_0,description
name,Unnamed: 1_level_1
Anothos,Anothos is a 'Guardian Weapon – Hammer (2H)' c...
Alpha Rampage,Alpha Rampage is a 'Brute Action – Attack' car...
Awakening Bellow (Red),Awakening Bellow (Red) is a 'Brute Action' car...
Ancestral Empowerment,Ancestral Empowerment is a 'Ninja – Attack Rea...
Awakening Bellow (Yellow),Awakening Bellow (Yellow) is a 'Brute Action' ...


# Create Embeddings

In [137]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    `model` selects the embedding model and defaults to EMBEDDING_MODEL.
    Returns the 'embedding' field of the first item in the response's 'data' list.
    """
    response = openai.Embedding.create(model=model, input=text)
    first_item = response['data'][0]
    return first_item['embedding']

def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.

    The text that gets embedded is the row's `description` column; one call to
    get_embedding is made per row.

    Return a dictionary that maps each row's index label to the embedding vector
    computed for that row. If the index contains duplicate labels, the last row
    with a given label wins.

    NOTE(review): the return annotation says the keys are (str, str) tuples, but
    the keys are simply whatever labels `df.index` holds -- confirm and tighten
    the annotation against the real index.
    """
    # Only the description column is needed, so iterate over that Series directly
    # instead of materialising every full row with iterrows().
    return {
        idx: get_embedding(description) for idx, description in df['description'].items()
    }

# Compute embeddings

In [138]:
# Embed every card description up front. compute_doc_embeddings calls the
# Embeddings API once per row of df, so the cost of this cell grows with the table.
document_embedding = compute_doc_embeddings(df)

In [139]:
# An example embedding: take the first (index label, vector) pair stored in the
# dict, then print the label, the first five components of the vector, and the
# vector's total number of entries.
example_entry = list(document_embedding.items())[0]
print(f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")

Alpha Rampage : [-0.00880606472492218, -0.020165005698800087, -0.011521890759468079, -0.011501521803438663, -0.007970948703587055]... (1536 entries)


# Find the most similar document

In [140]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Score how similar two vectors are by taking their dot product.

    For vectors of length 1 the dot product equals the cosine similarity; the
    original note on this function states that OpenAI embeddings are normalized
    that way.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)

def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, str), np.array]) -> list[(float, (str, str))]:
    """
    Rank the pre-computed document embeddings by how closely they match a query.

    The query is embedded with get_embedding and scored against every vector in
    `contexts` using vector_similarity.

    Return a list of (similarity, document key) tuples, highest similarity first.
    """
    query_vector = get_embedding(query)

    scored_sections = []
    for section_key, section_vector in contexts.items():
        score = vector_similarity(query_vector, section_vector)
        scored_sections.append((score, section_key))

    # Tuples compare by score first (then by key on ties); reverse=True puts
    # the best match at the front of the list.
    scored_sections.sort(reverse=True)
    return scored_sections

In [141]:
# Sanity check: rank every embedded card against a question about one specific
# card and display the five best (similarity, index label) pairs.
order_document_sections_by_query_similarity("In the card game Flesh and Blood, what does the card Ancestral Empowerment do?", document_embedding)[:5]

[(0.8664303053959408, 'Ancestral Empowerment'),
 (0.7900054343396351, 'Awakening Bellow (Red)'),
 (0.7827484555069621, 'Awakening Bellow (Yellow)'),
 (0.7820099972823491, 'Anothos'),
 (0.7780427876827074, 'Alpha Rampage')]

# Add the most relevant sections to the query prompt

In [142]:
# Prefix placed in front of each context section when the prompt is assembled,
# so every section starts on its own line as a "* " bullet.
SEPARATOR = "\n* "
# NOTE(review): ENCODING is not referenced by any of the visible cells -- confirm
# whether it is still needed.
ENCODING = "gpt2"  # encoding for text-davinci-003

In [143]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame, max_sections: int = 2) -> str:
    """
    Build a completion prompt that pairs a question with the most relevant context sections.

    The sections are ranked with order_document_sections_by_query_similarity and the
    top `max_sections` of them are looked up in `df` by index label. Each selected
    row's `description` (with "\\r\\n" replaced by a space) is prefixed with SEPARATOR
    and placed under a "Context:" header, followed by the question and an empty
    answer slot.

    Parameters:
        question: the question to put in the prompt; also used to rank the sections.
        context_embeddings: mapping of df index label -> embedding vector.
        df: dataframe indexed by the same labels, with a `description` column.
        max_sections: how many of the best-matching sections to include
            (non-negative; defaults to 2, the number that used to be hard-coded).

    Returns:
        The complete prompt string.

    Side effect: prints the number of selected sections and their index labels.
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    chosen_sections = []
    chosen_sections_indexes = []

    # Keep only the best-matching sections; the ranking is already sorted best-first.
    for _, section_index in most_relevant_document_sections[:max_sections]:
        document_section = df.loc[section_index]

        chosen_sections.append(SEPARATOR + document_section.description.replace("\r\n", " "))
        chosen_sections_indexes.append(str(section_index))

    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

In [144]:
# Build a prompt for a sample question. construct_prompt also prints which
# sections it selected, so that list appears above the prompt in the output.
prompt = construct_prompt(
    'In the card game Flesh and Blood, what does the card Ancestral Empowerment do? '
    'Make sure to include information when appropriate for the class, card type, cost, pitch, defence, power, and any abilities.',
    document_embedding,
    df
)

# Show the assembled prompt beneath a "===" divider line.
print("===\n", prompt)

Selected 2 document sections:
Ancestral Empowerment
Awakening Bellow (Red)
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* Ancestral Empowerment is a 'Ninja – Attack Reaction' card from the 'Welcome to Rathe' set. It costs 0, pitches for 1, defends for 3, has None power, and has the abilities; Target Ninja attack action card gains +1{p}.  Draw a card.
* Awakening Bellow (Red) is a 'Brute Action' card from the 'Welcome to Rathe' set. It costs 1, pitches for 1, defends for 3, has None power, and has the abilities; The next Brute attack action card you play this turn gains +3{p}.  **Intimidate**  **Go again**

 Q: In the card game Flesh and Blood, what does the card Ancestral Empowerment do? Make sure to include information when appropriate for the class, card type, cost, pitch, defence, power, and any abilities.
 A:


# Use the prompt with context

In [145]:
# Keyword arguments passed along with the prompt on every completion request.
COMPLETIONS_API_PARAMS = dict(
    # Temperature 0.0 is chosen because it gives the most predictable, factual answer.
    temperature=0.0,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)

In [146]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[(str, str), np.array],
    show_prompt: bool = False
) -> str:
    """
    Answer `query` with the completions model, using context drawn from `df`.

    The prompt is built by construct_prompt from the query, the pre-computed
    `document_embeddings`, and `df`. When `show_prompt` is true the prompt is
    printed before the request is sent.

    Returns the text of the first completion choice with leading and trailing
    spaces and newlines removed.
    """
    full_prompt = construct_prompt(query, document_embeddings, df)
    if show_prompt:
        print(full_prompt)

    completion = openai.Completion.create(prompt=full_prompt, **COMPLETIONS_API_PARAMS)
    first_choice = completion["choices"][0]
    return first_choice["text"].strip(" \n")

In [148]:
# Run the whole pipeline for a different card. The returned answer string is the
# cell's displayed value; the "Selected ..." lines above it are printed by
# construct_prompt.
answer_query_with_context('In the card game Flesh and Blood, what does the card '
                          'Awakening Bellow (Yellow) do? '
                          'Make sure to include information when appropriate for the '
                          'class, card type, cost, pitch, defence, power, and any abilities.',
                          df,
                          document_embedding)

Selected 2 document sections:
Awakening Bellow (Yellow)
Awakening Bellow (Red)


'Awakening Bellow (Yellow) is a Brute Action card from the Welcome to Rathe set. It costs 1, pitches for 2, defends for 3, has None power, and has the abilities; The next Brute attack action card you play this turn gains +2{p}. Intimidate and Go again.'