In [2]:
import numpy as np
import openai
import pandas as pd

COMPLETIONS_MODEL = 'text-davinci-003'
EMBEDDING_MODEL = 'text-embedding-ada-002'
openai.api_key = '' # FIXME don't post your API key in a public repo ðŸ¤ª

# Load in the card database

As we will be using a knowledge base to aid the model in giving factual answers we will need to load said database in, I have opted to use a pandas dataframe for said knowledge base.

In [3]:
df = pd.read_csv('../data/embedding.csv')
df = df.set_index(['name'])
print(f'length: {len(df)}')
df.sample(2)

length: 2


Unnamed: 0_level_0,description
name,Unnamed: 1_level_1
Ancestral Empowerment,Ancestral Empowerment is a 'Ninja â€“ Attack Rea...
Alpha Rampage,Alpha Rampage is a 'Brute Action â€“ Attack' car...


# Create Embeddings

By creating an embedding of each card description within the database we can transform the linguistic data contained about each card and transform them into vectors within high dimensional space.

In [4]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    result = openai.Embedding.create(
      model=model,
      input=text
    )
    return result['data'][0]['embedding']

def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.

    Return a dictionary that maps between each embedding vector and the index of the row that it corresponds to.
    """
    return {
        idx: get_embedding(r.description) for idx, r in df.iterrows()
    }

# Compute embeddings

In [5]:
document_embedding = compute_doc_embeddings(df)

In [6]:
# An example embedding:
example_entry = list(document_embedding.items())[0]
print(f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")

Alpha Rampage : [-0.007986833341419697, -0.02000109851360321, -0.012000659480690956, -0.01263334695249796, -0.006371098570525646]... (1536 entries)


# Find the most similar document

Once the cards have been transformed into vectors we are able to calculate their relatedness with traditional distance functions such as Euclidean distance. In this case because the OpenAI embeddings are normalized, a Cosine similarity will be able to be performed slightly faster but will result in an identical ranking as Euclidean distance.

In [7]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Returns the similarity between two vectors.

    Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
    """
    return np.dot(np.array(x), np.array(y))

def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, str), np.array]) -> list[(float, (str, str))]:
    """
    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections.

    Return the list of document sections, sorted by relevance in descending order.
    """
    query_embedding = get_embedding(query)

    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)

    return document_similarities

In [8]:
order_document_sections_by_query_similarity("In the card game Flesh and Blood, what does the card Ancestral Empowerment do?", document_embedding)[:5]

[(0.8645458537314629, 'Ancestral Empowerment'),
 (0.7785378749394383, 'Alpha Rampage')]

# Add most relevant section to the query prompt

When constructing prompts for the system we can calculate the distance between the prompt and the cards in the knowledge base, and fetch the most relevant cards. By including the nth most relevant context we are able to aid the system in providing more factual answers.

In [19]:
SEPARATOR = "\n\n* "
ENCODING = "gpt2"  # encoding for text-davinci-003

In [30]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Fetch relevant
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []

    # Add two most relevant contexts
    for _, section_index in most_relevant_document_sections[:2]:
        document_section = df.loc[section_index]

        chosen_sections.append(SEPARATOR + document_section.description.replace("\r\n", " "))
        chosen_sections_indexes.append(str(section_index))

    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

In [31]:
prompt = construct_prompt(
    'In the card game Flesh and Blood, what does the card Ancestral Empowerment do? '
    'Make sure to include information when appropriate for the class, card type, cost, pitch, defence, power, and any abilities.',
    document_embedding,
    df
)

print("===\n", prompt)

Selected 2 document sections:
Ancestral Empowerment
Alpha Rampage
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* Ancestral Empowerment is a 'Ninja â€“ Attack Reaction' card from the 'Welcome to Rathe' set. It costs 0, pitches for 1, defends for 3, has None power, and has the abilities; Target Ninja attack action card gains +1{p}.

Draw a card.

* Alpha Rampage is a 'Brute Action â€“ Attack' card from the 'Welcome to Rathe' set. It costs 3, pitches for 1, defends for 3, has 9 power, and has the abilities; **Rhinar Specialization**

As an additional cost to play Alpha Rampage, discard a random card.

When you attack with Alpha Rampage, **intimidate**.

 Q: In the card game Flesh and Blood, what does the card Ancestral Empowerment do? Make sure to include information when appropriate for the class, card type, cost, pitch, defence, power, and any abilities.
 A:


# Use the prompt with context

In [22]:
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}

In [23]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[(str, str), np.array],
    show_prompt: bool = False
) -> str:
    prompt = construct_prompt(
        query,
        document_embeddings,
        df
    )

    if show_prompt:
        print(prompt)

    response = openai.Completion.create(
                prompt=prompt,
                **COMPLETIONS_API_PARAMS
            )

    return response["choices"][0]["text"].strip(" \n")

In [24]:
def answer_query_without_context(
    query: str,
    show_prompt: bool = False
) -> str:

    if show_prompt:
        print(prompt)

    response = openai.Completion.create(
                prompt=query,
                **COMPLETIONS_API_PARAMS
            )

    return response["choices"][0]["text"].strip(" \n")

In [33]:
answer_query_without_context('In the card game Flesh and Blood, what does the card '
                             'Ancestral Empowerment do? '
                             'Make sure to include information when appropriate for the '
                             'class, card type, cost, pitch, defence, power, and any abilities.')

'Ancestral Empowerment is a Hero card from the Flesh and Blood trading card game. It is a Rare card of the Hero type, and it costs 4 Resources to play. It has a Pitch of 4, a Defence of 4, and a Power of 4.\n\nAncestral Empowerment has the following ability: "When Ancestral Empowerment enters play, you may search your deck for a card with the same name as a card in your graveyard and put it into your hand. Shuffle your deck afterwards." This ability allows you to search your deck for a card with the same name as a card in your graveyard and put it into your hand. This can be a great way to get back cards that have been discarded or destroyed.'

We can see that when the system is not given context it 'hallucinates' plausible sounding information, however, it is unfortunately completely fabricated.

In [34]:
answer_query_with_context('In the card game Flesh and Blood, what does the card '
                          'Ancestral Empowerment do? '
                          'Make sure to include information when appropriate for the '
                          'class, card type, cost, pitch, defence, power, and any abilities.',
                          df,
                          document_embedding)

Selected 2 document sections:
Ancestral Empowerment
Alpha Rampage


"Ancestral Empowerment is a 'Ninja â€“ Attack Reaction' card from the 'Welcome to Rathe' set. It costs 0, pitches for 1, defends for 3, has None power, and has the ability; Target Ninja attack action card gains +1{p}."

When provided with the relevant context we see that the 'hallucination' problem is greatly mitigated, however, we do see that the model is copying the information within the context verbatim, which is a major limitation of this approach.