# **Gloomhaven: Jaws of the Lion chatbot**

A RAG chatbot for the boardgame Gloomhaven: Jaws of the Lion.

### Setting up environment

In [1]:
import fitz
import os
import numpy as np
import json
import openai

### Extracting text from a PDF file
$$\text{(!!!!!Should be in separate .py file!!!!!)}$$

In [15]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.

    Args:
    pdf_path (str): Path to the PDF file.

    Returns:
    str: Extracted text from the PDF.
    """
    # Open the PDF file
    mypdf = fitz.open(pdf_path)
    all_text = ""  # Initialize an empty string to store the extracted text
    
    # Iterate through each page in the PDF
    for page in mypdf:
        # Extract text from the current page and add spacing
        all_text += page.get_text("text") + " "

    # Return the extracted text, stripped of leading/trailing whitespace
    return all_text.strip()

In [16]:
# Define the path to the PDF file
pdf_path = "data/gloomhaven_jotl_glossary.pdf"

# Extract text from the PDF file
extracted_text = extract_text_from_pdf(pdf_path)

# Print the first 500 characters of the extracted text
print(extracted_text[:500])

Most of this document consists of a Glossary of game terms 
in alphabetical order for ease of reference (p. 2-21).  
The last section contains six appendices: 
Component Integration (p. 30):  
Explains what components  
of this game can be used in the  
larger Gloomhaven game. 
Components List (p. 28):  
An accounting of all the 
components in the box.  
Treasure Index (p. 31)
 
An index referenced any time 
a numbered treasure tile in a 
scenario is looted. 
“Misplaced Goods” Index (p. 31):  


### Loading environment variables and initializing OpenAI API client

In [17]:
from dotenv import load_dotenv

# Load the environment variables from the .env file
load_dotenv()

# Initialize the OpenAI API client
client = openai.OpenAI()

### Create Sentence-Level Embeddings
$$\text{(!!!!!Should be in separate .py file!!!!!)}$$

In [18]:
def get_embedding(text, model="text-embedding-3-small"):
    """
    Creates an embedding for the given text using OpenAI.

    Args:
    text (str): Input text.
    model (str): Embedding model name.

    Returns:
    np.ndarray: The embedding vector.
    """
    response = client.embeddings.create(model=model, input=text)
    return np.array(response.data[0].embedding)

In [19]:
# Splitting text into sentences (basic split)
sentences = extracted_text.split(". ")

# Generate embeddings for each sentence
embeddings = [get_embedding(sentence) for sentence in sentences]

print(f"Generated {len(embeddings)} sentence embeddings.")

Generated 686 sentence embeddings.


### Calculate Similarity Differences
$$\text{(!!!!!Should be in separate .py file!!!!!)}$$

In [20]:
def cosine_similarity(vec1, vec2):
    """
    Computes cosine similarity between two vectors.

    Args:
    vec1 (np.ndarray): First vector.
    vec2 (np.ndarray): Second vector.

    Returns:
    float: Cosine similarity.
    """
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [21]:
# Compute similarity between consecutive sentences
similarities = [cosine_similarity(embeddings[i], embeddings[i + 1]) for i in range(len(embeddings) - 1)]

### Implement Semantic Chunking
$$\text{(!!!!!Should be in separate .py file!!!!!)}$$

In [22]:
def compute_breakpoints(similarities, method="percentile", threshold=90):
    """
    Computes chunking breakpoints based on similarity drops.

    Args:
    similarities (List[float]): List of similarity scores between sentences.
    method (str): 'percentile', 'standard_deviation', or 'interquartile'.
    threshold (float): Threshold value (percentile for 'percentile', std devs for 'standard_deviation').

    Returns:
    List[int]: Indices where chunk splits should occur.
    """
    # Determine the threshold value based on the selected method
    if method == "percentile":
        # Calculate the Xth percentile of the similarity scores
        threshold_value = np.percentile(similarities, threshold)
    elif method == "standard_deviation":
        # Calculate the mean and standard deviation of the similarity scores
        mean = np.mean(similarities)
        std_dev = np.std(similarities)
        # Set the threshold value to mean minus X standard deviations
        threshold_value = mean - (threshold * std_dev)
    elif method == "interquartile":
        # Calculate the first and third quartiles (Q1 and Q3)
        q1, q3 = np.percentile(similarities, [25, 75])
        # Set the threshold value using the IQR rule for outliers
        threshold_value = q1 - 1.5 * (q3 - q1)
    else:
        # Raise an error if an invalid method is provided
        raise ValueError("Invalid method. Choose 'percentile', 'standard_deviation', or 'interquartile'.")

    # Identify indices where similarity drops below the threshold value
    return [i for i, sim in enumerate(similarities) if sim < threshold_value]

In [23]:
# Compute breakpoints using the percentile method with a threshold of 90
breakpoints = compute_breakpoints(similarities, method="percentile", threshold=90)

### Split Text Into Semantic Chunks
$$\text{(!!!!!Should be in separate .py file!!!!!)}$$

In [24]:
def split_into_chunks(sentences, breakpoints):
    """
    Splits sentences into semantic chunks.

    Args:
    sentences (List[str]): List of sentences.
    breakpoints (List[int]): Indices where chunking should occur.

    Returns:
    List[str]: List of text chunks.
    """
    chunks = []  # Initialize an empty list to store the chunks
    start = 0  # Initialize the start index

    # Iterate through each breakpoint to create chunks
    for bp in breakpoints:
        # Append the chunk of sentences from start to the current breakpoint
        chunks.append(". ".join(sentences[start:bp + 1]) + ".")
        start = bp + 1  # Update the start index to the next sentence after the breakpoint

    # Append the remaining sentences as the last chunk
    chunks.append(". ".join(sentences[start:]))
    return chunks  # Return the list of chunks

In [25]:
# Create chunks using the split_into_chunks function
text_chunks = split_into_chunks(sentences, breakpoints)

# Print the number of chunks created
print(f"Number of semantic chunks: {len(text_chunks)}")

# Print the first chunk to verify the result
print("\nFirst text chunk:")
print(text_chunks[0])

Number of semantic chunks: 617

First text chunk:
Most of this document consists of a Glossary of game terms 
in alphabetical order for ease of reference (p.


### Create Embeddings for Semantic Chunks
$$\text{(!!!!!Should be in separate .py file!!!!!)}$$

In [26]:
def create_embeddings(text_chunks):
    """
    Creates embeddings for each text chunk.

    Args:
    text_chunks (List[str]): List of text chunks.

    Returns:
    List[np.ndarray]: List of embedding vectors.
    """
    # Generate embeddings for each text chunk using the get_embedding function
    return [get_embedding(chunk) for chunk in text_chunks]

In [27]:
# Create chunk embeddings using the create_embeddings function
chunk_embeddings = create_embeddings(text_chunks)

### Semantic Search
$$\text{(!!!!!Should be in separate .py file!!!!!)}$$

In [28]:
def semantic_search(query, text_chunks, chunk_embeddings, k=5):
    """
    Finds the most relevant text chunks for a query.

    Args:
    query (str): Search query.
    text_chunks (List[str]): List of text chunks.
    chunk_embeddings (List[np.ndarray]): List of chunk embeddings.
    k (int): Number of top results to return.

    Returns:
    List[str]: Top-k relevant chunks.
    """
    # Generate an embedding for the query
    query_embedding = get_embedding(query)
    
    # Calculate cosine similarity between the query embedding and each chunk embedding
    similarities = [cosine_similarity(query_embedding, emb) for emb in chunk_embeddings]
    
    # Get the indices of the top-k most similar chunks
    top_indices = np.argsort(similarities)[-k:][::-1]
    
    # Return the top-k most relevant text chunks
    return [text_chunks[i] for i in top_indices]

In [30]:
# Load the validation data from a JSON file
with open('data/validation_data.json') as f:
    data = json.load(f)

# Extract the first query from the validation data
query = data[0]['question']

# Get top 2 relevant chunks
top_chunks = semantic_search(query, text_chunks, chunk_embeddings, k=2)

# Print the query
print(f"Query: {query}")

# Print the top 2 most relevant text chunks
for i, chunk in enumerate(top_chunks):
    print(f"Context {i+1}:\n{chunk}\n{'='*40}")

Query: What determines the turn order in Gloomhaven?
Context 1:
 
Two figures cannot occupy the same hex.
 Order of Round
Every round of a scenario follows the same order: 
Card Selection
Ordering of Initiative
Character and Monster Turns
End of Round
 Ordering of Initiative
Ordering of Initiative is the second part of every round.
Context 2:

Unless the character is exhausted or declares a long 
rest, two cards are selected at the start of every round, 
with one chosen to be the initiative card. When it is a 
character’s turn, based on the initiative value of their 
initiative card, both cards are played, one after the other 
in either order, with one being used for its top action and 
the other being used for its bottom action.


### Generate a response based on retrieved chunks
$$\text{(!!!!!Should be in separate .py file!!!!!)}$$

In [31]:
# Define the system prompt for the AI assistant
system_prompt = "You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly from the provided context, respond with: 'I do not have enough information to answer that.'"

def generate_response(system_prompt, user_message, model="gpt-4o-mini-2024-07-18"):
    """
    Generates a response from the AI model based on the system prompt and user message.

    Args:
    system_prompt (str): The system prompt to guide the AI's behavior.
    user_message (str): The user's message or query.
    model (str): The model to be used for generating the response. Default is "gpt-4o-mini-2024-07-18".

    Returns:
    dict: The response from the AI model.
    """
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
    )
    return response

# Create the user prompt based on the top chunks
user_prompt = "\n".join([f"Context {i + 1}:\n{chunk}\n=====================================\n" for i, chunk in enumerate(top_chunks)])
user_prompt = f"{user_prompt}\nQuestion: {query}"

# Generate AI response
ai_response = generate_response(system_prompt, user_prompt)

In [32]:
ai_response.choices[0].message.content

'The turn order in Gloomhaven is determined by the initiative value of the initiative card selected at the start of the round. Each character selects two cards, one of which is chosen as the initiative card, and the order of play is based on the initiative values of these cards.'