In [3]:
import pdfplumber
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# Step 1: Extract Text from the PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + " "
    return text

# Replace 'guide.pdf' with the path to your Monopoly manual PDF file
pdf_text = extract_text_from_pdf('guide.pdf')

# Step 2: Clean the Text
clean_text = re.sub(r'\s+', ' ', pdf_text)

# Step 3: Split the Text into Chunks
def split_text(text, max_length=500):
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

text_chunks = split_text(clean_text)

# Step 4: Generate Embeddings for Each Chunk
#embedder = SentenceTransformer('all-mpnet-base-v2')
embedder = SentenceTransformer('all-MiniLM-L12-v2')

chunk_embeddings = embedder.encode(text_chunks)

# Step 5: Store Embeddings in FAISS
chunk_embeddings = np.array(chunk_embeddings)
embedding_dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dimension)
index.add(chunk_embeddings)
index_to_chunk = {i: chunk for i, chunk in enumerate(text_chunks)}

# Step 6: Handle User Query
user_query = input("Ask a question about Monopoly: ")
query_embedding = embedder.encode([user_query])

k = 10  # Increase the number of chunks to retrieve
distances, indices = index.search(np.array(query_embedding), k)
retrieved = list(zip(distances[0], indices[0]))
retrieved.sort(key=lambda x: x[0])
retrieved_chunks = [index_to_chunk[idx] for _, idx in retrieved]

# Optional: Print retrieved chunks to verify their content
print("\nRetrieved Chunks:")
for idx, chunk in enumerate(retrieved_chunks):
    print(f"Chunk {idx+1}:\n{chunk}\n{'-'*40}")

# Step 7: Initialize the QA Pipeline with a Larger Model
qa_pipeline = pipeline(
    'question-answering',
    model='bert-large-uncased-whole-word-masking-finetuned-squad',
    tokenizer='bert-large-uncased-whole-word-masking-finetuned-squad'
)

# Step 8: Combine Retrieved Chunks into Context
context = " ".join(retrieved_chunks)

# Clean the context
context = re.sub(r'[^a-zA-Z0-9\s.,\'"-]', '', context)
context = re.sub(r'\s+', ' ', context)

# Step 9: Handle Long Contexts with Sliding Window
max_chunk_length = 450  # Adjust based on model's max input length
context_chunks = [context[i:i+max_chunk_length] for i in range(0, len(context), max_chunk_length)]

answers = []
for chunk in context_chunks:
    result = qa_pipeline({'question': user_query, 'context': chunk})
    answers.append(result)

# Select the answer with the highest score
best_answer = max(answers, key=lambda x: x['score'])
print("\nAnswer:", best_answer['answer'])
print("Confidence Score:", best_answer['score'])



Retrieved Chunks:
Chunk 1:
You can move the value of one die, the other die, or the sum of both dice. So if you rolled a 1 and a 5, you can move 1 space, 5 spaces, or 6 spaces: \t's your choice. Mr. Monopoly: First, move the sum of the two white dice and resolve the space you land on (such as drawing a card, buying the property, paying rent, etc.). Then, one of two things will happen depending on whether or not there is still property in the bank.
----------------------------------------
Chunk 2:
Once you collect that first $200 salary, you'll use the Speed Die for the rest of the game. This means that some players will start using the die before others. 3. Once you start using the Speed Die, roll it along with the two white dice on your turn. Then do the following depending on what you rolled. 1, 2, or 3: Add this number to the roll of the two white dice. You'll zoom around the board. Bus: This lets you "get off the bus early." Look at the two white dice.
----------------------------

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Answer: Choose to play by the classic rules for buying, renting and selling properties
Confidence Score: 0.3663317561149597


In [21]:
import pdfplumber
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# Step 1: Extract Text from the PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + " "
    return text

pdf_text = extract_text_from_pdf('guide.pdf')

# Step 2: Clean the Text
clean_text = re.sub(r'\s+', ' ', pdf_text)
clean_text = re.sub(r'[^a-zA-Z0-9\s.,\'"-]', '', clean_text)

# Step 3: Split the Text into Chunks with Overlap
def split_text(text, max_length=500, overlap=50):
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            # Start new chunk with overlap
            current_chunk = current_chunk[-overlap:] + sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

text_chunks = split_text(clean_text)

# Step 4: Generate Embeddings for Each Chunk
embedder = SentenceTransformer('all-MiniLM-L12-v2')
chunk_embeddings = embedder.encode(text_chunks)

# Step 5: Store Embeddings in FAISS
chunk_embeddings = np.array(chunk_embeddings)
embedding_dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dimension)
index.add(chunk_embeddings)
index_to_chunk = {i: chunk for i, chunk in enumerate(text_chunks)}

# Step 6: Handle User Query
user_query = input("Ask a question about Monopoly: ")
query_embedding = embedder.encode([user_query])

k = 15  # Increase the number of chunks to retrieve
distances, indices = index.search(np.array(query_embedding), k)
retrieved = list(zip(distances[0], indices[0]))
retrieved.sort(key=lambda x: x[0])
retrieved_chunks = [index_to_chunk[idx] for _, idx in retrieved]

# Optional: Print retrieved chunks to verify their content
print("\nRetrieved Chunks:")
for idx, chunk in enumerate(retrieved_chunks):
    print(f"Chunk {idx+1}:\n{chunk}\n{'-'*40}")

# Step 7: Initialize the QA Pipeline with a Different Model
qa_pipeline = pipeline(
    'question-answering',
    model='deepset/roberta-base-squad2',
    tokenizer='deepset/roberta-base-squad2'
)

# Step 8: Combine Retrieved Chunks into Context
context = " ".join(retrieved_chunks)

# Step 9: Handle Long Contexts with Sliding Window
max_chunk_length = 250  # Adjust based on model's max input length
context_chunks = [context[i:i+max_chunk_length] for i in range(0, len(context), max_chunk_length)]

answers = []
for chunk in context_chunks:
    result = qa_pipeline({'question': user_query, 'context': chunk})
    answers.append(result)

# Print all answers and their scores
for ans in answers:
    print(f"\nAnswer: {ans['answer']}")
    print(f"Confidence Score: {ans['score']}")

# Select the answer with the highest score
best_answer = max(answers, key=lambda x: x['score'])
print("\nBest Answer:", best_answer['answer'])
print("Best Confidence Score:", best_answer['score'])



Retrieved Chunks:
Chunk 1:
of any kind. This is just a "free" resting place. HOUSES When you own all the propertesin a color-group you may buy houses from the Bank and erect them on those properties. If you buy one house, you may put it on any one of those properties. The next house you buy must be erected on one of the unimproved properties of this or any other complete color- group you may own. The price you must pay the Bank for each house is shown on your ltle Deed card for the property on which you erect the house.
----------------------------------------
Chunk 2:
ls must be sold at auction to the highest bidder. SELLING PROPERN Unimproved properties, railroads and utilties but not buildings may be sold to any player as a private transaction - for any amount the owner can get however, no property can be sold to another player if buildings are standing on any properties of that color- group. Any buildings so located must be sold back to the Bank before the owner can sell any prope