In [74]:
import os
#import openai
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import AzureOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI


In [75]:
# Build the Azure OpenAI client used by the completion calls in this notebook.
# Credentials and endpoint are read from environment variables rather than
# being hard-coded.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version="2023-12-01-preview",
    # Falls back to an empty string when AZURE_OPENAI_ENDPOINT is unset.
    # NOTE(review): an empty endpoint presumably only fails at request time,
    # which would hide a missing variable — confirm and consider failing fast.
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") or ""
    )
# This corresponds to the custom name when we deployed a model on the Azure OpenAI service
deployment_name='training-gpt35-deployment-001' 


In [76]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        A single string holding the text of all pages, joined with no
        separator. A page whose ``extract_text()`` result is falsy
        (for example ``None``) contributes an empty string.
    """
    with open(pdf_path, "rb") as pdf_stream:
        pdf_pages = PyPDF2.PdfReader(pdf_stream).pages
        # Extraction happens while the file is still open.
        return "".join(page.extract_text() or "" for page in pdf_pages)

# Load one sample document (the Wiltshire MTFS appendix); its text feeds the
# chunking and keyword-retrieval cells that follow.
text = extract_text_from_pdf('pdf/Wiltshire Item10a_Appendix 1 - MTFS.pdf')

In [77]:
def split_text(text, chunk_size=500, chunk_overlap=50):
    """Break a long string into smaller chunks.

    Args:
        text: The string to split.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Chunk the sample document with the default chunk size and overlap.
splits = split_text(text)


In [78]:
def retrieve_relevant_chunks(chunks, query, top_n=3):
    """Return up to ``top_n`` chunks, preferring those that contain ``query``.

    The match is a case-insensitive substring test. Chunks that contain the
    query are ordered first and the rest follow; within each group the
    original order is kept because the sort is stable. If fewer than
    ``top_n`` chunks match, non-matching chunks fill the remaining slots.
    """
    def contains_query(chunk):
        return query.lower() in chunk.lower()

    ranked = sorted(chunks, key=contains_query, reverse=True)
    return ranked[:top_n]


The code cell below shows exactly how the function above works.

In [None]:
# Toy example: only 'aaa', 'abc', 'aaz' and 'azz' contain the letter "a".
# The function sorts on presence alone, so matching items keep their list order
# and the first three matches are returned.
my_list = ['aaa', 'bbb', 'ccc', 'ddd', 'eee', 'abc', 'def', 'ghi', 'jkl', 'mno', 'aaz', 'azz']
test_result = retrieve_relevant_chunks(my_list, "a", top_n=3)
test_result

In [80]:
def retrieve_relevant_chunks2(chunks, query, top_n=3):
    """Return up to ``top_n`` chunks ranked by how often ``query`` occurs in them.

    Occurrences are counted case-insensitively with ``str.count``. Chunks are
    ordered from the highest count to the lowest; chunks with equal counts keep
    their original relative order because the sort is stable.
    """
    def occurrences(chunk):
        return chunk.lower().count(query.lower())

    ranked = list(chunks)
    ranked.sort(key=occurrences, reverse=True)
    return ranked[:top_n]

In [None]:
# This function ranks better because it sorts on the number of occurrences,
# not merely on whether the substring exists in the chunk.
# `my_list` is the toy list defined in the earlier demo cell.
test_result = retrieve_relevant_chunks2(my_list, "a", top_n=3)
test_result

In [None]:
# Keyword retrieval over the chunks of the sample document, using the
# presence-based function (not the count-based variant) with the default top_n.
results = retrieve_relevant_chunks(splits, "finance")
results

In [83]:
def create_vector_store(chunks):
    """Embed the given text chunks and index them in a FAISS vector store.

    Args:
        chunks: The texts handed to ``FAISS.from_texts``.

    Returns:
        The vector store object produced by ``FAISS.from_texts``.
    """
    # NOTE(review): OpenAIEmbeddings is created with its default configuration
    # and does not use the Azure `client` defined earlier — confirm that the
    # credentials it expects are available in this environment.
    return FAISS.from_texts(chunks, OpenAIEmbeddings())

In [84]:
def build_prompt(context, query):
    """Assemble the RAG prompt text from retrieved context and a question.

    The layout is a ``Context:`` section holding ``context``, a blank line,
    a ``Question:`` line holding ``query``, and a trailing ``Answer:`` cue.
    """
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    return prompt

#print(build_prompt("context......", "questions"))

In [85]:
def generate_response(context, query, max_tokens=1000):
    """Generate an answer to ``query`` grounded in ``context``.

    Args:
        context: Retrieved text placed in the prompt's context section.
        query: The question to answer.
        max_tokens: Value passed as ``max_tokens`` to the completion request.
            Defaults to 1000, the value that was previously hard-coded.

    Returns:
        The text of the first completion choice with surrounding whitespace
        stripped.
    """
    prompt = build_prompt(context, query)
    # `deployment_name` is the notebook-level deployment name; it is sent as
    # the request's `model` argument.
    completion = client.completions.create(
        model=deployment_name,
        prompt=prompt,
        max_tokens=max_tokens,
    )
    return completion.choices[0].text.strip()

In [None]:
# Collect the PDFs in the local 'pdf' folder.
# - Only names ending in '.pdf' (case-insensitive) are kept, so stray files in
#   the folder are not handed to the PDF reader.
# - os.listdir returns entries in arbitrary order; sorting keeps the document
#   order, and therefore the chunk order downstream, the same on every run.
pdf_files = sorted(
    'pdf/' + name
    for name in os.listdir('pdf')
    if name.lower().endswith('.pdf')
)
pdf_files


In [87]:
# pdf_files = [
#     'pdf/derbyshire-dales-medium-term-financial-plan-appendix-1.pdf', 
#     'pdf/middlesborough-council-medium-term-financial-plan.pdf'
# ]

# Extract text from each PDF and split it into chunks, pooling the chunks of
# all documents into one list.
# The per-document text is held in a loop-local name so the notebook-level
# `text` (the sample document used by the keyword-search cells) is not
# overwritten; re-running those cells after this one gives the same result.
all_chunks = []
for pdf_path in pdf_files:
    pdf_text = extract_text_from_pdf(pdf_path)
    all_chunks.extend(split_text(pdf_text))


In [None]:
# Embed every chunk from all PDFs and index them in a single vector store.
vector_store = create_vector_store(all_chunks)
vector_store


Based on the query, find the top n matching chunks and combine them into a single text string.

In [None]:
query = "Are the finances of each council on a sound basis?"

# The number of results is controlled by `k`. An unrecognised keyword such as
# `top_k` is accepted through **kwargs and ignored, which leaves the default
# number of results in effect.
search_results = vector_store.similarity_search(query, k=10)
contents = [result.page_content for result in search_results]
# Separate the retrieved chunks with a blank line so they stay distinguishable
# inside the prompt.
context = '\n\n'.join(contents)

context


In [None]:
# Send the retrieved context and the query to the deployed model and print
# its answer.
response = generate_response(context, query)
print("Response:", response)

