In [1]:
pip install PyPDF2 sentence-transformers faiss-cpu transformers

Collecting sentence-transformers
  Using cached sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers
  Using cached transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Using cached sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
Using cached transformers-4.46.3-py3-none-any.whl (10.0 MB)
Installing collected packages: transformers, sentence-transformers
Successfully installed sentence-transformers-3.3.1 transformers-4.46.3
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\TE\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [8]:
import os
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline, LlamaTokenizer
import faiss
import numpy as np
import requests

In [None]:


# Step 1: Extract Text from PDF
def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    title = reader.metadata.get('/Title', None)  # Extract the title from metadata
    title = title if title else "Unknown Title"
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text, title

# Step 2: Chunk Text
def chunk_text(text, chunk_size=500):
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield " ".join(words[i:i + chunk_size])

# Step 3: Create Embeddings and FAISS Index
def create_faiss_index(chunks, model):
    embeddings = [model.encode(chunk) for chunk in chunks]
    embeddings = np.array(embeddings).astype("float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    return index, embeddings


def generate_answer(query, context):
    prompt = f"Context: {context}\n\nQuestion: {query}"
    return query_ollama(prompt)

# Step 4: Retrieve Relevant Chunks
def retrieve(query, index, model, chunks, top_k=3):
    query_embedding = model.encode(query).astype("float32")
    distances, indices = index.search(query_embedding.reshape(1, -1), top_k)
    return [chunks[i] for i in indices[0]]

def retrieve_with_metadata(query, index, model, chunks, metadata, top_k=3):
    query_embedding = model.encode(query).astype("float32")
    distances, indices = index.search(query_embedding.reshape(1, -1), top_k)
    retrieved_chunks = [chunks[i] for i in indices[0]]
    retrieved_metadata = [metadata[i] for i in indices[0]]
    return retrieved_chunks, retrieved_metadata

def truncate_context(context, query, max_tokens=2048):
    # Reserve space for the query and additional prompt text
    reserved_tokens = 300  # Adjust this based on the length of your query
    max_context_tokens = max_tokens - reserved_tokens

    # Truncate context to fit within the limit
    context_tokens = context.split()  # Tokenize the context
    if len(context_tokens) > max_context_tokens:
        context = " ".join(context_tokens[:max_context_tokens])

    return context


def query_ollama(prompt, model="llama3.2", server_url="http://localhost:11435/api/generate"):
    headers = {"Content-Type": "application/json"}
    payload = {"model": model, "prompt": prompt}

    response = requests.post(server_url, headers=headers, json=payload, stream=True)
    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code} - {response.text}")

    # Print and collect the streamed response
    print("Response: ", end="", flush=True)  # Start the response line
    full_response = ""
    for line in response.iter_lines():
        if line:
            data = json.loads(line.decode("utf-8"))
            part = data.get("response", "")
            print(part, end="", flush=True)  # Print the response part immediately
            full_response += part
            if data.get("done", False):
                break

    print()  # Finish the response line
    return full_response

def summarize_context(context):
    prompt = f"Summarize the following text:\n\n{context}"
    return query_ollama(prompt)

# Step 1: Extract text from multiple PDFs
def process_multiple_pdfs(folder_path):
    all_chunks = []
    chunk_metadata = []  # Store metadata for each chunk
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            print(f"Processing: {pdf_path}")

            # Extract text from the PDF
            full_text, title = extract_text_from_pdf(pdf_path)

            # Chunk the text
            chunks = list(chunk_text(full_text))
            all_chunks.extend(chunks)

            # Add metadata (file name) for each chunk
            chunk_metadata.extend([{"file_name": filename, "title": title, "page_number": page_number} for page_number, chunk in enumerate(chunks)])

    return all_chunks, chunk_metadata


# Small and fast embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Folder containing multiple PDFs
folder_path = "reports"  # Replace with your folder path

# Process all PDFs in the folder
all_chunks, chunk_metadata = process_multiple_pdfs(folder_path)

# Step 3: Create FAISS index
index, embeddings = create_faiss_index(all_chunks, embedding_model)


print("System ready! You can now ask questions.")

Processing: reports\202407_TE_advanced_biofuels_report.pdf
System ready! You can now ask questions.


In [None]:

# Load the tokenizer for your model
tokenizer = LlamaTokenizer.from_pretrained("openlm-research/open_llama_3b")

def count_tokens(input_string):
    # Tokenize the input string
    tokenized = tokenizer(input_string, truncation=False, return_tensors="pt")
    return len(tokenized["input_ids"][0])  # Return the number of tokens

def generate_answer_with_citation(query, chunks, metadata):
    # Combine chunks into context
    context = "\n\n".join(chunks)


    # Measure total tokens in context and query
    total_tokens = count_tokens(context) + count_tokens(query)
    if total_tokens > max_tokens:
        print("Truncating context ...")
        print(f"Total tokens ({total_tokens}) exceed the limit ({max_tokens}). Truncating context...")
        context = truncate_context(context, query, max_tokens)

    prompt = f"Context: {context}\n\nQuestion: {truncated_context}"

    print("Querying LLM with prompt: ", prompt)

    # Generate answer using the model
    answer = query_ollama(prompt)

    # Add citations to the answer
    citations = [f"Source: {meta['file_name']}, Title: {meta['title']}, Page: {meta['page_number']}" for meta in metadata]
    citation_text = "\n".join(citations)

    return f"{answer}\n\nCitations:\n{citation_text}"

ImportError: 
LlamaTokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [6]:
#query = input("\nEnter your question (or 'exit' to quit): ")

# Step 3: Query and retrieve relevant chunks with metadata
query = r" Is the statement true or false?  @ FuelsEurope @ @FuelsEurope - Oct 22, 2021\
\
Asstudy from @imperialcollege shows there is sufficient sustainable\
biomass feedstock available to support an ambitious\
#lowcarbonliquidfuels strategy for EU transport. Read @DG FuelsEurope\
op-ed in @POLITICOEurope today: politi.co/3E4vkoF\
\
#CleanFuelsforAll\
\
om politico.eu\
\
O18\
\
lable to support low-carbon liquid fuels in the EU\
v a |\
\
ti 120 9 470 nn"
retrieved_chunks, retrieved_metadata = retrieve_with_metadata(query, index, embedding_model, all_chunks, chunk_metadata)

# Step 4: Generate answer with citations
answer_with_citation = generate_answer_with_citation(query, retrieved_chunks, retrieved_metadata)
print(answer_with_citation)


NameError: name 'tokenizer' is not defined