In [15]:
import pdfplumber
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Tunable settings for this cell -----------------------------------------
PDF_PATH = "NU.pdf"
CHUNKS_PATH = "chunks_output.jsonl"
CHUNK_SIZE = 1000      # Length of each chunk in characters
CHUNK_OVERLAP = 200    # Overlap between chunks to preserve context
PREVIEW_COUNT = 10     # How many chunks to echo to the console

# Extract text from the PDF using pdfplumber.
# extract_text() can yield None/"" for a page with no extractable text
# (e.g. an image-only page); `or ""` keeps such a page from raising a
# TypeError during concatenation.
with pdfplumber.open(PDF_PATH) as pdf:
    page_texts = [page.extract_text() or "" for page in pdf.pages]

# Join pages with a newline so the last word of one page is not fused with the
# first word of the next; empty pages are dropped. str.join also avoids
# repeated string reallocation from `+=` in a loop.
text = "\n".join(page_text for page_text in page_texts if page_text)

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split the extracted text into chunks
chunks = text_splitter.split_text(text)

# Save one JSON object per line (JSONL). The encoding is pinned so the file is
# written the same way regardless of the platform's default locale.
with open(CHUNKS_PATH, "w", encoding="utf-8") as file:
    for idx, chunk in enumerate(chunks):
        # Record for this chunk: 1-based position, character length, and text
        metadata = {
            "chunk_number": idx + 1,
            "chunk_size": len(chunk),
            "text": chunk,
        }

        # Serialize the record and terminate it with a newline
        file.write(json.dumps(metadata) + "\n")

        # Echo the first few chunks to the console as a sanity check
        if idx < PREVIEW_COUNT:
            print(f"Chunk {idx + 1}:\n{chunk}\n")

Chunk 1:
Table of Contents
Message from the President ................................................................................................................. v
Privacy Statement ................................................................................................................................. vi
History ................................................................................................................................................ viii
National University Hymn ......................................................................................................................x
School Logo, Colors and Motto ............................................................................................................. xi
Vision, Mission and Dynamic Filipinism ............................................................................................... xii

Chunk 2:
Vision, Mission and Dynamic Filipinism .............................................

In [17]:
import json
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Function to read chunks from the JSONL file
def read_jsonl(file_path):
    """Return the chunk texts stored in a JSONL file.

    Each non-blank line of the file is parsed as a JSON object and the value
    under its "text" key is collected, in file order.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list with one "text" value per non-blank line; an empty list if the
        file has no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "text" key.
    """
    chunks = []
    # Pin the encoding so decoding does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Blank or whitespace-only lines (e.g. a trailing empty line) are
            # not records; json.loads would raise on them, so skip them.
            if not line.strip():
                continue
            metadata = json.loads(line)  # Parse one JSON record
            chunks.append(metadata["text"])  # Keep only the chunk text
    return chunks

# Pull the chunk texts back from the JSONL file written by the chunking cell
chunks = read_jsonl("chunks_output.jsonl")

# Sentence-BERT encoder, wrapped by LangChain's HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Embed the chunks and build the FAISS vector store from them in one call
vectorstore = FAISS.from_texts(chunks, embedding_model)

# Persist the index so it can be reloaded later without re-embedding
vectorstore.save_local("vector_databases.index")

# Report how many chunk texts went into the index
vector_count = len(chunks)
print(f"LangChain FAISS index with {vector_count} vectors saved successfully!")

LangChain FAISS index with 391 vectors saved successfully!


In [18]:
# Reload the persisted FAISS index.
# NOTE: allow_dangerous_deserialization=True opts in to loading pickled data
# (per the LangChain FAISS docs); only use it on an index this notebook wrote
# itself, never on a file from an untrusted source.
vectorstore = FAISS.load_local(
    "vector_databases.index",
    embedding_model,
    allow_dangerous_deserialization=True,
)

# Retrieve the five chunks most similar to the question
query = "message from the president and what is it about"
results = vectorstore.similarity_search(query, k=5)

# Show each hit, numbered from 1
for rank, hit in enumerate(results, start=1):
    print(f"[{rank}] {hit.page_content}\n")

[1] Table of Contents
Message from the President ................................................................................................................. v
Privacy Statement ................................................................................................................................. vi
History ................................................................................................................................................ viii
National University Hymn ......................................................................................................................x
School Logo, Colors and Motto ............................................................................................................. xi
Vision, Mission and Dynamic Filipinism ............................................................................................... xii

[2] the University.
Aside from the norms in this handbook, bulletin board postings, electronic ann

In [19]:
import ollama

# Ollama model tag used to generate the answer
model = "mistral:instruct"

# Concatenate the retrieved chunks, separated by blank lines, as the context
retrieved_text = "\n\n".join(doc.page_content for doc in results)

# Prompt = fixed preamble + retrieved context + the user's question
prompt = (
    "Here are some documents related to your query:\n\n"
    f"{retrieved_text}\n\n"
    f"Based on the information above, answer the following question: {query}"
)

# Single-turn chat: the whole prompt is sent as one user message
messages = [{"role": "user", "content": prompt}]
response = ollama.chat(model=model, messages=messages)

# Print only the generated text from the reply
print(response['message']['content'])

 The message from the President is an introduction to the 2022 Student Handbook. It is a statement by RENATO CARLOS H. ERMITA, Jr., PhD, President/CEO of the National University. The message provides information about the handbook, its purpose, and its effective date. It also mentions that the continued attendance of any student at the National University subjects them to this authority and emphasizes the importance of familiarizing oneself with the contents of the handbook as members of the National University community.
