In [14]:
import pdfplumber
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Extract the text of every page of the PDF with pdfplumber.
# - `extract_text()` can yield None for a page with no extractable text
#   (e.g. image-only pages on some pdfplumber versions); `or ""` keeps such
#   a page from raising a TypeError during concatenation.
# - Pages are joined with a newline so the last word of one page is not
#   fused with the first word of the next.
with pdfplumber.open("NU.pdf") as pdf:
    text = "\n".join(page.extract_text() or "" for page in pdf.pages)

# Configure the recursive splitter: chunk_size is the chunk length in
# characters, chunk_overlap is how many characters adjacent chunks share
# so context is preserved across a boundary.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

# Break the extracted document text into a list of chunk strings.
chunks = text_splitter.split_text(text)

# Persist the chunks as JSONL: one JSON object per line holding the chunk's
# 1-based number, its length in characters, and its text.
with open("chunks_output.jsonl", "w") as file:
    for chunk_number, chunk in enumerate(chunks, start=1):
        record = {
            "chunk_number": chunk_number,
            "chunk_size": len(chunk),
            "text": chunk,
        }

        # One serialized record per line.
        file.write(json.dumps(record) + "\n")

        # Echo only the first 10 chunks to the console as a sanity check.
        if chunk_number <= 10:
            print(f"Chunk {chunk_number}:\n{chunk}\n")

Chunk 1:
Table of Contents
Message from the President ................................................................................................................. v
Privacy Statement ................................................................................................................................. vi
History ................................................................................................................................................ viii
National University Hymn ......................................................................................................................x
School Logo, Colors and Motto ............................................................................................................. xi
Vision, Mission and Dynamic Filipinism ............................................................................................... xii

Chunk 2:
Vision, Mission and Dynamic Filipinism .............................................

In [15]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Function to read chunks from the JSONL file
def read_jsonl(file_path):
    """Return the "text" field of every record in a JSONL file.

    Each non-blank line of the file is parsed as one JSON object and its
    "text" value is collected, in file order. Blank or whitespace-only
    lines (e.g. a trailing empty line) are skipped instead of being fed to
    the JSON parser.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        list: The "text" values, one per record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "text" key.
    """
    chunks = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines rather than crash in json.loads
            metadata = json.loads(line)  # one JSON object per line
            chunks.append(metadata["text"])  # keep only the chunk text
    return chunks

# Read chunks from the JSONL file
chunks = read_jsonl("chunks_output.jsonl")

# Fail early with a clear message: with no chunks there is no embedding
# dimension to size the index with.
if not chunks:
    raise ValueError("chunks_output.jsonl contains no chunks; nothing to index")

# Initialize the Sentence-BERT model (or other embedding model)
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can use any other Sentence-BERT model

# Embed the chunks into vectors (one embedding per chunk)
embeddings = model.encode(chunks)

# FAISS expects a C-contiguous float32 matrix; make that explicit instead of
# relying on whatever dtype/layout the encoder happens to return.
embeddings_np = np.ascontiguousarray(embeddings, dtype=np.float32)

# Initialize a flat (exact-search) FAISS index using L2 distance, sized to
# the embedding dimension
index = faiss.IndexFlatL2(embeddings_np.shape[1])

# Add the embeddings to the FAISS index (this creates the vector database)
index.add(embeddings_np)

# Save the FAISS index to a file for later use
faiss.write_index(index, "vector_database.index")

# Report the number of vectors actually stored in the index
print(f"FAISS index with {index.ntotal} vectors saved successfully!")

FAISS index with 391 vectors saved successfully!


In [20]:
import ollama

# Name of the Ollama model to query.
# NOTE(review): this rebinds the notebook-level name `model`, which the
# embedding cell above assigns to a SentenceTransformer instance; any cell
# run afterwards that expects the embedder under `model` will see this string.
model = "mistral:instruct"

# Single-turn chat: send one user message and wait for the reply.
response = ollama.chat(
    model=model,
    messages=[{"role": "user", "content": "What is the capital of France?"}],
)

# Show only the assistant's reply text (not the whole response object).
print(response["message"]["content"])

 The capital of France is Paris. It's one of the most famous cities in the world, known for its iconic landmarks like the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral. Paris has been a major center of art, science, politics, and culture since the 17th century.
