In [1]:
!pip install hf_xet
!pip install transformers sentence-transformers faiss-cpu numpy torch
!pip install PyPDF2


Collecting transformers
  Using cached transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting torch
  Using cached torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)
  Using cached nvidia_cuda_runtime

In [18]:
import PyPDF2
import os

def extract_text_from_pdf(pdf_path):
    """Extract text from a single PDF file."""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + " "
            return text.strip()
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

def load_pdfs_from_directory(directory, chunk_size=1000):
    """Load text from all PDFs in a directory."""
    documents = []
    for filename in os.listdir(directory):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(directory, filename)
            text = extract_text_from_pdf(pdf_path)
            if text:
                # Split long text into smaller chunks (optional, for better retrieval)
                chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]  # Split into ~1000 char chunks
                documents.extend(chunks)
    return documents


In [34]:
cwd = os.getcwd()
pdf_directory = cwd+"/Database"

documents = load_pdfs_from_directory(pdf_directory, chunk_size=500)
print(len(documents))

216


In [35]:
# Sample documents
# documents = [
#     "The capital of France is Paris. It is known for the Eiffel Tower and rich cultural history.",
#     "Python is a versatile programming language used for web development, data science, and AI.",
#     "The theory of relativity was developed by Albert Einstein in the early 20th century.",
#     "Machine learning is a subset of artificial intelligence that focuses on building systems that learn from data."
# ]

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import torch

# Load a pre-trained sentence transformer model
retriever_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda' if torch.cuda.is_available() else 'cpu')
# Encode the documents
document_embeddings = retriever_model.encode(documents, convert_to_numpy=True)

# Create a FAISS index for similarity search
dimension = document_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity
index.add(document_embeddings)  # Add document embeddings to the index

# Function to retrieve top-k relevant documents
def retrieve_documents(query, k=5):
    query_embedding = retriever_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)
    return [documents[idx] for idx in indices[0]]

In [36]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
from huggingface_hub import login

# Check if GPU is available
device = 0 if torch.cuda.is_available() else -1
print(torch.cuda.is_available())  # Should print: True

# Load a generative model
# model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     load_in_4bit=True,  # Quantization for GPU
#     device_map="auto"   # Auto-map to GPU/CPU
# )
# generator = pipeline('text-generation', model=model,tokenizer=tokenizer)

generator = pipeline('text2text-generation', model='google/flan-t5-large', device=device)
# Function to generate an answer
def generate_answer(query, retrieved_docs):
    # Combine retrieved documents into a context
    context = " ".join(retrieved_docs)
    prompt = f"Question: {query}\nContext: {context}\nAnswer:"
    response = generator(prompt, max_length=500, num_return_sequences=1)
    return response, response[0]['generated_text'].replace("Answer:", "").strip()

True


Device set to use cuda:0


In [37]:
def rag_application(query, k=2):
    # Step 1: Retrieve relevant documents
    retrieved_docs = retrieve_documents(query, k)
    print("Retrieved Documents:", retrieved_docs)
    
    # Step 2: Generate answer using retrieved documents
    response, answer = generate_answer(query, retrieved_docs)
    return response, answer

In [38]:
query = "What does human driving data provide?"
response,answer = rag_application(query)
print("Query:", query)
print("Response:", response)
print("Answer:", answer)



Both `max_new_tokens` (=256) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Retrieved Documents: [' \n• The applicable safety performance metrics.  \n• The data that can be leveraged to define safety performance reference values.  \nThis best practice outl ines a process for leveraging human driving data to establish safety performance targets for \nADS-DV behaviors. The targets within the specific use -case , exemplified in this best practice , are based on \nnaturalistic driving data from manually driven vehicles , in the hope of aiding understanding from a broad audience \nof stakeholders.  \nSev', 'tent with public expectations. People are already accustomed \nto the risks associated with human -driven vehicles, and they expect  ADS-DVs to outperform average human drivers.  \nThroughout the docume nt, we leverage an example use -case associated with a specific behavioral competency  \nand analyze human drivers ’ behaviors through the usage of NDS. As mentioned previously , the outlined process  \ncan be generalized and abstracted to remain applicable to ot