## Overlaping Acts Offline Models

In [1]:
# Import required libraries
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np


  from tqdm.autonotebook import tqdm, trange


### Step 1: Extract text from the Acts
Extracting and combining the texts in a single file

In [17]:
from PyPDF2 import PdfReader
import os

def extract_text_from_pdfs_to_single_file(folder_path, output_file):
    """
    Extracts text from all PDFs in a folder and saves to a single output file.
    Skips files that cannot be processed due to errors.

    Args:
        folder_path (str): Path to the folder containing PDFs.
        output_file (str): Path to save the combined text output.
    """
    skipped_files = []  # To log files that were skipped due to errors
    with open(output_file, "w", encoding="utf-8") as output:
        for filename in os.listdir(folder_path):
            if filename.endswith(".pdf"):
                file_path = os.path.join(folder_path, filename)
                try:
                    # Try reading the PDF
                    reader = PdfReader(file_path)
                    text = "".join(page.extract_text() for page in reader.pages if page.extract_text())
                    # Write extracted text to the output file
                    output.write(f"--- Start of {filename} ---\n")
                    output.write(text)
                    output.write(f"\n--- End of {filename} ---\n")
                    print(f"Processed: {filename}")
                except Exception as e:
                    # Catch all exceptions and skip the file
                    skipped_files.append((filename, str(e)))
                    print(f"Skipped {filename} due to error: {e}")

    # Log skipped files
    if skipped_files:
        print("\nSkipped files:")
        for file, error in skipped_files:
            print(f"- {file}: {error}")

# Usage
folder_path = os.getcwd()  # Use the current working directory
output_file = os.path.join(folder_path, "combined_extracted_texts.txt")
extract_text_from_pdfs_to_single_file(folder_path, output_file)


Processed: Biosafety Act.pdf
Processed: Climate Change (Public Participation and Access to Climate Change Information) Regulations 2023.pdf
Processed: Climate Change Act.pdf
Processed: Community Land Act.pdf
Processed: Declaration of Construction Minerals.pdf
Processed: Declaration of Environmentally Significant Area (1).pdf
Processed: Declaration of Environmentally Significant Area.pdf
Skipped Declaration of Forest Area.pdf due to error: EOF marker not found
Skipped Declaration of Forest AreaGOT Achama Forest.pdf due to error: EOF marker not found
Skipped Declaration of Forest Areas.pdf due to error: EOF marker not found
Skipped Declaration of Forest AreaWote Tree Nursery and Arboretum.pdf due to error: Invalid Elementary Object starting with b'\xa6' @105761: b'\xf8E\xd6|\xf6Y\x828\xdf\x9d}#\xdb\xfd\xed\x85\nX\xdfL\xa6\n\x87Y\xb5\x0e\xef\x0f1\xca+\xc9\xff[\tR\xdc\x93L\xd6\xe1\xac\xc2\x03\x01\xfaVOe1\x8a\xa8<Q\x1d\xb2\xfe0045\x9aQu\xe5\x9b\xbb?L\xecV\xd5}\xfe\xbdVbbs\xda'
Processed: De

In [15]:
import os
from PyPDF2 import PdfReader

def extract_text_from_pdfs_to_single_file(folder_path, output_file):
    """
    Extracts text from all .pdf files in the given folder and saves the combined content to a single .txt file.
    
    Args:
        folder_path (str): Path to the folder containing PDF files.
        output_file (str): Path to the single output .txt file.
    """
    combined_text = ""
    
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            file_path = os.path.join(folder_path, filename)
            reader = PdfReader(file_path)
            text = ""
            
            # Extract text from each page of the PDF
            for page in reader.pages:
                text += page.extract_text()
            
            # Add the filename and its content to the combined text
            combined_text += f"\n{'='*80}\n{filename}\n{'='*80}\n"
            combined_text += text
    
    # Save the combined content to the output file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(combined_text)
    
    print(f"All extracted content has been saved to: {output_file}")

# Set the folder paths
root_folder = os.getcwd()  # Set to current working directory or specify another folder path
output_file = os.path.join(root_folder, "combined_extracted_texts.txt")

# Extract and save all PDF content to a single file
extract_text_from_pdfs_to_single_file(root_folder, output_file)



PdfReadError: EOF marker not found

### Step 2: Split Text into Chunks

Split the text into manageable chunks to enable efficient retrieval.

In [2]:
from langchain.text_splitter import CharacterTextSplitter

def split_text_into_chunks(file_path):
    """
    Splits the combined text into chunks for vectorization.
    
    Args:
        file_path (str): Path to the combined text file.
        
    Returns:
        list: List of text chunks.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    return text_splitter.split_text(text)

# Usage
file_path = "combined_extracted_texts.txt"
text_chunks = split_text_into_chunks(file_path)


Created a chunk of size 2165, which is longer than the specified 1000
Created a chunk of size 1394, which is longer than the specified 1000
Created a chunk of size 2239, which is longer than the specified 1000
Created a chunk of size 1013, which is longer than the specified 1000
Created a chunk of size 1233, which is longer than the specified 1000
Created a chunk of size 1449, which is longer than the specified 1000
Created a chunk of size 1131, which is longer than the specified 1000
Created a chunk of size 1151, which is longer than the specified 1000
Created a chunk of size 1292, which is longer than the specified 1000
Created a chunk of size 1091, which is longer than the specified 1000
Created a chunk of size 1491, which is longer than the specified 1000
Created a chunk of size 1786, which is longer than the specified 1000
Created a chunk of size 1313, which is longer than the specified 1000
Created a chunk of size 1079, which is longer than the specified 1000
Created a chunk of s

### Step 3: Generate Embeddings and Build FAISS Index

Generate embeddings locally using SentenceTransformer and index them using FAISS.

In [3]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

def build_faiss_index(text_chunks):
    """
    Builds a FAISS index from text chunks.
    
    Args:
        text_chunks (list): List of text chunks.
        
    Returns:
        tuple: FAISS index and the list of text chunks.
    """
    # Load pre-trained Sentence Transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient local model
    
    # Generate embeddings for text chunks
    embeddings = model.encode(text_chunks, show_progress_bar=True)
    
    # Create FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    
    return index, text_chunks, model

# Usage
faiss_index, indexed_chunks, transformer_model = build_faiss_index(text_chunks)


Batches: 100%|██████████| 314/314 [10:23<00:00,  1.99s/it]


### Step 4: Implement the Query Function
Use FAISS to retrieve relevant chunks based on user queries.

In [5]:
def retrieve_responses(query, index, chunks, model, top_k=3):
    """
    Retrieves the top-k relevant chunks based on the user's query.
    
    Args:
        query (str): User's query.
        index (faiss.Index): FAISS index containing embeddings.
        chunks (list): List of text chunks.
        model (SentenceTransformer): Sentence Transformer model for query embedding.
        top_k (int): Number of top results to retrieve.
        
    Returns:
        list: List of top-k relevant text chunks.
    """
    # Generate embedding for the query
    query_embedding = model.encode([query])
    
    # Search the FAISS index
    distances, indices = index.search(query_embedding, top_k)
    
    # Retrieve the corresponding chunks
    results = [chunks[i] for i in indices[0]]
    return results

# Usage
while True:
    user_query = input("Enter your question (type 'exit' to quit): ")
    if user_query.lower() in ['exit', 'quit']:
        print("Exiting the agent. Goodbye!")
        break
    
    responses = retrieve_responses(user_query, faiss_index, indexed_chunks, transformer_model)
    print("\nTop Responses:\n")
    for response in responses:
        print(f"- {response}\n")



Top Responses:

- Part I – PRELIMINARY
1.Citation
These Regulations may be cited as the Biosafety (Contained Use) Regulations.
2.Interpretation
In these Regulations unless the context otherwise requires—
"accident" means any incident involving a signiﬁcant and unintended release of genetically modiﬁed
organisms in the course of their contained use which could present an immediate or delayed hazard to
human health and the environment;
"applicant" means a person making an application under these Regulations;
"Authority" means the National Biosafety Authority established under section 5 of the Act;
"Biosafety Clearing-House" means a mechanism for exchange of scientiﬁc, technical, environmental,
socio-economic and legal information and experience with genetically modiﬁed organism;
"conﬁned ﬁeld trial" means any activity undertaken within a ﬁeld and which involves genetically modiﬁed
organisms which are controlled by speciﬁc measures to ensure safety for humans and for the environment;

- 

In [6]:
from sentence_transformers import SentenceTransformer
import faiss
import os

def offline_conversational_agent(output_file):
    # Read the combined text
    with open(output_file, "r", encoding="utf-8") as f:
        text = f.read()

    # Split text into chunks
    from langchain.text_splitter import CharacterTextSplitter
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    text_chunks = text_splitter.split_text(text)

    # Generate embeddings locally
    model = SentenceTransformer('all-MiniLM-L6-v2')  # Download model locally
    embeddings = model.encode(text_chunks, show_progress_bar=True)

    # Create FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Create a retriever function
    def retrieve(query):
        query_embedding = model.encode([query])
        distances, indices = index.search(query_embedding, k=3)
        print("Top matches:")
        for i in indices[0]:
            print(f"- {text_chunks[i]}")

    # Start conversation
    print("Ask your question:")
    while True:
        question = input("> ")
        if question.lower() in ["exit", "quit"]:
            print("Exiting the conversation. Goodbye!")
            break
        retrieve(question)

# Run Offline Agent
output_file = os.path.join(os.getcwd(), "combined_extracted_texts.txt")
offline_conversational_agent(output_file)


Created a chunk of size 2165, which is longer than the specified 1000
Created a chunk of size 1394, which is longer than the specified 1000
Created a chunk of size 2239, which is longer than the specified 1000
Created a chunk of size 1013, which is longer than the specified 1000
Created a chunk of size 1233, which is longer than the specified 1000
Created a chunk of size 1449, which is longer than the specified 1000
Created a chunk of size 1131, which is longer than the specified 1000
Created a chunk of size 1151, which is longer than the specified 1000
Created a chunk of size 1292, which is longer than the specified 1000
Created a chunk of size 1091, which is longer than the specified 1000
Created a chunk of size 1491, which is longer than the specified 1000
Created a chunk of size 1786, which is longer than the specified 1000
Created a chunk of size 1313, which is longer than the specified 1000
Created a chunk of size 1079, which is longer than the specified 1000
Created a chunk of s

Ask your question:
Top matches:
- "11. (1) In the performance of their functions, a fisheries inspector shall have all the powers conferred on an authorized officer under the Act. Powers of fisheries inspectors. (2) Without prejudice to sub-regulation (1), a fisheries inspector may—  (a)!enter and search, at any reasonable hour, any premises wherein fish, fish products or fish feed are likely to be produced, handled, processed, packaged and stored or kept, to determine the existence, nature and extent of any trade or business in fish, fish products or fish feed;  (b)!examine any fish, fish product, aquaculture product or fish feed in any fishery enterprise;   (c)!take samples of any fish, fish product or fish feed in the course of undertaking inspections under these regulation;  (d)!examine anything used or capable of being used for the preparation of any fish, fish product or fish feed in any fishery enterprise;  (e)!stop, search or detain any carrier likely to contain any fish, fish 