In [1]:
import faiss
import numpy as np
import torch
import re
import os
from transformers import pipeline
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
import openai

In [2]:
# Execute the companion notebook before reading the key from the environment.
# NOTE(review): keys.ipynb is not visible here; presumably it populates os.environ.
# Its recorded output lists 'OPENAI_KEY', while the line below reads
# 'OPENAI_API_KEY' — confirm the two names are meant to differ.
%run keys.ipynb
# Indexing os.environ directly raises KeyError if the variable is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

Keys available:  ['OPENAI_KEY']


In [3]:
# Read the whole scraped dump into memory as UTF-8 text
file_path = "scraped_data.txt"
with open(file_path, encoding="utf-8") as f:
    raw_text = f.read()

# Records in the dump are separated by a blank line
entries = raw_text.split("\n\n")

def clean_text_content(text):
    """Strip page-navigation artifacts, bracketed notes, headings and URLs from text.

    Args:
        text: Raw text content of one scraped entry.

    Returns:
        The text with every listed pattern removed, runs of whitespace
        collapsed to a single space, and leading/trailing whitespace stripped.
    """
    # Patterns are applied in order, so the combined navigation label is
    # removed before its individual pieces are considered.
    patterns_to_remove = [
        r"Up\^?Add To My Favorites",  # "Up" fused with the favorites link
        r"Add To My Favorites",
        # Only a standalone "Up" is treated as a navigation artifact; a bare
        # r"Up" would also eat the start of words such as "Upon" or "Update".
        r"\bUp\b",
        r"\[.*?\]",  # bracketed annotations, e.g. section ranges
        r"Code Text.*?:",
        r"DIVISION\s*\d+.*?CHAPTER\s*\d+",
        r"https?://\S+",
    ]
    for pattern in patterns_to_remove:
        text = re.sub(pattern, "", text)

    # Collapse the gaps left behind by the removals
    text = re.sub(r"\s+", " ", text).strip()
    return text

def extract_entry_details(entry):
    """Pull the URL, code heading and cleaned text out of one raw entry.

    Each field is located with its own regular expression; a field whose
    pattern does not match is reported as None.

    Returns:
        A dict with the keys "url", "code" and "text".
    """
    url_match = re.search(r"URL:\s*(https?://\S+)", entry)
    code_match = re.search(r"DIVISION\s*(\d+).*(CHAPTER\s*\d+)", entry)
    text_match = re.search(r"Text Content:\s*(.*)", entry)

    details = {"url": None, "code": None, "text": None}
    if url_match:
        details["url"] = url_match.group(1)
    if code_match:
        # The whole matched span (DIVISION ... CHAPTER n) is kept, not a sub-group
        details["code"] = code_match.group(0)
    if text_match:
        details["text"] = clean_text_content(text_match.group(1))
    return details

def is_index_like(text):
    """Heuristically decide whether text looks like a table of contents.

    The text is flagged when any one of these holds:
      * structural keywords (CHAPTER, PART, ARTICLE, DIVISION) occur more
        than 5 times in total,
      * more than 5 "digits followed by a period" enumerations occur,
      * more than half of the characters are non-alphabetic.

    Returns:
        True if the text is considered index-like, otherwise False.
    """
    keyword_hits = 0
    for keyword in ("CHAPTER", "PART", "ARTICLE", "DIVISION"):
        keyword_hits += text.count(keyword)

    numbered_items = len(re.findall(r"\d+\.", text))

    non_letters = len([ch for ch in text if not ch.isalpha()])
    # max(1, ...) keeps the ratio defined for an empty string
    non_alpha_ratio = non_letters / max(1, len(text))

    return keyword_hits > 5 or numbered_items > 5 or non_alpha_ratio > 0.5

# Parse every raw entry, then keep only those that produced non-empty text
documents = list(map(extract_entry_details, entries))
documents = [doc for doc in documents if doc["text"]]

# Discard documents that look like tables of contents
filtered_documents = [doc for doc in documents if not is_index_like(doc["text"])]

# Column-oriented Hugging Face Dataset built from the surviving documents
corpus = Dataset.from_dict(
    {
        field: [doc[field] for doc in filtered_documents]
        for field in ("url", "code", "text")
    }
)

# Spot-check a single record (row 100)
for label, field in (("URL", "url"), ("Code", "code"), ("Text", "text")):
    print(f"{label}: {corpus[field][100]}")

URL: https://leginfo.legislature.ca.gov/faces/codes_displayText.xhtml?lawCode=CIV&division=4.&title=&part=5.&chapter=10.&article=3.
Code: DIVISION 4. GENERAL PROVISIONS [3274 - 9566]( Heading of Division 4 amended by Stats. 1988, Ch. 160, Sec. 16. )PART 5. Common Interest Developments [4000 - 6150]( Part 5 added by Stats. 2012, Ch. 180, Sec. 2. )CHAPTER 10
Text: (a) “Alternative dispute resolution” means mediation, arbitration, conciliation, or other nonjudicial procedure that involves a neutral party in the decisionmaking process. The form of alternative dispute resolution chosen pursuant to this article may be binding or nonbinding, with the voluntary consent of the parties.(b) “Enforcement action” means a civil action or proceeding, other than a cross-complaint, for any of the following purposes:(1) Enforcement of this act.(2) Enforcement of the Nonprofit Mutual Benefit Corporation Law (Part 3 (commencing with Section 7110) of Division 2 of Title 1 of the Corporations Code).(3) Enfo

In [4]:
# Checkpoint used to embed both corpus documents and queries
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the encoder weights, then place the module on the GPU
model = AutoModel.from_pretrained(model_name)
model = model.to("cuda")

def encode(texts, batch_size=8):
    """Embed texts with the module-level `tokenizer` / `model` using masked mean pooling.

    Args:
        texts: Sequence of strings to embed (must support len() and slicing).
        batch_size: Number of texts tokenized and run through the model at once.

    Returns:
        A NumPy array with one row per input text. For empty input an array
        of shape (0, hidden_size) is returned instead of raising.
    """
    # Follow the model's own device rather than assuming CUDA is present.
    device = model.device

    if len(texts) == 0:
        # np.vstack([]) raises, so short-circuit with a correctly shaped empty array.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            token_embeddings = model(**inputs).last_hidden_state
            # Average only over real tokens: a plain .mean(dim=1) also averages the
            # padding positions, which makes a text's embedding depend on how long
            # the other texts in its batch happen to be.
            mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
            summed = (token_embeddings * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
            embeddings = summed / counts
        all_embeddings.append(embeddings.cpu().numpy())  # FAISS consumes NumPy arrays
    return np.vstack(all_embeddings)

# Embed every document in the corpus
corpus_embeddings = encode(corpus["text"])

# Build an exact L2 index on the CPU, clone it onto GPU 0, then load the vectors
res = faiss.StandardGpuResources()
embedding_dim = corpus_embeddings.shape[1]
index_flat = faiss.IndexFlatL2(embedding_dim)
index = faiss.index_cpu_to_gpu(res, 0, index_flat)
index.add(corpus_embeddings)



In [5]:
def summarize_text(text, user_context=None, max_summary_tokens=1024):
    """Ask the OpenAI chat model for a summary of `text`.

    Args:
        text: The passage to summarize.
        user_context: Optional description of the reader; when truthy it is
            placed ahead of the summarization request in the prompt.
        max_summary_tokens: Upper bound passed as `max_tokens` to the API.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    request = f"Summarize the following text:\n\n{text}"
    if user_context:
        request = f"Considering the following user context:\n{user_context}\n\n{request}"

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": request},
    ]

    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=chat_messages,
        max_tokens=max_summary_tokens,
        temperature=0.5,
        top_p=1.0,
        n=1,  # exactly one completion is requested, so index 0 below is the only choice
    )

    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()

In [6]:
def retrieve(query, top_k=3):
    """Return the corpus documents nearest to `query` in embedding space.

    Args:
        query: Free-text query string.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A list of dicts with the keys "url", "code" and "text", ordered as
        returned by the index. The list may hold fewer than `top_k` items
        when the index cannot supply that many neighbours.
    """
    query_embedding = encode([query])
    _, indices = index.search(query_embedding, top_k)

    results = []
    for i in indices[0]:
        row_id = int(i)
        # FAISS pads missing neighbours with -1; used as a Python index that
        # would silently return the last corpus row, so skip it instead.
        if row_id < 0:
            continue
        # Fetch the row once rather than materialising each full column per hit.
        row = corpus[row_id]
        results.append({
            "url": row["url"],
            "code": row["code"],
            "text": row["text"],
        })

    return results

In [7]:
# Seq2seq checkpoint used by generate_response
gen_model_name = "google/flan-t5-large"
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name)

# Load the weights, then place the module on the GPU
gen_model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name)
gen_model = gen_model.to("cuda")

In [8]:
def generate_response(query, documents, max_length=500, temperature=0.3, top_p=0.9):
    """Generate an answer to `query` from the retrieved documents with the seq2seq model.

    Args:
        query: The user's question.
        documents: Iterable of dicts, each with a "text" entry used as context.
        max_length: `max_length` passed to `generate`.
        temperature: Sampling temperature. A value of None or <= 0 selects
            deterministic (non-sampling) decoding.
        top_p: Nucleus-sampling threshold, used only when sampling is enabled.

    Returns:
        The decoded text of the first generated sequence.
    """
    context = " ".join(doc["text"] for doc in documents)
    input_text = f"Query: {query}\nContext: {context}\nAnswer with details:"
    # Follow the model's own device rather than assuming CUDA is present.
    inputs = gen_tokenizer(input_text, return_tensors="pt", truncation=True).to(gen_model.device)

    generation_kwargs = {"max_length": max_length}
    if temperature is not None and temperature > 0:
        # temperature / top_p only take effect when sampling is switched on;
        # without do_sample=True they are ignored and decoding is greedy.
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = gen_model.generate(**inputs, **generation_kwargs)
    return gen_tokenizer.decode(outputs[0], skip_special_tokens=True)

In [9]:
def rag_pipeline(query, min_required_docs=1, user_context=None):
    """Retrieve documents for `query` and summarize each one.

    Args:
        query: The user's question.
        min_required_docs: Minimum number of retrieved documents needed to
            proceed; below this a single fallback message is returned.
        user_context: Optional description of the reader, forwarded to
            `summarize_text` so summaries can be tailored. Defaults to None,
            which leaves the summarization prompt unchanged.

    Returns:
        Either a list of dicts with the keys "url", "code" and "summary"
        (one per retrieved document), or a one-element list holding a dict
        with the key "response" when too few documents were retrieved.
    """
    print("Starting RAG pipeline...")

    # Retrieve relevant documents
    documents = retrieve(query)
    print(f"Retrieved {len(documents)} documents.")

    # Bail out with a fallback message when retrieval came back too thin
    if len(documents) < min_required_docs:
        print("Not enough relevant documents found.")
        return [{"response": "Sorry, I don't have the ability to answer that question based on the available information."}]

    # One summarization call per document, keeping its source metadata alongside
    summaries = [
        {
            "url": doc["url"],
            "code": doc["code"],
            "summary": summarize_text(doc["text"], user_context=user_context),
        }
        for doc in documents
    ]

    print("Generated summaries.")
    return summaries

In [10]:
# Inputs for the demo run
query = "what is the law around restrooms in restaurants?"
user_context = "A layman unfamiliar with California law looking for legal advice."

# Run the pipeline on the example query
summaries = rag_pipeline(query)

# A leading dict with a 'response' key is the pipeline's fallback message;
# anything else is a list of per-document summaries.
first_result = summaries[0]
if not (isinstance(first_result, dict) and 'response' in first_result):
    for summary in summaries:
        print(
            f"URL: {summary['url']}",
            f"Code: {summary['code']}",
            f"Summary: {summary['summary']}",
            "-" * 80,  # Separator for clarity
            sep="\n",
        )
else:
    print(first_result['response'])

Starting RAG pipeline...
Retrieved 3 documents.
Generated summaries.
URL: https://leginfo.legislature.ca.gov/faces/codes_displayText.xhtml?lawCode=HSC&division=104.&title=&part=15.&chapter=2.&article=5.
Code: DIVISION 104. ENVIRONMENTAL HEALTH [106500 - 119406]( Division 104 added by Stats. 1995, Ch. 415, Sec. 6. )PART 15. MISCELLANEOUS REQUIREMENTS [118375 - 119406]( Part 15 added by Stats. 1995, Ch. 415, Sec. 6. )CHAPTER 2
Summary: The Health and Safety Code (HSC) requires all single-user toilet facilities in any business, public place, or state or local government agency to be identified as all-gender facilities. The signage must comply with Title 24 of the California Code of Regulations, and the facility should be designated for use by one occupant at a time or for family or assisted use. During inspections, officials may check for compliance with this rule. The term "single-user toilet facility" refers to a toilet facility with no more than one occupant.
--------------------------