In [None]:
'''Get Vector Data Base and Dependencies'''
# Notebook shell magics: install the third-party packages for this notebook.
# `-U` upgrades langchain-community if it is already installed; the other two
# are installed only if missing.
# NOTE(review): langchain-community is not imported in the cells visible
# here — confirm it is still needed.
!pip install -U langchain-community
!pip install sentence-transformers
!pip install chromadb


tyes


In [None]:
'''Set Up the document Ingestion and Vectorization process'''
import re
import pandas as pd

def load_text(file_path: str) -> str:
    """Read a plain-text (.txt) file and return its entire contents.

    The file is decoded explicitly as UTF-8 instead of with the platform's
    default encoding, so non-ASCII characters (for example the '•' bullets
    that ``clean_text`` removes) are read the same way on every machine.
    The ``utf-8-sig`` codec additionally drops a leading byte-order mark if
    one is present, which would otherwise end up as an invisible first
    character and prevent a heading on the first line from being recognised.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return f.read()

def clean_text(text: str) -> str:
    """Normalise raw document text before it is split into sections.

    Steps, in order: drop every '•' bullet character, turn Windows line
    endings into '\\n', squeeze runs of three or more newlines down to a
    single blank line, and trim leading/trailing whitespace.
    """
    # (regex, replacement) pairs applied one after another; order matters
    # because blank-line squeezing relies on line endings already being '\n'.
    substitutions = (
        (r'\r\n', '\n'),
        (r'\n{3,}', '\n\n'),
    )

    cleaned = text.replace('•', '')
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def split_by_heading(text: str):
    """
    Split text into sections based on numbered headings.

    A line is treated as a heading when it starts with a section number
    followed by whitespace: '1 Scope', '2.1 Detail', and also the forms with
    a trailing dot such as '1. Scope' or '2.1. Detail'. Lines that appear
    before the first heading are collected under the heading 'Introduction'.
    The heading line itself is not part of the section text.

    Args:
        text: The (already cleaned) document text, with '\\n' line endings.

    Returns:
        A list of (heading, section_text) tuples in document order. A
        heading that is immediately followed by another heading (no lines in
        between) produces no entry.
    """
    # The optional '\.?' accepts the trailing dot of headings like
    # '1. Scope'; without it only dot-less numbers ('1 Scope', '2.1 Detail')
    # would be recognised.
    heading_pattern = re.compile(r'^\d+(\.\d+)*\.?\s')

    sections = []
    current_heading = 'Introduction'
    current_lines = []

    for line in text.split('\n'):
        if heading_pattern.match(line):
            # A new heading closes the section collected so far.
            if current_lines:
                sections.append((current_heading, '\n'.join(current_lines).strip()))
            current_heading = line.strip()
            current_lines = []
        else:
            current_lines.append(line)

    # Flush the final section, which has no following heading to close it.
    if current_lines:
        sections.append((current_heading, '\n'.join(current_lines).strip()))

    return sections

def chunk_section(heading: str, content: str, chunk_size: int = 500, overlap: int = 100):
    """
    Chunk a section into overlapping word-based chunks.

    Words are obtained by whitespace splitting. Consecutive chunks start
    ``chunk_size - overlap`` words apart, so each chunk repeats the last
    ``overlap`` words of the previous one. Chunking stops as soon as a chunk
    reaches the final word, so no trailing chunk is emitted that would be
    entirely contained in the one before it.

    Args:
        heading: Section heading stored with every chunk.
        content: Section text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between neighbouring chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of dicts with the keys 'heading', 'chunk_id' (0-based index
        within this section), 'word_count' and 'text'. Empty or
        whitespace-only content yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = content.split()
    step = chunk_size - overlap
    chunks = []

    for chunk_id, start in enumerate(range(0, len(words), step)):
        end = start + chunk_size
        chunk_words = words[start:end]
        chunks.append({
            'heading': heading,
            'chunk_id': chunk_id,
            'word_count': len(chunk_words),
            'text': ' '.join(chunk_words)
        })
        # This chunk already covers the last word; any further window would
        # only repeat words that are in this chunk.
        if end >= len(words):
            break

    return chunks

def preprocess_file(file_path: str, chunk_size: int = 500, overlap: int = 100) -> pd.DataFrame:
    """
    Run the whole preprocessing pipeline on one text file.

    The file is loaded, cleaned, split into headed sections, and every
    section is cut into overlapping word chunks.

    Args:
        file_path: Path of the text file to process.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by neighbouring chunks.

    Returns:
        A pandas DataFrame with one row per chunk, built from the dicts
        produced by ``chunk_section``.
    """
    sections = split_by_heading(clean_text(load_text(file_path)))

    # Flatten the per-section chunk lists into one list of row dicts.
    rows = [
        chunk
        for section_heading, section_body in sections
        for chunk in chunk_section(section_heading, section_body, chunk_size, overlap)
    ]

    return pd.DataFrame(rows)

# Example usage:
# df_chunks = preprocess_file('path/to/your/document.txt', chunk_size=200, overlap=50)
# df_chunks.to_csv('preprocessed_chunks.csv', index=False)

In [None]:
# Run the preprocessing pipeline on the sample document using 200-word chunks
# with a 50-word overlap, then save the chunk table (without the DataFrame
# index) as 'preprocessed_chunks.csv' in the current working directory.
# NOTE(review): the input path looks like a mounted Google Drive folder —
# confirm the drive is mounted before running this cell.
df_chunks = preprocess_file('/content/drive/MyDrive/TESTTST/sample.txt', chunk_size=200, overlap=50)
df_chunks.to_csv('preprocessed_chunks.csv', index=False)

In [None]:
'''Set up Vector DB'''
import chromadb
# Create a Chroma client and make sure the "reguai_compliance" collection
# exists, so that the delete below has something to remove.
client = chromadb.Client()
collection = client.get_or_create_collection("reguai_compliance")
# Delete the collection again right away.
# NOTE(review): this looks like a "start from a clean slate" reset — confirm.
# After this call `collection` still points at the collection that was just
# deleted; the embedding cell further down rebinds both `client` (to a
# PersistentClient) and `collection`. Presumably that persistent store is
# separate from this client, so this reset may not clear it — verify.
client.delete_collection("reguai_compliance")
# Collections that remain after the delete; `names` is not used again in the
# cells visible here.
names = client.list_collections()

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb

# 1. Assume df_chunks is your preprocessed DataFrame
#    with columns: 'text', 'heading', 'chunk_id'

# 2. Initialize embedder
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# 3. Convert to lists
texts     = df_chunks['text'].tolist()
metadatas = df_chunks[['heading', 'chunk_id']].to_dict('records')
# IDs are numbered across the whole DataFrame: 'chunk_id' restarts at 0 in
# every section, so it cannot serve as a unique record ID on its own.
ids       = [f"chunk_{i}" for i in range(len(texts))]

# 4. Instantiate the new PersistentClient (data is stored under ./chroma_db)
client = chromadb.PersistentClient(path="./chroma_db")

# 5. Create or retrieve collection
collection = client.get_or_create_collection("reguai_compliance")

# 6. Embed & store in ChromaDB
embeddings = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
# upsert instead of add: the collection is persisted on disk, so re-running
# this cell meets IDs that already exist. upsert overwrites those records
# with the current text/embedding, which keeps the cell safe to re-run.
# The embeddings are handed over as plain nested lists so the call does not
# depend on the installed chromadb version accepting numpy arrays.
# NOTE(review): records left over from an earlier run with MORE chunks keep
# their higher-numbered IDs; delete the collection first for a clean rebuild.
collection.upsert(
    ids=ids,
    documents=texts,
    embeddings=embeddings.tolist(),
    metadatas=metadatas,
)

# 7. Query example
query = "What is data minimisation?"
q_emb  = embedder.encode([query])
res    = collection.query(query_embeddings=q_emb.tolist(), n_results=3)
print(res['documents'], res['metadatas'])
