In [None]:
'''Get Vector Data Base and Dependencies'''
!pip install sentence-transformers
!pip install chromadb
!pip install openai


Collecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/45/2d/1151b371f28caae565ad384fdc38198f1165571870217aedda230b9d7497/sentence_transformers-4.1.0-py3-none-any.whl.metadata
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Obtaining dependency information for transformers<5.0.0,>=4.41.0 from https://files.pythonhosted.org/packages/a9/b6/5257d04ae327b44db31f15cce39e6020cc986333c715660b1315a9724d82/transformers-4.51.3-py3-none-any.whl.metadata
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Obtaining dependency information for torch>=1.11.0 from https://files.pythonhosted.org/packages/aa/3f/85b56f7e2abcfa558c5fbf7b11eb02d78a4a63e6aeee2bbae3bb552abea5/torch-2.7.0-cp311-none-macosx_11_0_arm64.whl.metadata
  Downloading torch-2.7.0-cp31

In [None]:
# API KEYS
# Read the OpenAI key from the environment so the secret never has to be typed
# into (or committed with) the notebook. 'PRIV' is kept as the placeholder
# fallback used when OPENAI_API_KEY is not set.
import os

key = os.environ.get('OPENAI_API_KEY', 'PRIV')

In [4]:
'''Set Up the document Ingestion and Vectorization process'''
import re
import pandas as pd

def load_text(file_path: str) -> str:
    """Read a .txt file and return its full contents as a string.

    The file is decoded as UTF-8 explicitly so that non-ASCII characters
    (bullets, curly quotes, accented letters) are read the same way on every
    platform instead of depending on the locale's default encoding.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The complete text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def clean_text(text: str) -> str:
    """Normalise raw document text before it is split into sections.

    Steps, in order: drop every bullet glyph, convert Windows line endings to
    '\\n', squeeze runs of three or more newlines down to a single blank line,
    and finally trim leading/trailing whitespace.
    """
    without_bullets = text.replace('•', '')
    unix_newlines = re.sub(r'\r\n', '\n', without_bullets)
    collapsed = re.sub(r'\n{3,}', '\n\n', unix_newlines)
    return collapsed.strip()

def split_by_heading(text: str):
    """
    Cut the text into sections at numbered heading lines.

    A heading is a line that starts with a number, optionally followed by
    dot-separated sub-numbers, and then whitespace (e.g. '1 Scope',
    '2.1 Purpose'). A line such as '1. Item' is NOT a heading because the
    trailing dot is not followed by a digit.

    Lines before the first heading are filed under 'Introduction'. A heading
    with no lines at all before the next heading (or the end of the text) is
    omitted from the result.

    Returns a list of (heading, section_text) tuples in document order.
    """
    rows = text.split('\n')
    heading_re = re.compile(r'^\d+(\.\d+)*\s')

    # Row indices of every heading line.
    heading_at = [idx for idx, row in enumerate(rows) if heading_re.match(row)]

    # Each section's body spans from just after its heading to the next heading.
    titles = ['Introduction'] + [rows[idx].strip() for idx in heading_at]
    starts = [0] + [idx + 1 for idx in heading_at]
    stops = heading_at + [len(rows)]

    return [
        (title, '\n'.join(rows[lo:hi]).strip())
        for title, lo, hi in zip(titles, starts, stops)
        if hi > lo
    ]

def chunk_section(heading: str, content: str, chunk_size: int = 500, overlap: int = 100):
    """
    Chunk a section into overlapping word-based chunks.

    The content is split on whitespace and a window of `chunk_size` words is
    slid forward by `chunk_size - overlap` words at a time. Windowing stops as
    soon as a chunk reaches the last word, so no trailing chunk is emitted that
    would only repeat words already contained in the previous chunk.

    Args:
        heading: Section heading stored with every chunk.
        content: Section body text.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            0 <= overlap < chunk_size.

    Returns:
        A list of dicts with keys 'heading', 'chunk_id', 'word_count' and
        'text'. Empty (or whitespace-only) content yields an empty list.

    Raises:
        ValueError: If chunk_size is not positive, or overlap is negative or
            not smaller than chunk_size (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = content.split()
    step = chunk_size - overlap
    chunks = []

    for chunk_id, start in enumerate(range(0, len(words), step)):
        chunk_words = words[start:start + chunk_size]
        chunks.append({
            'heading': heading,
            'chunk_id': chunk_id,
            'word_count': len(chunk_words),
            'text': ' '.join(chunk_words)
        })
        # This window already covers the final word; any further window would
        # be a pure subset of it.
        if start + chunk_size >= len(words):
            break

    return chunks

def preprocess_file(file_path: str, chunk_size: int = 500, overlap: int = 100) -> pd.DataFrame:
    """
    Full preprocessing pipeline: load, clean, split, and chunk.

    Args:
        file_path: Path of the text file to ingest.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by consecutive chunks.

    Returns:
        A pandas DataFrame with one row per chunk and the columns
        'heading', 'chunk_id', 'word_count' and 'text'. The columns are
        present even when the file produces no chunks, so downstream column
        access does not fail on an empty document.
    """
    raw_text = load_text(file_path)
    cleaned = clean_text(raw_text)
    sections = split_by_heading(cleaned)

    all_chunks = [
        chunk
        for heading, content in sections
        for chunk in chunk_section(heading, content, chunk_size, overlap)
    ]

    # Naming the columns explicitly keeps the schema stable for empty input;
    # the order matches the keys produced by chunk_section.
    return pd.DataFrame(all_chunks, columns=['heading', 'chunk_id', 'word_count', 'text'])

# Example usage:
# df_chunks = preprocess_file('path/to/your/document.txt', chunk_size=200, overlap=50)
# df_chunks.to_csv('preprocessed_chunks.csv', index=False)

In [5]:
ls

BE_test.ipynb         [34mReguAI-App[m[m/           sample.txt
README.md             embeddings_rag.ipynb


In [6]:
# Chunk the sample document (200-word windows, 50-word overlap) and keep a
# CSV snapshot of the result next to the notebook.
df_chunks = preprocess_file(file_path='sample.txt', chunk_size=200, overlap=50)
df_chunks.to_csv(path_or_buf='preprocessed_chunks.csv', index=False)

In [7]:
'''Set up Vector DB'''
import chromadb
# Create a Chroma client with the library's default constructor.
# NOTE(review): presumably this is an in-memory client, unlike the
# PersistentClient(path="./chroma_db") used by the later cells — confirm. If so,
# nothing done in this cell touches the on-disk store.
client = chromadb.Client()
# Make sure the "reguai_compliance" collection exists on this client...
collection = client.get_or_create_collection("reguai_compliance")
# ...then drop it again. After this line `collection` still refers to the
# collection that was just deleted.
# NOTE(review): looks like a reset of the collection — confirm the intent.
client.delete_collection("reguai_compliance")
# Collections remaining on this client; `names` is not read again in the
# visible cells.
names = client.list_collections()

In [8]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb

# 1. df_chunks is the preprocessed DataFrame built by preprocess_file,
#    with columns: 'text', 'heading', 'chunk_id'

# 2. Initialize embedder
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# 3. Convert to lists
texts     = df_chunks['text'].tolist()
metadatas = df_chunks[['heading', 'chunk_id']].to_dict('records')
ids       = [f"chunk_{i}" for i in range(len(texts))]

# 4. Instantiate the new PersistentClient
client = chromadb.PersistentClient(path="./chroma_db")

# 5. Create or retrieve collection
collection = client.get_or_create_collection("reguai_compliance")

# 6. Embed & store in ChromaDB.
#    The store is persistent and the ids are positional ("chunk_0", ...), so
#    upsert is used instead of add: re-running this cell overwrites the
#    records with the current text/embeddings rather than colliding with ids
#    left over from a previous run.
embeddings = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
collection.upsert(ids=ids, documents=texts, embeddings=embeddings, metadatas=metadatas)

# 7. Query example
query = "What is data minimisation?"
q_emb  = embedder.encode([query])
res    = collection.query(query_embeddings=q_emb, n_results=3)
print(res['documents'], res['metadatas'])


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[['Processing of Personal Data There are a number of fundamental principles upon which the GDPR is based. These are as follows: 1. Personal data shall be: (a) processed lawfully, fairly and in a transparent manner in relation to the data subject (‘lawfulness, fairness and transparency’); (b) collected for specified, explicit and legitimate purposes and not further processed in a manner that is incompatible with those purposes; further processing for archiving purposes in the public interest, scientific or historical research purposes or statistical purposes shall, in accordance with Article 89(1), not be considered to be incompatible with the initial purposes (‘purpose limitation’); (c) adequate, relevant and limited to what is necessary in relation to the purposes for which they are processed (‘data minimisation’); (d) accurate and, where necessary, kept up to date; every reasonable step must be taken to ensure that personal data that are inaccurate, having regard to the purposes for 

In [12]:
# Smoke test: re-chunk the sample with the default chunk_size/overlap and
# report how many chunks come out.
df_chunks = preprocess_file(file_path='sample.txt')
print(f"Generated {len(df_chunks)} chunks.")


Generated 20 chunks.


In [13]:
# === 2) Vectorization & Storage (corrected) ===
from sentence_transformers import SentenceTransformer
import chromadb

# 2a) Embedder
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# 2b) Chroma persistent client & collection
client     = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection("reguai_compliance")

# 2c) Prepare your preprocessed chunks
texts = df_chunks['text'].tolist()
ids   = [f"chunk_{i}" for i in range(len(texts))]
metas = df_chunks[['heading','chunk_id']].to_dict('records')

# 2d) Remove records left over from earlier runs.
#     The collection lives on disk and ids are positional, so a previous run
#     with a different chunking may have stored ids that this run no longer
#     produces; those would otherwise keep showing up in query results.
current_ids = set(ids)
stale_ids = [old_id for old_id in collection.get()['ids'] if old_id not in current_ids]
if stale_ids:
    collection.delete(ids=stale_ids)

# 2e) Embed & store in ChromaDB.
#     upsert (not add) so that ids already present are overwritten with the
#     current text and embedding instead of colliding with the old record.
embs = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
collection.upsert(
    ids=ids,
    documents=texts,
    embeddings=embs,
    metadatas=metas
)

# No client.persist() needed for PersistentClient
print("✅ Chunks vectorized and stored in './chroma_db/reguai_compliance'")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Chunks vectorized and stored in './chroma_db/reguai_compliance'


In [14]:
# 3) Retrieval test: embed one question, fetch the three nearest chunks and
#    preview the first 200 characters of each together with its heading.
query = "What are the GDPR principles for processing personal data?"
q_emb = embedder.encode([query], convert_to_numpy=True)
results = collection.query(n_results=3, query_embeddings=q_emb)

# Only one query was sent, so take the first (and only) result list of each kind.
docs, md = results['documents'][0], results['metadatas'][0]

for i, (d, m) in enumerate(zip(docs, md)):
    print(f"Chunk {i} — Section {m['heading']}:\n{d[:200]}...\n")


Chunk 0 — Section 2.4 Principles Relating to:
Processing of Personal Data There are a number of fundamental principles upon which the GDPR is based. These are as follows: 1. Personal data shall be: (a) processed lawfully, fairly and in a transpar...

Chunk 1 — Section 2.12 Addressing Compliance to the GDPR:
The following actions are undertaken to ensure that Institute for Supply Management complies at all times with the accountability principle of the GDPR: The legal basis for processing personal data is...

Chunk 2 — Section 2.6.6 Legitimate Interests:
If the processing of specific personal data is in the legitimate interests of Institute for Supply Management and is judged not to affect the rights and freedoms of the data subject in a significant w...



In [15]:
# === 3) Query & Answer with the new OpenAI client ===
from openai import OpenAI

# 3a) Initialize the OpenAI client
# NOTE: this rebinds `client`, which the earlier cells bound to the Chroma
# client; `collection` below is the handle created in those cells.
client = OpenAI(api_key=key)

# 3b) Your retrieval from Chroma (unchanged)
query = "What are the GDPR principles for processing personal data?"
q_emb  = embedder.encode([query], convert_to_numpy=True)
res    = collection.query(query_embeddings=q_emb, n_results=3)
docs   = res['documents'][0]
md     = res['metadatas'][0]

# 3c) Build your prompt with context
#     Each retrieved chunk is labelled with its heading and chunk id.
context = "\n\n".join(
    f"## [{m['heading']}] (chunk {m['chunk_id']})\n{d}"
    for d, m in zip(docs, md)
)
prompt = f"""
You are a compliance assistant. Answer using ONLY the context below.

{context}

Question: {query}

Answer:
"""

# 3d) Create a chat completion using the new client
#     temperature=0 keeps the answer as deterministic as the API allows.
resp = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are precise and concise."},
        {"role": "user",   "content": prompt},
    ],
    temperature=0,
)

# 3e) Print the answer
#     message.content is optional in the API response, so guard against None
#     before calling .strip().
answer = resp.choices[0].message.content
print((answer or "").strip())


The GDPR principles for processing personal data are: 
1. Lawfulness, fairness and transparency: Data should be processed lawfully, fairly and transparently. 
2. Purpose limitation: Data should be collected for specified, explicit and legitimate purposes and not further processed in a manner incompatible with those purposes. 
3. Data minimisation: Data should be adequate, relevant and limited to what is necessary in relation to the purposes for which they are processed. 
4. Accuracy: Data should be accurate and, where necessary, kept up to date. 
5. Storage limitation: Data should be kept in a form which permits identification of data subjects for no longer than necessary for the purposes for which the personal data are processed.
