In [5]:
# -------------------------
# Imports
# -------------------------
import nltk
import tiktoken
import re
import pandas as pd
from typing import List

# -------------------------
# Configuration
# -------------------------
CHUNK_SIZE = 100
CHUNK_OVERLAP = 20
ENCODING_NAME = "cl100k_base"
TEXT_COLUMN = "customer_feedback"   # change if needed

# -------------------------
# Initialize
# -------------------------
nltk.download("punkt")
sent_tokenizer = nltk.sent_tokenize
encoder = tiktoken.get_encoding(ENCODING_NAME)

# -------------------------
# Utility Functions
# -------------------------
def clean_text(text: str) -> str:
    """Collapse each run of whitespace into a single space and trim both ends."""
    return re.sub(r"\s+", " ", text).strip()

def count_tokens(text: str) -> int:
    """Return how many tokens the module-level `encoder` produces for `text`."""
    token_ids = encoder.encode(text)
    return len(token_ids)

# -------------------------
# Chunk Builder
# -------------------------
def build_chunks(text: str) -> List[str]:
    """Split `text` into sentence-aligned chunks of roughly CHUNK_SIZE tokens.

    The text is normalised with `clean_text`, split with `sent_tokenizer`, and
    sentences are packed greedily until the next one would push the running
    token count past CHUNK_SIZE. Every chunk opened after a full one is seeded
    with the last CHUNK_OVERLAP tokens of the chunk before it; those overlap
    tokens come on top of the sentence, so such a chunk can exceed CHUNK_SIZE.

    A sentence that is longer than CHUNK_SIZE by itself is emitted whole as
    its own chunk (it is not split) and no overlap is carried across it.

    Args:
        text: Raw document text.

    Returns:
        Chunk strings in document order; an empty list when the sentence
        tokenizer yields no sentences.
    """
    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in sent_tokenizer(clean_text(text)):
        sentence_tokens = count_tokens(sentence)

        # Oversized sentence: flush what we have, then emit it on its own.
        if sentence_tokens > CHUNK_SIZE:
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            chunks.append(sentence)
            current_chunk = []
            current_tokens = 0
            continue

        if current_tokens + sentence_tokens <= CHUNK_SIZE:
            current_chunk.append(sentence)
            current_tokens += sentence_tokens
            continue

        # The sentence does not fit: close the current chunk ...
        finished = " ".join(current_chunk)
        chunks.append(finished)

        # ... and seed the next one with the tail of the finished chunk.
        # The slice must be guarded: `tokens[-0:]` is the WHOLE list, so with
        # CHUNK_OVERLAP == 0 an unguarded slice repeats the entire chunk.
        overlap_text = ""
        if CHUNK_OVERLAP > 0:
            overlap_text = encoder.decode(encoder.encode(finished)[-CHUNK_OVERLAP:])

        current_chunk = [overlap_text, sentence] if overlap_text else [sentence]
        current_tokens = count_tokens(" ".join(current_chunk))

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# -------------------------
# Stratified Sampling
# -------------------------
def create_stratified_sample(csv_path, sample_size, stratify_col="Product", random_state=42):
    """Sample roughly `sample_size` rows while keeping each group's share.

    Every group of `stratify_col` is sampled with the same fraction
    (`sample_size / total rows`), so the result size is approximate because
    each group's count is rounded independently. Rows whose `stratify_col`
    is missing are not sampled (groupby skips missing keys by default).

    Args:
        csv_path: CSV file to read.
        sample_size: Target number of rows; values above the file's row count
            return every grouped row instead of raising.
        stratify_col: Column whose value distribution is preserved.
        random_state: Seed applied to each group's sample.

    Returns:
        A DataFrame with a fresh 0..n-1 index.
    """
    df = pd.read_csv(csv_path)
    if df.empty:
        # Nothing to sample; also avoids dividing by zero below.
        return df.reset_index(drop=True)

    # Capped at 1.0: sampling without replacement cannot upsample a group.
    frac = min(1.0, sample_size / len(df))

    # Sample group by group rather than via groupby().apply(): iterating the
    # groups always keeps `stratify_col` in each piece, whereas apply()'s
    # treatment of the grouping column is deprecated in recent pandas.
    sampled_groups = [
        group.sample(frac=frac, random_state=random_state)
        for _, group in df.groupby(stratify_col)
    ]
    if not sampled_groups:
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_groups).reset_index(drop=True)

# -------------------------
# Load & Prepare Data
# -------------------------
# NOTE(review): absolute, machine-specific Windows path — this cell only runs on
# the original author's machine; consider a path relative to the project root.
DATA_PATH = r"C:\Users\hakimam\Desktop\pproject\test5\data\processed\filtered_feedback.csv"

# Stratified sample targeting 200 rows (stratified on the function's default column).
df_sampled = create_stratified_sample(DATA_PATH, sample_size=200)

# Texts to chunk: rows with a missing TEXT_COLUMN value are dropped here.
complaints = df_sampled[TEXT_COLUMN].dropna().tolist()

# -------------------------
# Apply Chunking
# -------------------------
# One record per chunk; complaint ids are 1-based ("COMP1", "COMP2", ...).
all_chunks = [
    {
        "complaint_id": f"COMP{complaint_no}",
        "chunk_index": chunk_no,
        "chunk_text": piece,
        "token_count": count_tokens(piece)
    }
    for complaint_no, complaint_text in enumerate(complaints, start=1)
    for chunk_no, piece in enumerate(build_chunks(complaint_text))
]

chunks_df = pd.DataFrame(all_chunks)

# -------------------------
# Preview
# -------------------------
chunks_df.head()


ModuleNotFoundError: No module named 'tiktoken'

In [14]:
# Hybrid: Sentence-Based + Fixed-Length (Token) + Overlap Chunking
import nltk
import re
from typing import List

# -------------------------
# Configuration
# -------------------------
CHUNK_SIZE = 100       # max tokens per chunk
CHUNK_OVERLAP = 20     # overlapping tokens
ENCODING_NAME = "cl100k_base"

# -------------------------
# Initialize
# -------------------------

# -------------------------
# Utility Functions
# -------------------------
def clean_text(text: str) -> str:
    """Replace every whitespace run with one space, then strip the ends."""
    normalized = re.sub(r"\s+", " ", text)
    trimmed = normalized.strip()
    return trimmed

def count_tokens(text: str) -> int:
    """Token count of `text` according to the module-level `encoder`."""
    encoded = encoder.encode(text)
    return len(encoded)

# -------------------------
# Chunk Builder
# -------------------------
def build_chunks(text: str) -> List[str]:
    """Split `text` into sentence-aligned chunks of roughly CHUNK_SIZE tokens.

    Sentences (from `sent_tokenizer` applied to `clean_text(text)`) are packed
    greedily until the next one would push the running token count past
    CHUNK_SIZE. A chunk opened after a full one starts with the last
    CHUNK_OVERLAP tokens of its predecessor; these come on top of the
    sentence, so such a chunk can exceed CHUNK_SIZE.

    A sentence longer than CHUNK_SIZE is emitted whole as its own chunk and
    no overlap is carried across it.

    Args:
        text: Raw document text.

    Returns:
        Chunk strings in document order; an empty list when there are no
        sentences.
    """
    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in sent_tokenizer(clean_text(text)):
        sentence_tokens = count_tokens(sentence)

        # Oversized sentence: flush the pending chunk, then emit it alone.
        if sentence_tokens > CHUNK_SIZE:
            if current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_tokens = 0
            chunks.append(sentence)
            continue

        if current_tokens + sentence_tokens <= CHUNK_SIZE:
            current_chunk.append(sentence)
            current_tokens += sentence_tokens
            continue

        finished = " ".join(current_chunk)
        chunks.append(finished)

        # handle overlap — guarded because `tokens[-0:]` is the WHOLE list:
        # with CHUNK_OVERLAP == 0 an unguarded slice repeats the entire chunk.
        overlap_text = ""
        if CHUNK_OVERLAP > 0:
            overlap_text = encoder.decode(encoder.encode(finished)[-CHUNK_OVERLAP:])

        current_chunk = [overlap_text, sentence] if overlap_text else [sentence]
        current_tokens = count_tokens(" ".join(current_chunk))

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# -------------------------
# Sample Texts
# -------------------------
import pandas as pd

def create_stratified_sample(input_file, sample_size, stratify_col='Product', random_state=42):
    """Sample roughly `sample_size` rows from `input_file`, keeping group shares.

    Every group of `stratify_col` is sampled with the same fraction
    (`sample_size / total rows`); the result size is approximate because each
    group's count is rounded independently. Rows with a missing
    `stratify_col` are not sampled (groupby skips missing keys by default).

    Args:
        input_file: CSV file to read. (Previously ignored in favour of a
            hard-coded path.)
        sample_size: Target number of rows; values above the file's row count
            return every grouped row instead of raising.
        stratify_col: Column whose value distribution is preserved.
        random_state: Seed applied to each group's sample.

    Returns:
        A DataFrame with a fresh 0..n-1 index.
    """
    # Load the data from the path the caller passed in
    df = pd.read_csv(input_file)
    if df.empty:
        # Nothing to sample; also avoids dividing by zero below.
        return df.reset_index(drop=True)

    # Fraction per group, capped at 1.0 (no upsampling without replacement)
    frac = min(1.0, sample_size / len(df))

    # Stratified sampling, one group at a time. Iterating the groups keeps
    # `stratify_col` in every piece, which groupby().apply() no longer
    # guarantees in recent pandas.
    sampled_groups = [
        group.sample(frac=frac, random_state=random_state)
        for _, group in df.groupby(stratify_col)
    ]
    if not sampled_groups:
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_groups).reset_index(drop=True)

# -------------------------
# Apply Chunking
# -------------------------


# -------------------------
# Display Results
# -------------------------
# The chunking cell stores the source id under "complaint_id", while the sample
# metadata further down uses "feedback_id"; accept either key so this loop does
# not raise KeyError on whichever schema `all_chunks` currently holds.
for c in all_chunks:
    source_id = c.get("feedback_id", c.get("complaint_id"))
    print(f"feedback {source_id} - Chunk {c['chunk_index']} ({c['token_count']} tokens):\n{c['chunk_text']}\n")


In [None]:
# Sample chunks from previous step
chunk_texts = [
    "I was charged an annual fee for my Platinum Visa card even though the promotion said it would be waived for the first year...",
    "Customer service keeps transferring me between departments without providing a clear resolution...",
    "Additionally, the online portal does not reflect my latest transactions correctly, which makes it difficult to verify my charges...",
    # ... all other chunks
]

metadata = [
    {"feedback_id": "COMP1", "chunk_index": 0, "product_category": "Credit Cards"},
    {"feedback_id": "COMP1", "chunk_index": 1, "product_category": "Credit Cards"},
    {"feedback_id": "COMP1", "chunk_index": 2, "product_category": "Credit Cards"},
    # ... all other chunk metadata
]


# FAISS + SentenceTransformers Embeddings

✅ Free, fast, works locally
✅ Good for POC and moderate-scale datasets

In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Initialize model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode chunks
embeddings = model.encode(chunk_texts, show_progress_bar=True)
embeddings = np.array(embeddings).astype("float32")

# Create FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print(f"FAISS index contains {index.ntotal} vectors")

# Example: Semantic search
query = "Why are customers unhappy with BNPL?"
query_vec = model.encode([query]).astype("float32")

# Never request more neighbours than the index holds: FAISS pads missing
# results with label -1, and metadata[-1] would silently print the last chunk.
top_k = min(3, index.ntotal)
D, I = index.search(query_vec, k=top_k)

for i in I[0]:
    if i < 0:
        continue  # padding label, not a real hit
    print(metadata[i]["chunk_index"], chunk_texts[i])


# ChromaDB + SentenceTransformer

In [None]:
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

# -------------------------
# Sample chunks and metadata
# -------------------------
chunk_texts = [
    "I was charged an annual fee for my Platinum Visa card even though the promotion said it would be waived for the first year...",
    "Customer service keeps transferring me between departments without providing a clear resolution...",
    "Additionally, the online portal does not reflect my latest transactions correctly, which makes it difficult to verify my charges..."
]

metadata = [
    {"complaint_id": "COMP1", "chunk_index": 0, "product_category": "Credit Cards"},
    {"complaint_id": "COMP1", "chunk_index": 1, "product_category": "Credit Cards"},
    {"complaint_id": "COMP1", "chunk_index": 2, "product_category": "Credit Cards"}
]

# -------------------------
# Initialize ChromaDB (New API)
# -------------------------
# chromadb.Client(Settings(persist_directory=...)) builds an in-memory client on
# chromadb >= 0.4 (the directory is not used unless persistence is switched on),
# so nothing would be written to ./chroma_db. PersistentClient is the supported
# way to keep the index on disk.
client = chromadb.PersistentClient(
    path="./chroma_db",                              # local storage
    settings=Settings(anonymized_telemetry=False),   # optional, disables telemetry
)

# Create collection (or get if exists) in one call; this also avoids relying on
# the element type returned by list_collections(), which differs across releases.
collection_name = "complaints"
collection = client.get_or_create_collection(name=collection_name)

# -------------------------
# SentenceTransformer embeddings
# -------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunk_texts, show_progress_bar=True)
embeddings = [emb.tolist() for emb in embeddings]  # Chroma requires list of floats

# -------------------------
# Add chunks to collection
# -------------------------
# upsert (not add) so re-running this cell refreshes ids chunk_0..chunk_N
# instead of trying to insert them a second time; a single batched call
# replaces the one-request-per-chunk loop.
collection.upsert(
    documents=chunk_texts,
    embeddings=embeddings,
    metadatas=metadata,
    ids=[f"chunk_{i}" for i in range(len(chunk_texts))]
)

print(f"✅ Added {len(chunk_texts)} chunks to ChromaDB collection '{collection_name}'")




In [None]:
# -------------------------
# Query example
# -------------------------
query = "Why are customers frustrated with credit cards?"
query_emb = model.encode([query])[0].tolist()

results = collection.query(
    query_embeddings=[query_emb],
    n_results=2,
    where={"product_category": "Credit Cards"}
)

print("Query Results:")
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(f"- {meta['complaint_id']} / Chunk {meta['chunk_index']}: {doc}")

In [None]:
# -------------------------
# Add chunks to vector store
# -------------------------
# This cell repeats the indexing step of an earlier cell with the same ids
# (chunk_0..chunk_N). upsert makes the repeat idempotent; add would attempt to
# insert ids that already exist in the collection.
collection.upsert(
    documents=chunk_texts,
    embeddings=embeddings,
    metadatas=metadata,
    ids=[f"chunk_{i}" for i in range(len(chunk_texts))]
)

print(f"✅ Indexed {len(chunk_texts)} chunks into ChromaDB collection '{collection_name}'")

# -------------------------
# Example semantic query
# -------------------------
query = "Why are customers frustrated with credit cards?"
query_emb = model.encode([query])[0].tolist()

results = collection.query(
    query_embeddings=[query_emb],
    n_results=2,
    where={"product_category": "Credit Cards"}  # optional filter
)

print("Top search results:")
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(f"- {meta['complaint_id']} / Chunk {meta['chunk_index']}: {doc}")

### 1. Load the full CFPB complaint dataset.


In [None]:
import pandas as pd
import re

# Load raw complaints
df = pd.read_csv("../data/raw/complaints-2025.csv")

df.head()
df.info()
df.columns

In [None]:
df.isnull().sum()

In [None]:
# Classify columns by importance
#
# Critical columns (must NOT be null):
#   - Consumer complaint narrative — ❗ drop rows where this is null
#     (if the narrative is null there is nothing to chunk or embed)
#   - Complaint ID, Product, Date received
#
# Important metadata (can be null): used for filtering, trends and clustering, but not required.
#   - Sub-product, Issue, Sub-issue, State, Tags, Consumer consent provided?,
#     Date sent to company, Consumer disputed?
#   - Fill missing values with "unknown".
#
# All other columns are useful for dashboards but carry no semantic meaning (not for embeddings).
#
# Only embed rows with a non-null Consumer complaint narrative.

### 2. Cleaning data set

Drop rows with null complaint text

In [None]:
#Drop rows with null complaint text
df = df.dropna(subset=["Consumer complaint narrative"])


In [None]:
# Define the Cleaning Function
import re
import unicodedata
import pandas as pd

def clean_and_normalize_text(text):
    """Return a cleaned, single-line version of a complaint narrative.

    Steps, in order: NFKC unicode normalisation, HTML tags replaced by a
    space, URLs (anything starting with "http" or "www") replaced by a space,
    two boilerplate disclaimers removed from the phrase to the end of its
    line, whitespace runs collapsed, ends trimmed.

    Args:
        text: Any value; non-strings are converted with `str()`.

    Returns:
        The cleaned string, or "" for any scalar missing value
        (None, NaN, pd.NA, pd.NaT).
    """
    # Handle every scalar missing marker, not only None and built-in float NaN
    # (pd.NaT or a NumPy float32 NaN would otherwise become "NaT" / "nan").
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return ""

    # Normalize unicode (smart quotes, ligatures, full-width forms, etc.)
    text = unicodedata.normalize("NFKC", str(text))

    # Remove HTML tags
    text = re.sub(r"<[^>]+>", " ", text)

    # Remove URLs
    text = re.sub(r"http\S+|www\S+", " ", text)

    # Remove boilerplate / disclaimers ('.' does not cross newlines, so each
    # pattern strips from the phrase to the end of that line only)
    text = re.sub(r"this message is confidential.*", " ", text, flags=re.I)
    text = re.sub(r"please do not reply to this email.*", " ", text, flags=re.I)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()


In [None]:
# Clean & normalize complaint text

# No .astype(str) before cleaning: it would turn missing values into the
# literal string "nan", which then survives the empty-text filter below.
# clean_and_normalize_text already stringifies its input.
df["Consumer complaint narrative"] = df["Consumer complaint narrative"].apply(
    clean_and_normalize_text
)

# Remove empty text after cleaning. .copy() makes the filtered frame independent
# so later column assignments (e.g. df["sentences"]) do not trigger
# SettingWithCopyWarning.
df = df[df["Consumer complaint narrative"].str.strip().astype(bool)].copy()


In [None]:
# verify the result
print(df["Consumer complaint narrative"].head())
print(f"Remaining complaints: {len(df)}")


In [None]:
# Sentence Splitting:  split complaints into sentences which is best for semantic chunking.
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

def sentence_split(text):
    """Split `text` into sentences with `sent_tokenize` and return the result."""
    sentences = sent_tokenize(text)
    return sentences

df["sentences"] = df["Consumer complaint narrative"].apply(sentence_split)


In [None]:
# Tokenization (Token Statistics): to measure real LLM token sizes.
import tiktoken

encoder = tiktoken.get_encoding("cl100k_base")

def token_count(text):
    """Return the number of tokens the module-level `encoder` yields for `text`."""
    token_ids = encoder.encode(text)
    return len(token_ids)


In [None]:
# Analyze Sentence Lengths: helps decide chunk size scientifically, not by guessing.
# Token length of every sentence across all complaints, flattened into one list.
sentence_tokens = [
    token_count(sentence)
    for complaint_sentences in df["sentences"]
    for sentence in complaint_sentences
]

import numpy as np

print("Avg tokens per sentence:", int(np.mean(sentence_tokens)))
print("95th percentile:", int(np.percentile(sentence_tokens, 95)))


In [None]:
# Chunk Complaints (Sentence-Based + Overlap)
def chunk_sentences(sentences, chunk_size, overlap):
    """Pack sentences into chunks of about `chunk_size` tokens with token overlap.

    Sentences are appended to the current chunk while the running token count
    (per `token_count`) stays within `chunk_size`. When the next sentence does
    not fit, the current chunk is closed and the next one starts with the
    last `overlap` tokens of the closed chunk followed by that sentence.

    A sentence longer than `chunk_size` is kept whole, so the chunk that
    contains it exceeds the limit.

    Args:
        sentences: Sentence strings in document order.
        chunk_size: Token budget used when deciding whether a sentence fits.
        overlap: Number of trailing tokens repeated at the start of the next
            chunk; 0 disables overlap.

    Returns:
        Chunk strings in order; an empty list for no sentences. No empty
        chunk is ever emitted.
    """
    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in sentences:
        tokens = token_count(sentence)

        if current_tokens + tokens <= chunk_size:
            current_chunk.append(sentence)
            current_tokens += tokens
            continue

        overlap_text = ""
        # Only close a chunk that has content: when the very first sentence is
        # already too long there is nothing to save yet, and saving anyway
        # produced an empty chunk followed by one with a leading space.
        if current_chunk:
            finished = " ".join(current_chunk)
            chunks.append(finished)

            # `ids[-0:]` is the whole list, so overlap == 0 must skip the
            # slice or the entire previous chunk is carried over.
            if overlap > 0:
                overlap_text = encoder.decode(encoder.encode(finished)[-overlap:])

        current_chunk = [overlap_text, sentence] if overlap_text else [sentence]
        current_tokens = token_count(" ".join(current_chunk))

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


In [None]:
CHUNK_SIZE = 120
CHUNK_OVERLAP = 25


In [None]:
# Apply Chunking

# One record per chunk, carrying the complaint-level metadata used for filtering.
chunk_rows = [
    {
        "complaint_id": row["Complaint ID"],
        "product": row["Product"],
        "issue": row["Issue"],
        "state": row["State"],
        "date_received": row["Date received"],
        "chunk_index": position,
        "chunk_text": piece
    }
    for _, row in df.iterrows()
    for position, piece in enumerate(
        chunk_sentences(row["sentences"], CHUNK_SIZE, CHUNK_OVERLAP)
    )
]


In [None]:
# save chunks to DataFrame
import pandas as pd
chunk_df = pd.DataFrame(chunk_rows)
print("Total chunks:", len(chunk_df))


In [None]:
# Index into ChromaDB (Vector Store)
# Initialize ChromaDB

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

# chromadb.Client(Settings(persist_directory=...)) builds an in-memory client on
# chromadb >= 0.4 (the directory is not used unless persistence is switched on);
# PersistentClient actually stores the index under ./chroma_db.
client = chromadb.PersistentClient(
    path="./chroma_db",
    settings=Settings(anonymized_telemetry=False),
)

collection = client.get_or_create_collection("complaints")


# Embedding

In [None]:
model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = model.encode(
    chunk_df["chunk_text"].tolist(),
    show_progress_bar=True
).tolist()


## Add to Vector Store

In [None]:
BATCH_SIZE = 5000  # Safe under ChromaDB limit


In [None]:
from math import ceil

total = len(chunk_df)
num_batches = ceil(total / BATCH_SIZE)

# Metadata = every column except the chunk text. Dropped once here; the
# original rebuilt this frame from the full chunk_df on every batch.
metadata_df = chunk_df.drop(columns=["chunk_text"])

print(f"Indexing {total} chunks in {num_batches} batches...")

for i in range(num_batches):
    start = i * BATCH_SIZE
    end = min(start + BATCH_SIZE, total)

    # Documents, embeddings, metadata and ids are all sliced with the same
    # [start, end) window so they stay aligned row for row.
    batch_docs = chunk_df["chunk_text"].iloc[start:end].tolist()
    batch_embeddings = embeddings[start:end]
    batch_metadata = metadata_df.iloc[start:end].to_dict("records")
    batch_ids = [f"chunk_{j}" for j in range(start, end)]

    collection.add(
        documents=batch_docs,
        embeddings=batch_embeddings,
        metadatas=batch_metadata,
        ids=batch_ids
    )

    print(f"✅ Batch {i+1}/{num_batches} indexed ({end-start} vectors)")


In [None]:
# Test Semantic Search

#Encodes the query: Converts the question into a dense semantic vector
query = "Why are customers unhappy with credit cards?"
query_embedding = model.encode([query])[0].tolist()
# Query ChromaDB, filtered by product type
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5,
    where={"product": "Credit card"}
)



In [None]:
# Prints top 5 most similar chunks
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta["complaint_id"], "→", doc[:150], "...")

In [None]:
#Print full chunk (easy)
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta["complaint_id"], "→", doc)


## Different strategies

### 1. Fixed-Length Chunking (Character-Based)

In [None]:
df_8k = df.sample(n=8000, random_state=42)  # random_state for reproducibility
df_clean_8k = df_8k.copy()


In [None]:
import os

output_dir = "data/processed"
os.makedirs(output_dir, exist_ok=True)


In [None]:
output_path = os.path.join(output_dir, "complaints_8k.csv")
df_8k.to_csv(output_path, index=False)


In [None]:
print(f"Saved {len(df_8k)} records to {output_path}")


In [None]:
import pandas as pd

# df is your original DataFrame
one_third = df.iloc[:len(df) // 3]

# save as new data
one_third.to_csv("df_one_third.csv", index=False)

df_one_third = pd.read_csv("df_one_third.csv")

In [None]:


def fixed_length_chunking(df, chunk_size=500):
    """Cut each complaint narrative into consecutive `chunk_size`-character pieces.

    Rows whose narrative is missing are skipped. Each chunk id embeds the
    character offset at which the piece starts.

    Returns:
        A list of dicts with keys "id", "text" and "metadata".
    """
    chunks = []

    for _, record in df.iterrows():
        narrative = record["Consumer complaint narrative"]
        if pd.isna(narrative):
            continue

        chunks.extend(
            {
                "id": f"{record['Complaint ID']}_fixed_{offset}",
                "text": narrative[offset:offset + chunk_size],
                "metadata": {
                    "Complaint ID": record["Complaint ID"],
                    "Product": record["Product"],
                    "strategy": "fixed_length"
                }
            }
            for offset in range(0, len(narrative), chunk_size)
        )

    return chunks


fixed_chunks_df = fixed_length_chunking(df_8k , chunk_size=500)
# Despite the name this is a list of dicts; preview a few records instead of
# dumping every chunk into the cell output.
fixed_chunks_df[:3]


### 2. Sentence-Based Chunking

In [None]:
def sentence_based_chunking(df, max_chars=500):
    """Group whole sentences of each narrative into chunks of about `max_chars`.

    Sentences (from `sent_tokenize`) are appended to the running chunk while
    its length plus the sentence length stays within `max_chars`; otherwise
    the running chunk is closed and the sentence starts a new one. A single
    sentence longer than `max_chars` becomes its own, oversized chunk.
    Rows with a missing narrative are skipped, and blank chunks are never
    emitted.

    Returns:
        A list of dicts with keys "id", "text" and "metadata"; ids are
        numbered consecutively per complaint starting at 0.
    """
    chunks = []

    for _, row in df.iterrows():
        text = row["Consumer complaint narrative"]
        if pd.isna(text):
            continue

        # Collect the raw pieces first, then number only the non-blank ones.
        pieces = []
        chunk_text = ""
        for sent in sent_tokenize(text):
            if len(chunk_text) + len(sent) <= max_chars:
                chunk_text += " " + sent
            else:
                pieces.append(chunk_text)
                chunk_text = sent
        pieces.append(chunk_text)

        # When the first sentence is already longer than max_chars the first
        # collected piece is "", which used to be emitted as an empty chunk.
        non_blank = [piece.strip() for piece in pieces if piece.strip()]

        for chunk_id, piece in enumerate(non_blank):
            chunks.append({
                "id": f"{row['Complaint ID']}_sentence_{chunk_id}",
                "text": piece,
                "metadata": {
                    "Complaint ID": row["Complaint ID"],
                    "Product": row["Product"],
                    "strategy": "sentence"
                }
            })

    return chunks
sentence_chunks_df = sentence_based_chunking(df_8k , max_chars=500)
# Despite the name this is a list of dicts; preview a few records instead of
# dumping every chunk into the cell output.
sentence_chunks_df[:3]

### 3. Recursive Chunking

In [None]:
def recursive_chunking(df, max_chars=500):
    """Chunk narratives by paragraph first, falling back to sentences.

    Each narrative is split on blank lines ("\\n\\n"). A paragraph of at most
    `max_chars` characters becomes one chunk; a longer paragraph is split with
    `sent_tokenize` and its sentences are grouped greedily up to `max_chars`.
    A single sentence longer than `max_chars` becomes its own, oversized
    chunk. Rows with a missing narrative are skipped, and blank chunks (empty
    paragraphs, or the empty buffer in front of an oversized first sentence)
    are never emitted.

    Returns:
        A list of dicts with keys "id", "text" and "metadata"; ids are
        numbered consecutively per complaint starting at 0.
    """
    chunks = []

    for _, row in df.iterrows():
        text = row["Consumer complaint narrative"]
        if pd.isna(text):
            continue

        # Collect raw pieces first, then number only the non-blank ones.
        pieces = []
        for para in text.split("\n\n"):
            if len(para) <= max_chars:
                pieces.append(para)
                continue

            current = ""
            for sent in sent_tokenize(para):
                if len(current) + len(sent) <= max_chars:
                    current += " " + sent
                else:
                    pieces.append(current)
                    current = sent
            pieces.append(current)

        # Blank pieces used to be emitted as chunks with empty text.
        non_blank = [piece.strip() for piece in pieces if piece.strip()]

        for chunk_id, piece in enumerate(non_blank):
            chunks.append({
                "id": f"{row['Complaint ID']}_recursive_{chunk_id}",
                "text": piece,
                "metadata": {
                    "Complaint ID": row["Complaint ID"],
                    "Product": row["Product"],
                    "strategy": "recursive"
                }
            })

    return chunks
recursive_chunks_df = recursive_chunking(df_8k, max_chars=500)
# Despite the name this is a list of dicts; preview a few records instead of
# dumping every chunk into the cell output.
recursive_chunks_df[:3]

### 4. Semantic Chunking

In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download("punkt")

from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions

# Load the embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)


In [None]:
'''
import pandas as pd
import numpy as np
from nltk import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm  # nice progress bar

# Example: embedding_model must support batch encoding
# embedding_model.encode(["list", "of", "sentences"], batch_size=64)

def fast_semantic_chunking(df, embedding_model, similarity_threshold=0.75, batch_size=64):
    """
    Fast semantic chunking using batch embeddings and vectorized operations.
    
    Args:
        df: DataFrame with 'Consumer complaint narrative' and 'Complaint ID'.
        embedding_model: Model with a .encode() method supporting batch input.
        similarity_threshold: Threshold to merge sentences into chunks.
        batch_size: Batch size for embedding computation.
    
    Returns:
        Pandas DataFrame with semantic chunks and metadata.
    """
    all_chunks = []

    texts = df["Consumer complaint narrative"].fillna("").tolist()
    ids = df["Complaint ID"].tolist()
    products = df["Product"].tolist()

    for text, cid, product in tqdm(zip(texts, ids, products), total=len(texts)):
        if not text.strip():
            continue

        sentences = sent_tokenize(text)
        if not sentences:
            continue

        # Compute embeddings in batches
        embeddings = []
        for i in range(0, len(sentences), batch_size):
            batch = sentences[i:i+batch_size]
            batch_embeddings = embedding_model.encode(batch)
            embeddings.extend(batch_embeddings)
        embeddings = np.array(embeddings, dtype=np.float32)

        current_text = sentences[0]
        current_embedding = embeddings[0]
        chunk_id = 0

        for i in range(1, len(sentences)):
            similarity = cosine_similarity(
                current_embedding.reshape(1, -1),
                embeddings[i].reshape(1, -1)
            )[0][0]

            if similarity >= similarity_threshold:
                current_text += " " + sentences[i]
                current_embedding = np.mean([current_embedding, embeddings[i]], axis=0)
            else:
                all_chunks.append({
                    "id": f"{cid}_semantic_{chunk_id}",
                    "text": current_text.strip(),
                    "Complaint ID": cid,
                    "Product": product,
                    "strategy": "semantic"
                })
                current_text = sentences[i]
                current_embedding = embeddings[i]
                chunk_id += 1

        # Append last chunk
        all_chunks.append({
            "id": f"{cid}_semantic_{chunk_id}",
            "text": current_text.strip(),
            "Complaint ID": cid,
            "Product": product,
            "strategy": "semantic"
        })

    return pd.DataFrame(all_chunks)
    '''


In [59]:
import pandas as pd
import numpy as np
from nltk import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def fast_semantic_chunking(
    df,
    embedding_model,
    similarity_threshold=0.75,
    batch_size=64
):
    """
    Merge consecutive, semantically similar sentences of each complaint into chunks.

    For every non-blank narrative the sentences are embedded (`batch_size`
    sentences per `encode` call). Walking the sentences in order, a sentence
    joins the current chunk when its cosine similarity to the chunk's running
    embedding is at least `similarity_threshold`; the running embedding is then
    replaced by the mean of itself and the new sentence embedding. Otherwise
    the current chunk is closed and the sentence opens a new one.

    Returns:
        List[Dict] — directly compatible with add_to_chroma
    """

    def _as_record(complaint_id, product_name, index, body):
        # Shape shared by every emitted chunk.
        return {
            "id": f"{complaint_id}_semantic_{index}",
            "text": body.strip(),
            "metadata": {
                "Complaint ID": complaint_id,
                "Product": product_name,
                "strategy": "semantic"
            }
        }

    narratives = df["Consumer complaint narrative"].fillna("").tolist()
    complaint_ids = df["Complaint ID"].tolist()
    product_names = df["Product"].tolist()

    all_chunks = []
    rows = zip(narratives, complaint_ids, product_names)

    for text, cid, product in tqdm(rows, total=len(narratives)):
        if not text.strip():
            continue

        sentences = sent_tokenize(text)
        if not sentences:
            continue

        # Embed the sentences batch by batch.
        vectors = []
        for start in range(0, len(sentences), batch_size):
            vectors.extend(embedding_model.encode(sentences[start:start + batch_size]))
        vectors = np.asarray(vectors, dtype=np.float32)

        # Grow the pending chunk sentence by sentence.
        pending_text = sentences[0]
        pending_vector = vectors[0]
        chunk_id = 0

        for position in range(1, len(sentences)):
            score = cosine_similarity(
                pending_vector.reshape(1, -1),
                vectors[position].reshape(1, -1)
            )[0][0]

            if score >= similarity_threshold:
                pending_text += " " + sentences[position]
                pending_vector = np.mean([pending_vector, vectors[position]], axis=0)
                continue

            all_chunks.append(_as_record(cid, product, chunk_id, pending_text))
            pending_text = sentences[position]
            pending_vector = vectors[position]
            chunk_id += 1

        # Whatever is still pending is the complaint's last chunk.
        all_chunks.append(_as_record(cid, product, chunk_id, pending_text))

    return all_chunks


In [None]:
#  embedding_model is SentenceTransformer 
chunks_df = fast_semantic_chunking(df, embedding_model, similarity_threshold=0.75, batch_size=64)
# print(chunks_df.head())


### Create ChromaDB

In [40]:
client = chromadb.Client()


In [50]:
def add_to_chroma(collection_name, chunks, batch_size=5000):
    """
    Store `chunks` in the Chroma collection `collection_name`, `batch_size` at a time.

    The collection is fetched or created on the module-level `client` with the
    module-level `embedding_function`. Each chunk is a dict with "id", "text"
    and "metadata" keys. Returns the collection.
    """
    collection = client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_function
    )

    for start in range(0, len(chunks), batch_size):
        ids, documents, metadatas = [], [], []
        for chunk in chunks[start:start + batch_size]:
            ids.append(chunk["id"])
            documents.append(chunk["text"])
            metadatas.append(chunk["metadata"])

        collection.add(ids=ids, documents=documents, metadatas=metadatas)

    return collection


**Build All Indexes**

In [51]:
fixed_collection = add_to_chroma(
    "fixed_length",
    fixed_length_chunking(df_8k , chunk_size=500)
)

In [53]:
sentence_collection = add_to_chroma(
    "sentence_based",
    sentence_based_chunking(df_8k , max_chars=500)
)

In [55]:


recursive_collection = add_to_chroma(
    "recursive",
    recursive_chunking(df_8k, max_chars=500)
)


In [None]:
semantic_collection = add_to_chroma(
    "semantic",
    fast_semantic_chunking(df_8k, embedding_model, similarity_threshold=0.75)
)


**Query  Strategy**

In [None]:
def query_collection(collection, query, top_k=5):
    """Run one text query against `collection` and tabulate the top `top_k` hits.

    Returns a DataFrame with the matched chunk text plus the "Complaint ID",
    "Product" and "strategy" values read from each hit's metadata.
    """
    response = collection.query(query_texts=[query], n_results=top_k)
    documents = response["documents"][0]
    metadatas = response["metadatas"][0]

    columns = {"chunk": documents}
    for field in ("Complaint ID", "Product", "strategy"):
        columns[field] = [meta[field] for meta in metadatas]

    return pd.DataFrame(columns)
query = "Why are customers unhappy with credit cards?"


In [None]:
# Fixed-Length Query
fixed_results = query_collection(
    fixed_collection,
    query
)
fixed_results


In [None]:

# Sentence-Based Query

sentence_results = query_collection(
    sentence_collection,
    query
)
sentence_results


In [None]:

# Recursive Query
recursive_results = query_collection(
    recursive_collection,
    query
)
recursive_results


In [None]:

# Semantic Query
semantic_results = query_collection(
    semantic_collection,
    query
)
semantic_results


In [None]:
# Side-by-Side Comparison
comparison_df = pd.concat(
    [
        fixed_results,
        sentence_results,
        recursive_results,
        semantic_results
    ],
    ignore_index=True
)

comparison_df


In [None]:
# To query only credit card complaints:
semantic_collection.query(
    query_texts=[query],
    n_results=5,
    where={"Product": "Credit card"}
)
