# Install dependencies

# Set up ChromaDB

In [2]:
import chromadb

client = chromadb.PersistentClient(path="../db/chroma")

In [3]:
collection_name = "text_documents"

In [4]:
# Delete collection if it exists (optional)
try:
    client.delete_collection(name=collection_name)
except:
    pass

In [5]:
from datetime import datetime

collection = client.create_collection(
    name=collection_name, 
    metadata={
        "description": "text documents collection",
        "created": str(datetime.now())
    }  
)

# Set up Text Processing Functions

In [6]:
import os
import glob
import uuid
import tiktoken

# Initialize the tokenizer for token counting
tokenizer = tiktoken.get_encoding("cl100k_base")  # Using OpenAI's tokenizer for approximation

def count_tokens(text):
    """Count the number of tokens in a text string"""
    return len(tokenizer.encode(text))

def split_text_by_tokens(text, max_tokens=300, overlap_tokens=100):
    """Split text into chunks with a maximum token count and optional overlap"""
    # First, tokenize the entire text
    tokens = tokenizer.encode(text)
    
    chunks = []
    chunk_start = 0
    
    while chunk_start < len(tokens):
        # Get chunk end position
        chunk_end = min(chunk_start + max_tokens, len(tokens))
        
        # Get the tokens for this chunk and decode back to text
        chunk_tokens = tokens[chunk_start:chunk_end]
        chunk_text = tokenizer.decode(chunk_tokens)
        
        # Add the chunk to our list
        chunks.append(chunk_text)
        
        # Move to the next position with overlap consideration
        chunk_start += max_tokens - overlap_tokens
        # Avoid getting stuck in an infinite loop
        if chunk_start >= chunk_end:
            chunk_start = chunk_end
            
    return chunks

def load_txt_file(file_path):
    """Load content from a text file"""
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        return file.read()

In [7]:
from langchain_ollama import OllamaEmbeddings

embedding_model = "nomic-embed-text"

embeddings = OllamaEmbeddings(
    model=embedding_model,
)

def embedding_generation(text):
    """Generate embedding for a text chunk"""
    return embeddings.embed_query(text)

# Process Text Files and Generate Embeddings

In [8]:
def process_text_files(directory_path):
    """Process all text files in a directory, split into chunks, and generate embeddings"""
    # Get all .txt files in the directory
    txt_files = glob.glob(os.path.join(directory_path, "*.txt"))
    
    all_chunks = []
    
    # Process each file
    for file_path in txt_files:
        file_name = os.path.basename(file_path)
        print(f"Processing {file_name}")
        
        # Load the text content
        text_content = load_txt_file(file_path)
        
        # Split into chunks
        chunks = split_text_by_tokens(text_content)
        print(f"  Split into {len(chunks)} chunks")
        
        # Process each chunk
        for i, chunk in enumerate(chunks):
            chunk_id = str(uuid.uuid4())
            embedding = embedding_generation(chunk)
            
            chunk_data = {
                "id": chunk_id,
                "source_file": file_name,
                "chunk_index": i,
                "text": chunk,
                "token_count": count_tokens(chunk),
                "embedding": embedding
            }
            
            all_chunks.append(chunk_data)
    
    print(f"Processed a total of {len(all_chunks)} chunks from {len(txt_files)} files")
    return all_chunks

In [9]:
def add_chunks_to_collection(chunks, collection):
    """Add processed chunks to the ChromaDB collection"""
    # Add chunks in batches to avoid potential issues with large datasets
    batch_size = 100
    
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]
        
        # Extract the data needed for the collection
        ids = [chunk["id"] for chunk in batch]
        documents = [chunk["text"] for chunk in batch]
        embeddings_list = [chunk["embedding"] for chunk in batch]
        metadatas = [
            {
                "source_file": chunk["source_file"],
                "chunk_index": chunk["chunk_index"],
                "token_count": chunk["token_count"]
            } for chunk in batch
        ]
        
        # Add to collection
        collection.add(
            ids=ids,
            documents=documents,
            embeddings=embeddings_list,
            metadatas=metadatas
        )
        
        print(f"Added batch {i//batch_size + 1} to collection")

In [10]:
# Save processed chunks to a file
import json

def save_processed_chunks(chunks, output_path):
    """Save processed chunks to a JSON file"""
    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    # Save the data
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(chunks, file, ensure_ascii=False, indent=2)
    
    print(f"Saved processed chunks to {output_path}")

# Main Execution

In [11]:
# Directory containing text files
txt_directory = "../data/txt_files"
output_file = "../embeddings/text_embeddings.json"

In [12]:
# Process all text files
processed_chunks = process_text_files(txt_directory)

Processing 180824_Medellin_105x190_ES_ConvBureau.txt
  Split into 58 chunks
Processing Medelln.txt
  Split into 13 chunks
Processed a total of 71 chunks from 2 files


In [13]:
# Save processed chunks
save_processed_chunks(processed_chunks, output_file)

Saved processed chunks to ../embeddings/text_embeddings.json


In [14]:
# Add chunks to ChromaDB collection
add_chunks_to_collection(processed_chunks, collection)

Added batch 1 to collection


In [15]:
# Verify the number of documents in the collection
print(f"Total documents in collection: {collection.count()}")

Total documents in collection: 71


# Testing Retrieval

In [16]:
def query_similar_chunks(query_text, n_results=10):
    """Query the collection for chunks similar to the query text"""
    query_embedding = embedding_generation(query_text)
    
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
    )
    
    return results

In [None]:
# Test query - replace with your own query
test_query = "Api"
results = query_similar_chunks(test_query)

# Display results
for i, (doc, metadata, distance) in enumerate(zip(results['documents'][0], results['metadatas'][0], results['distances'][0])):
    print(f"\nResult {i+1} (similarity: {1-distance:.4f})")
    print(f"Source: {metadata['source_file']}, Chunk: {metadata['chunk_index']}")
    print(f"Preview: {doc[:200]}...")


Result 1 (similarity: 0.4674)
Source: Medelln.txt, Chunk: 0
Preview: Guía de Medellín

Medellín, ubicada dentro del departamento de Antioquia, es una ciudad dinámica que ha sufrido
una transformación cultural, social y económica en los últimos años.
Fundada hace más de...

Result 2 (similarity: 0.4448)
Source: 180824_Medellin_105x190_ES_ConvBureau.txt, Chunk: 5
Preview:  desconectado.

2
Introducción

7

Greater Medellín Convention
and Visitors Bureau
“La Ciudad más Innovadora del Mundo” en 2013 y el “Mejor Destino
Turístico de Sudamérica” en 2018, son algunos de lo...

Result 3 (similarity: 0.4030)
Source: 180824_Medellin_105x190_ES_ConvBureau.txt, Chunk: 8
Preview: , la Catedral
es el principal templo de la Arquidiócesis de Medellín, declarada
Monumento Nacional de Colombia
el 12 de marzo de 1982. Se dice
que es la iglesia más grande del
mundo construida en ladr...

Result 4 (similarity: 0.4000)
Source: 180824_Medellin_105x190_ES_ConvBureau.txt, Chunk: 31
Preview: acido en Medellín