In [10]:
import os
from dotenv import load_dotenv
load_dotenv()

print(bool(os.getenv("OPENAI_API_KEY")))

True


In [11]:
import PyPDF2

# Open the PDF file
with open('coffee_processing.pdf', 'rb') as file:
    # Create a PDF reader object
    pdf_reader = PyPDF2.PdfReader(file)
    
    # Get the number of pages
    num_pages = len(pdf_reader.pages)
    print(f"Total pages: {num_pages}")
    
    # Initialize variable to store all text
    all_text = []
    
    # Extract text from all pages
    for page_num in range(num_pages):
        page = pdf_reader.pages[page_num]
        text = page.extract_text()
        if text:  # Only add if page has text
            all_text.append(text.strip())
    
    # Combine all text with double newline between pages
    text_document = "\n\n".join(all_text)

    text_document = text_document.replace("Explore our developer-friendly HTML  to PDF API Printed using PDFCrowd HTML  to PDF", "")
    
    # Print the combined text
    print("\n=== COMBINED TEXT ===\n")
    print(text_document)
    
    # Optionally print stats
    print(f"\n\nTotal characters: {len(text_document)}")

Total pages: 4

=== COMBINED TEXT ===

Coffee Processing Methods
A Comprehensive Guide to Post-Harvest Coffee Processing
Coffee processing began in Ethiopia over 1,000 years ago.
Introduction to Coffee Processing
Coffee processing is the method used to transform freshly picked coffee cherries into green coffee
beans ready for roasting. The processing method significantly impacts the final flavor profile of the
coffee. Different regions around the world have developed unique processing techniques based on
their climate, available resources, and traditional practices. The choice of processing method can
enhance certain flavor characteristics while suppressing others, making it one of the most critical
decisions in coffee production.
Processing begins immediately after harvest, as coffee cherries are highly perishable. The outer
fruit must be removed, and the beans must be dried to prevent spoilage. The manner in which this
is accomplished varies widely, from ancient sun-drying methods to

In [12]:
from annotated_types import doc
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=300,
    chunk_overlap=2,
    length_function=len,
    is_separator_regex=False,
)
chunks = text_splitter.create_documents([text_document])

print(f'Total no of chunks: {len(chunks)}', end='\n')

for chunk in chunks:
    print(chunk.page_content, chunk.metadata)
    print('-'*20)


Total no of chunks: 20
Coffee Processing Methods
A Comprehensive Guide to Post-Harvest Coffee Processing
Coffee processing began in Ethiopia over 1,000 years ago.
Introduction to Coffee Processing
Coffee processing is the method used to transform freshly picked coffee cherries into green coffee {}
--------------------
beans ready for roasting. The processing method significantly impacts the final flavor profile of the
coffee. Different regions around the world have developed unique processing techniques based on
their climate, available resources, and traditional practices. The choice of processing method can {}
--------------------
enhance certain flavor characteristics while suppressing others, making it one of the most critical
decisions in coffee production.
Processing begins immediately after harvest, as coffee cherries are highly perishable. The outer {}
--------------------
fruit must be removed, and the beans must be dried to prevent spoilage. The manner in which this
is accomp

In [13]:
from langchain_openai import OpenAIEmbeddings
import chromadb

# Initialize embeddings model
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Set up ChromaDB client with persistent storage
chroma_client = chromadb.PersistentClient(path="chroma_db")

# Delete existing collection if it exists (truncate)
try:
    chroma_client.delete_collection(name="my_documents_basic_chunking")
    print("Existing collection deleted")
except:
    print("No existing collection found")

# Create new collection
collection = chroma_client.create_collection(
    name="my_documents_basic_chunking",
    metadata={"description": "Document embeddings collection"}
)

Existing collection deleted


In [14]:
# Calculate embeddings in batches and prepare data
documents = []
embeddings_list = []
ids = []

# Extract all texts first
all_texts = []
for idx, chunk in enumerate(chunks):
    text = chunk["page_content"] if isinstance(chunk, dict) else chunk.page_content
    all_texts.append(text)
    ids.append(f"doc_{idx}")

# Calculate embeddings in batches
batch_size = 100  # Adjust based on your needs and API limits
total_batches = (len(all_texts) + batch_size - 1) // batch_size

print(f"Processing {len(all_texts)} documents in {total_batches} batches...")

for i in range(0, len(all_texts), batch_size):
    batch_texts = all_texts[i:i + batch_size]
    
    # Calculate embeddings for the batch
    batch_embeddings = embeddings_model.embed_documents(batch_texts)
    
    # Add to main lists
    documents.extend(batch_texts)
    embeddings_list.extend(batch_embeddings)
    
    batch_num = (i // batch_size) + 1
    print(f"Processed batch {batch_num}/{total_batches} ({len(batch_texts)} documents)")

print(f"\nTotal embeddings calculated: {len(embeddings_list)}")

# Insert all data into ChromaDB
collection.add(
    documents=documents,
    embeddings=embeddings_list,
    ids=ids
)

print(f"Successfully added {len(documents)} documents to ChromaDB")

Processing 20 documents in 1 batches...
Processed batch 1/1 (20 documents)

Total embeddings calculated: 20
Successfully added 20 documents to ChromaDB


In [15]:
# Perform similarity search
query = """What are the environmental concerns and solutions related to coffee processing"""

# Calculate query embedding
query_embedding = embeddings_model.embed_query(query)

# Search for top K similar documents
k = 10  # Number of results to return

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=k
)

# Print results
print(f"Query: '{query}'\n")
print(f"Top {k} similar documents:\n")
print("=" * 80)

for i in range(len(results['ids'][0])):
    distance = results['distances'][0][i]
    
    # Correct conversion: cosine similarity = 1 - cosine distance
    # Then convert to percentage
    cosine_similarity = 1 - distance
    confidence_percentage = cosine_similarity * 100
    
    print(f"\nRank {i + 1}:")
    print(f"ID: {results['ids'][0][i]}")
    print(f"Raw Text: {results['documents'][0][i]}")
    print(f"Cosine Distance: {distance:.4f}")
    print(f"Cosine Similarity: {cosine_similarity:.4f}")
    print(f"Confidence: {confidence_percentage:.2f}%")
    print("-" * 80)

Query: 'What are the environmental concerns and solutions related to coffee processing'

Top 10 similar documents:


Rank 1:
ID: doc_4
Raw Text: of production and the environmental impact of coffee farming.
The global coffee industry processes approximately 10 million tons of coffee cherries annually, with
processing methods varying by region and producer size. Small-scale farmers often rely on
Cosine Distance: 0.6920
Cosine Similarity: 0.3080
Confidence: 30.80%
--------------------------------------------------------------------------------

Rank 2:
ID: doc_12
Raw Text: systems and eco-pulping technologies to reduce water consumption by up to 90 percent while
maintaining quality standards.
Natural Processing Method
Natural processing, the oldest coffee processing method, requires minimal equipment and no water
Cosine Distance: 0.8693
Cosine Similarity: 0.1307
Confidence: 13.07%
--------------------------------------------------------------------------------

Rank 3:
ID: doc_0
Raw Text