# Arabic Poems Dataset Preparation
This notebook loads the `arbml/ashaar` dataset from Hugging Face and creates embeddings using OpenAI's `text-embedding-3-small` model, then stores them in ChromaDB.

## Features:
- Resume from last checkpoint (RESUME_FROM_LAST)
- Batch processing for efficiency
- Text truncation for long poems (MAX_TEXT_LENGTH)
- Metadata storage (poet, meter, theme, era, location)


In [None]:
# Install required packages (uncomment if needed)
# !pip install datasets openai chromadb tqdm python-dotenv


In [None]:
import os
from datasets import load_dataset
from openai import OpenAI
import chromadb
from tqdm import tqdm
from dotenv import load_dotenv
import time

# Read variables from a .env file, if one is present
load_dotenv()

# Run settings
RESUME_FROM_LAST = True  # continue a previous run instead of starting over
MAX_TEXT_LENGTH = 6000  # character cap applied to text before embedding and storage
BATCH_SIZE = 100  # poems grouped into each ChromaDB write
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Client used for every embedding request in this notebook
openai_client = OpenAI(api_key=OPENAI_API_KEY)

print("‚úÖ Configuration loaded successfully")


In [None]:
# Fetch the "train" split of arbml/ashaar via Hugging Face datasets
print("üìö Loading arbml/ashaar dataset...")
dataset = load_dataset(path="arbml/ashaar", split="train")
print("‚úÖ Dataset loaded successfully")
print(f"üìä Total poems: {len(dataset):,}")


In [None]:
# Show the available columns and one raw record to confirm the schema
print("Dataset columns:", dataset.column_names, sep="\n")
print("\nFirst example:", dataset[0], sep="\n")


In [None]:
# Open the on-disk ChromaDB store under ./arabic_poems_db
print("üóÑÔ∏è Initializing ChromaDB...")
chroma_client = chromadb.PersistentClient(path="./arabic_poems_db")

# Reuse the poems collection when it already exists, otherwise create it
collection = chroma_client.get_or_create_collection(
    metadata={"description": "Arabic poems from arbml/ashaar dataset"},
    name="arabic_poems",
)

# Record how many entries are already stored; later cells use this to resume
existing_count = collection.count()
print("‚úÖ ChromaDB initialized")
print(f"üìä Existing poems in collection: {existing_count:,}")


In [None]:
def get_embedding(text: str) -> list:
    """
    Embed ``text`` with OpenAI's text-embedding-3-small model.

    Input longer than MAX_TEXT_LENGTH characters is cut to that length
    before the request is sent.

    Returns the embedding from the first item of the API response, or
    None when the request raises; the error is printed, not re-raised.
    """
    # Clip over-long input to the configured character limit
    clipped = text[:MAX_TEXT_LENGTH] if len(text) > MAX_TEXT_LENGTH else text

    try:
        result = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input=clipped,
        )
        vector = result.data[0].embedding
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this text
        print(f"Error getting embedding: {e}")
        return None
    return vector


def process_verses(verses) -> str:
    """
    Normalize a poem's verses into a single newline-separated string.

    Args:
        verses: A list or tuple of verses, a single string, or a falsy
            value such as None or "".

    Returns:
        For a list/tuple, the verses joined with newlines; None entries
        are dropped and non-string entries are converted with str() so a
        single malformed verse cannot raise TypeError. For any other
        truthy value, str(verses). For a falsy value, "".
    """
    if isinstance(verses, (list, tuple)):
        return "\n".join(str(verse) for verse in verses if verse is not None)
    return str(verses) if verses else ""


print("‚úÖ Helper functions defined")


In [None]:
# Determine starting point.
# Entries are stored under IDs of the form "poem_<dataset index>", and poems
# with empty text or a failed embedding are never stored. The collection
# count can therefore be smaller than the last processed index, so the resume
# point is taken from the highest stored index rather than from the count.
start_idx = 0
if RESUME_FROM_LAST and existing_count > 0:
    id_prefix = "poem_"
    # include=[] asks ChromaDB for IDs only (no documents or embeddings)
    stored_ids = collection.get(include=[])["ids"]
    stored_indices = [
        int(stored_id[len(id_prefix):])
        for stored_id in stored_ids
        if stored_id.startswith(id_prefix)
        and stored_id[len(id_prefix):].isdecimal()
    ]
    if stored_indices:
        start_idx = max(stored_indices) + 1

if start_idx > 0:
    print(f"üîÑ Resuming from index {start_idx:,}")
else:
    print("üÜï Starting from the beginning")

# Calculate batches
total_poems = len(dataset)
# Clamp at zero so a collection that is already complete yields no batches
remaining = max(total_poems - start_idx, 0)
# Ceiling division: a final partial batch still counts as one batch
num_batches = (remaining + BATCH_SIZE - 1) // BATCH_SIZE

print(f"üìä Total poems: {total_poems:,}")
print(f"üìä Remaining to process: {remaining:,}")
print(f"üìä Number of batches: {num_batches:,}")


In [None]:
# Process poems and create embeddings.
# For each batch of dataset rows: build the text to embed, request one
# embedding per poem, and write the batch to ChromaDB in a single add() call.
# Poems with empty text or a failed embedding are skipped.
print(f"\nüîÑ Starting embeddings calculation and storing in ChromaDB...")

for batch_idx in tqdm(range(num_batches), desc="Processing poems"):
    batch_start = start_idx + (batch_idx * BATCH_SIZE)
    batch_end = min(batch_start + BATCH_SIZE, total_poems)

    # Look up which IDs of this batch are already stored, so a re-run does not
    # pay for their embeddings again or try to add duplicate IDs.
    candidate_ids = [f"poem_{i}" for i in range(batch_start, batch_end)]
    already_stored = set(collection.get(ids=candidate_ids, include=[])["ids"])

    ids = []
    documents = []
    embeddings = []
    metadatas = []

    for i in range(batch_start, batch_end):
        poem_id = f"poem_{i}"
        if poem_id in already_stored:
            continue

        poem = dataset[i]

        # Extract poem data - using arbml/ashaar column names.
        # `or` fallbacks turn None values into strings: a None title would
        # otherwise be embedded as the literal text "None", and None metadata
        # values may be rejected by ChromaDB (version-dependent).
        poem_title = poem.get("poem title") or ""
        poem_text = process_verses(poem.get("poem verses", ""))
        # NOTE(review): the arbml/ashaar dataset card appears to list this
        # column as "poet name"; "poet" is kept as a fallback - confirm
        # against dataset.column_names.
        poet_name = poem.get("poet name") or poem.get("poet") or "Unknown"
        poem_meter = poem.get("poem meter") or ""
        poem_theme = poem.get("poem theme") or ""
        poet_era = poem.get("poet era") or ""
        poet_location = poem.get("poet location") or ""

        # Create full text for embedding
        full_text = f"{poem_title}\n{poem_text}"

        if not full_text.strip():
            continue

        # Get embedding; None means the request failed and the poem is skipped
        embedding = get_embedding(full_text)
        if embedding is None:
            continue

        ids.append(poem_id)
        # Store the same truncated text that was embedded
        documents.append(full_text[:MAX_TEXT_LENGTH])
        embeddings.append(embedding)
        metadatas.append({
            "poet": poet_name,
            "title": poem_title,
            "poem_meter": poem_meter,
            "poem_theme": poem_theme,
            "poet_era": poet_era,
            "poet_location": poet_location
        })

    # Add batch to ChromaDB
    if ids:
        collection.add(
            ids=ids,
            documents=documents,
            embeddings=embeddings,
            metadatas=metadatas
        )

    # Small delay to avoid rate limiting
    time.sleep(0.1)

print(f"\n‚úÖ Completed!")
print(f"üìä Total poems stored: {collection.count():,}")


In [None]:
# Sanity-check what ended up in the collection
print("üîç Verifying collection...")
print(f"üìä Total poems in collection: {collection.count():,}")

# Peek at up to three stored records and list their title and poet
sample = collection.peek(limit=3)
print("\nüìù Sample poems:")
for position, record_meta in enumerate(sample["metadatas"], start=1):
    print(
        f"  {position}. {record_meta.get('title', 'No title')}"
        f" - {record_meta.get('poet', 'Unknown')}"
    )


In [None]:
# Test semantic search: embed an Arabic query and print the five nearest poems
print("üîç Testing semantic search...")

# Arabic for "love and ghazal (love poetry)". The literal was previously
# mis-encoded (UTF-8 bytes decoded with the wrong charset), so the search
# embedded garbage instead of the intended query.
test_query = "الحب والغزل"
query_embedding = get_embedding(test_query)

if query_embedding is None:
    # get_embedding returns None when the API call fails; passing that to
    # collection.query would fail with an unrelated-looking error.
    print("Could not embed the test query; skipping semantic search")
else:
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=5
    )

    print(f"\nSearch results for: '{test_query}'")
    # Results are nested per query; index 0 is the single query sent above
    for i, (doc, meta) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
        print(f"\n{i+1}. {meta.get('title', 'No title')}")
        print(f"   Poet: {meta.get('poet', 'Unknown')}")
        print(f"   Meter: {meta.get('poem_meter', 'Unknown')}")
        print(f"   Preview: {doc[:100]}...")
