# Dr. X's Publications Analyzer - Jupyter Notebook

## Import necessary libraries

In [1]:
import os
import time
import tempfile
import re
from typing import List, Dict, Tuple, Optional, Union, Any
import csv
import json
import logging
from pathlib import Path

# File handling
import docx
import PyPDF2
import pandas as pd
from io import BytesIO

# Vector DB
import chromadb
from chromadb.utils.embedding_functions import OllamaEmbeddingFunction

# NLP and embeddings
import nltk
from tiktoken import get_encoding
from sentence_transformers import CrossEncoder

# LLM interaction
import ollama
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# Evaluation 
from rouge import Rouge
import torch

# Setup logging: timestamp, level and message on every record, INFO and above.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
logger = logging.getLogger(__name__)

# Initialize tokenizer (shared by TokenCounter for all token statistics)
cl100k_tokenizer = get_encoding("cl100k_base")

## Define system prompts

In [2]:

# Template for LanguageModel.translate_text; filled with str.format using the
# {source_language}, {target_language} and {text} placeholders.
TRANSLATION_PROMPT = """
You are a professional translator. Translate the following text from {source_language} to {target_language}.
Maintain the original structure, formatting, and technical terminology as much as possible.
Here is the text to translate:

{text}
"""

# Template for LanguageModel.summarize_text; {summary_length} receives a
# percentage string such as "30%" and {text} the text to summarize.
SUMMARIZATION_PROMPT = """
You are an expert summarizer. Create a concise summary of the following text that captures the main ideas,
key findings and important details. The summary should be about {summary_length} of the original text.

Text to summarize:
{text}
"""

# Template for LanguageModel.generate_answer; {previous_question} and
# {previous_answer} are filled with empty strings on the first turn.
QA_PROMPT = """
You are an AI assistant tasked with providing detailed answers based solely on the given context. Your goal is to analyze the information provided and formulate a comprehensive, well-structured response to the question.

Context: {context}
Question: {question}
Previous Question (if any): {previous_question}
Previous Answer (if any): {previous_answer}

To answer the question:
1. Thoroughly analyze the context, identifying key information relevant to the question.
2. Take into account the previous question and answer if available to maintain conversation coherence.
3. Organize your thoughts and plan your response to ensure a logical flow of information.
4. Formulate a detailed answer that directly addresses the question, using only the information provided in the context.
5. If the context doesn't contain sufficient information to fully answer the question, state this clearly in your response.

Important: Base your entire response solely on the information provided in the context. Do not include any external knowledge or assumptions not present in the given text.
"""


## Define the TokenCounter class

In [3]:

class TokenCounter:
    """Utility class to count tokens and measure processing speed.

    Intended use is ``start_counting()``, any number of ``add_tokens()``
    calls, then ``end_counting()`` to obtain the metrics dictionary.
    """

    def __init__(self):
        self.start_time = None
        self.end_time = None
        self.token_count = 0

    def start_counting(self):
        """Start the timer for performance measurement and reset the token total."""
        self.start_time = time.time()
        self.token_count = 0

    def add_tokens(self, text: str):
        """Add the number of cl100k_base tokens in ``text`` to the running total.

        ``None`` and empty strings contribute nothing instead of raising.
        """
        if not text:
            return
        # disallowed_special=() makes the tokenizer treat special-token
        # strings that happen to occur in a document (e.g. "<|endoftext|>")
        # as ordinary text; the default setting raises on them.
        tokens = cl100k_tokenizer.encode(text, disallowed_special=())
        self.token_count += len(tokens)

    def end_counting(self) -> dict:
        """End timing and return performance metrics.

        Returns:
            Dictionary with ``elapsed_time_seconds``, ``total_tokens`` and
            ``tokens_per_second``. If ``start_counting`` was never called the
            elapsed time is reported as 0.0 rather than raising.
        """
        self.end_time = time.time()
        if self.start_time is None:
            elapsed_time = 0.0
        else:
            elapsed_time = self.end_time - self.start_time
        # Guard the division: a zero (or clock-skewed negative) interval
        # yields a rate of 0.
        tokens_per_second = self.token_count / elapsed_time if elapsed_time > 0 else 0

        return {
            "elapsed_time_seconds": elapsed_time,
            "total_tokens": self.token_count,
            "tokens_per_second": tokens_per_second
        }


## Define the TextExtractor class


In [4]:
class TextExtractor:
    """Extract text from various file formats (.docx, .pdf, Excel, .csv)."""

    @staticmethod
    def extract_from_docx(file_content: bytes) -> str:
        """Extract text from a .docx file.

        Paragraph texts come first, followed by one line per table row with
        the cell texts joined by " | ".
        """
        doc = docx.Document(BytesIO(file_content))
        full_text = [para.text for para in doc.paragraphs]

        for table in doc.tables:
            for row in table.rows:
                full_text.append(" | ".join(cell.text for cell in row.cells))

        return "\n".join(full_text)

    @staticmethod
    def extract_from_pdf(file_content: bytes) -> List[Dict[str, Union[str, int]]]:
        """Extract text from a PDF file with page numbers.

        Returns:
            One ``{"page_number", "content"}`` dictionary per page, numbered
            from 1. ``content`` is always a string: a page for which the
            reader returns nothing is reported as "".
        """
        reader = PyPDF2.PdfReader(BytesIO(file_content))
        return [
            {
                "page_number": page_number,
                # Normalise a missing extraction result so callers can join
                # and split page contents without a None check.
                "content": page.extract_text() or ""
            }
            for page_number, page in enumerate(reader.pages, start=1)
        ]

    @staticmethod
    def extract_from_excel(file_content: bytes, file_extension: str) -> str:
        """Extract text from Excel files (.xlsx, .xls, .xlsm).

        Every sheet is rendered as a "Sheet: <name>" header followed by the
        DataFrame's string representation. ``file_extension`` is accepted for
        interface compatibility and is not used.
        """
        sheets = pd.read_excel(BytesIO(file_content), sheet_name=None)
        full_text = []

        for sheet_name, sheet_df in sheets.items():
            full_text.append(f"Sheet: {sheet_name}")
            full_text.append(sheet_df.to_string(index=True, header=True))

        return "\n\n".join(full_text)

    @staticmethod
    def extract_from_csv(file_content: bytes) -> str:
        """Extract text from CSV files.

        Rows are returned one per line with the fields joined by " | ".

        Raises:
            UnicodeDecodeError: If the content is not valid UTF-8.
        """
        # Local import: the notebook's import cell only brings in BytesIO.
        import io

        # utf-8-sig strips a leading byte-order mark (written by Excel, among
        # others) that would otherwise end up inside the first cell.
        text = file_content.decode('utf-8-sig')
        # Feed the reader a stream opened with newline='' so that line breaks
        # inside quoted fields survive; pre-splitting with splitlines()
        # silently drops them.
        csv_reader = csv.reader(io.StringIO(text, newline=''))
        return "\n".join(" | ".join(row) for row in csv_reader)

    @classmethod
    def extract_text(cls, file_path: str) -> Dict[str, Any]:
        """Extract text from a file based on its extension.

        Args:
            file_path: Path of the file to read.

        Returns:
            Dictionary with ``text`` (the whole document), ``pages`` (a list
            of ``{"page_number", "content"}`` dictionaries; non-PDF formats
            are reported as a single page) and ``source`` (the file name).

        Raises:
            ValueError: If the file extension is not supported.
        """
        with open(file_path, 'rb') as file:
            file_content = file.read()

        file_extension = os.path.splitext(file_path)[1].lower()
        file_name = os.path.basename(file_path)

        if file_extension == '.pdf':
            pages = cls.extract_from_pdf(file_content)
            text = "\n".join(page["content"] for page in pages)
        else:
            # Each of these formats is parsed once and the result is reused
            # for both the "text" and the single-page "pages" entry.
            if file_extension == '.docx':
                text = cls.extract_from_docx(file_content)
            elif file_extension in ['.xlsx', '.xls', '.xlsm']:
                text = cls.extract_from_excel(file_content, file_extension)
            elif file_extension == '.csv':
                text = cls.extract_from_csv(file_content)
            else:
                raise ValueError(f"Unsupported file extension: {file_extension}")
            pages = [{"page_number": 1, "content": text}]

        return {
            "text": text,
            "pages": pages,
            "source": file_name
        }

## Define the TextChunker class


In [5]:
class TextChunker:
    """Break down texts into smaller, manageable parts."""

    def __init__(self,
                 chunk_size: int = 400,
                 chunk_overlap: int = 100,
                 separators: Optional[List[str]] = None):
        """
        Configure the underlying RecursiveCharacterTextSplitter.

        Args:
            chunk_size: Maximum chunk length handed to the splitter. No
                length function is passed, so the splitter's default length
                measure applies (characters according to the LangChain
                documentation, not cl100k_base tokens).
            chunk_overlap: Overlap between consecutive chunks, in the same unit.
            separators: Split points tried in order; defaults to paragraph
                breaks, line breaks, sentence punctuation, spaces and finally
                the empty string.
        """
        # Build the default per instance: a list literal in the signature
        # would be one object shared by every TextChunker.
        if separators is None:
            separators = ["\n\n", "\n", ".", "?", "!", " ", ""]
        else:
            separators = list(separators)

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=separators
        )

    def chunk_document(self, document: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Chunk a document into smaller parts, page by page.

        Args:
            document: Dictionary with a ``source`` name and a ``pages`` list
                of ``{"page_number", "content"}`` dictionaries

        Returns:
            List of ``{"text", "metadata"}`` dictionaries. The metadata holds
            ``source``, ``page`` and ``chunk``, where ``chunk`` numbers the
            chunks from 1 across the whole document.
        """
        chunks = []
        chunk_counter = 0

        for page in document["pages"]:
            page_content = page["content"]

            # Pages without extractable text (None, "" or whitespace only)
            # cannot produce chunks, so skip them before building a Document.
            if not page_content or not page_content.strip():
                continue

            doc = Document(
                page_content=page_content,
                metadata={
                    "source": document["source"],
                    "page": page["page_number"]
                }
            )

            for split_doc in self.text_splitter.split_documents([doc]):
                chunk_counter += 1
                split_doc.metadata["chunk"] = chunk_counter
                chunks.append({
                    "text": split_doc.page_content,
                    "metadata": split_doc.metadata
                })

        return chunks


## Define the VectorDatabase class

In [6]:

class VectorDatabase:
    """Create and manage vector database for document chunks."""

    def __init__(self,
                 db_path: str = "./vector_db",
                 collection_name: str = "dr_x_publications",
                 embedding_url: str = "http://localhost:11434/api/embeddings",
                 embedding_model: str = "nomic-embed-text:latest"):
        """
        Open (or create) a persistent Chroma collection backed by Ollama embeddings.

        Args:
            db_path: Directory of the persistent Chroma client
            collection_name: Name of the collection to get or create
            embedding_url: Ollama embeddings endpoint
            embedding_model: Name of the Ollama embedding model
        """
        self.db_path = db_path
        self.collection_name = collection_name

        # Create embedding function using Ollama
        self.embedding_function = OllamaEmbeddingFunction(
            url=embedding_url,
            model_name=embedding_model
        )

        # Initialize ChromaDB client
        self.client = chromadb.PersistentClient(path=db_path)

        # Get or create collection; "hnsw:space" selects cosine distance.
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_function,
            metadata={"hnsw:space": "cosine"}
        )

        self.token_counter = TokenCounter()

    def add_documents(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Add document chunks to the vector database.

        Chunks are upserted, so re-adding a chunk with the same
        source/page/chunk triple replaces the stored entry.

        Args:
            chunks: List of document chunks with ``text`` and ``metadata``
                (``source``, ``page`` and ``chunk`` keys are required)

        Returns:
            Dictionary containing performance metrics. An empty ``chunks``
            list touches nothing in the collection and reports zero tokens.
        """
        self.token_counter.start_counting()

        # Nothing to store (e.g. a document without extractable text): do not
        # send an empty batch to the collection.
        if not chunks:
            return self.token_counter.end_counting()

        documents, metadatas, ids = [], [], []

        for chunk in chunks:
            text = chunk["text"]
            metadata = chunk["metadata"]

            documents.append(text)
            metadatas.append(metadata)
            # ID is "<source>_<page>_<chunk>".
            ids.append(f"{metadata['source']}_{metadata['page']}_{metadata['chunk']}")

            self.token_counter.add_tokens(text)

        self.collection.upsert(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )

        return self.token_counter.end_counting()

    def query(self, query_text: str, n_results: int = 5) -> Tuple[List[str], List[Dict]]:
        """
        Query the vector database for relevant chunks.

        The query's token count and timing are logged at INFO level.

        Args:
            query_text: The query text
            n_results: Number of results to return

        Returns:
            Tuple containing lists of documents and their metadata
        """
        self.token_counter.start_counting()
        self.token_counter.add_tokens(query_text)

        results = self.collection.query(
            query_texts=[query_text],
            n_results=n_results
        )

        performance = self.token_counter.end_counting()
        logger.info(f"Query performance: {performance}")

        # A single query text was sent, so take the first (only) result row.
        return results["documents"][0], results["metadatas"][0]


## Define the LanguageModel class

In [7]:

class LanguageModel:
    """Interface with local LLMs for various NLP tasks.

    Keeps the most recent question/answer pair so that follow-up questions
    can be answered with conversational context.
    """

    def __init__(self, model_name: str = "llama3.2:latest"):
        self.model_name = model_name
        self.token_counter = TokenCounter()
        self.previous_question = None
        self.previous_answer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def _run_prompt(self, prompt: str) -> Tuple[str, Dict[str, Any]]:
        """Send ``prompt`` to Ollama and return (response text, metrics).

        The metrics cover the tokens of the full prompt plus the response and
        the time spent on the call.
        """
        self.token_counter.start_counting()
        self.token_counter.add_tokens(prompt)

        response = ollama.generate(
            model=self.model_name,
            prompt=prompt
        )

        text = response['response']
        self.token_counter.add_tokens(text)

        return text, self.token_counter.end_counting()

    def generate_answer(self, context: str, question: str) -> Tuple[str, Dict[str, Any]]:
        """
        Generate an answer to a question based on context.

        The previous question and answer (if any) are included in the prompt,
        and are replaced by this question/answer pair afterwards.

        Args:
            context: The context for answering the question
            question: The question to answer

        Returns:
            Tuple containing the answer and performance metrics. The token
            total counts the complete prompt that was sent (instructions,
            context, question and previous turn), matching translate_text
            and summarize_text.
        """
        prompt = QA_PROMPT.format(
            context=context,
            question=question,
            previous_question=self.previous_question or "",
            previous_answer=self.previous_answer or ""
        )

        answer, performance = self._run_prompt(prompt)

        # Remember this turn for the next query.
        self.previous_question = question
        self.previous_answer = answer

        return answer, performance

    def translate_text(self, text: str, source_language: str, target_language: str) -> Tuple[str, Dict[str, Any]]:
        """
        Translate text between languages.

        Args:
            text: Text to translate
            source_language: Source language
            target_language: Target language

        Returns:
            Tuple containing translated text and performance metrics
        """
        prompt = TRANSLATION_PROMPT.format(
            source_language=source_language,
            target_language=target_language,
            text=text
        )

        return self._run_prompt(prompt)

    def summarize_text(self, text: str, summary_ratio: float = 0.3) -> Tuple[str, Dict[str, Any]]:
        """
        Summarize text.

        Args:
            text: Text to summarize
            summary_ratio: Ratio of summary length to original text

        Returns:
            Tuple containing summary and performance metrics
        """
        # round() instead of int(): 0.29 * 100 is 28.999... in floating point
        # and would otherwise be truncated to "28%".
        prompt = SUMMARIZATION_PROMPT.format(
            text=text,
            summary_length=f"{round(summary_ratio * 100)}%"
        )

        return self._run_prompt(prompt)

    def reset_conversation_context(self):
        """Reset the conversation context."""
        self.previous_question = None
        self.previous_answer = None

## Define the DocumentAnalyzer class

In [8]:

class DocumentAnalyzer:
    """Main class for analyzing Dr. X's publications.

    Combines text extraction, chunking, the vector database and the local
    language model into ingestion, question answering, translation and
    summarization operations. The metrics of every completed operation are
    appended to ``performance_metrics``.
    """

    # Extensions accepted by process_directory (lower-case, with the dot).
    SUPPORTED_EXTENSIONS = ('.docx', '.pdf', '.csv', '.xlsx', '.xls', '.xlsm')

    # Model identifier handed to CrossEncoder for re-ranking.
    CROSS_ENCODER_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2'

    def __init__(self,
                 vector_db_path: str = "./vector_db",
                 collection_name: str = "dr_x_publications",
                 llm_model: str = "llama3.2:latest"):

        self.text_extractor = TextExtractor()
        self.text_chunker = TextChunker()
        self.vector_db = VectorDatabase(vector_db_path, collection_name)
        self.language_model = LanguageModel(llm_model)
        self.rouge = Rouge()
        # One entry per completed operation: the operation name plus its metrics.
        self.performance_metrics: List[Dict[str, Any]] = []
        # Cross-encoder used for re-ranking; created on first use.
        self._cross_encoder = None

    def _record_performance(self, operation: str, performance: Dict[str, Any]) -> None:
        """Append one operation's metrics, tagged with its name, to the history."""
        self.performance_metrics.append({"operation": operation, **performance})

    def process_document(self, file_path: str) -> Dict[str, Any]:
        """
        Process a document and add it to the vector database.

        Args:
            file_path: Path to the document

        Returns:
            Dictionary with ``file_path``, ``chunks_created`` and ``performance``
        """
        logger.info(f"Processing document: {file_path}")

        document = self.text_extractor.extract_text(file_path)
        chunks = self.text_chunker.chunk_document(document)
        performance = self.vector_db.add_documents(chunks)
        self._record_performance("process_document", performance)

        return {
            "file_path": file_path,
            "chunks_created": len(chunks),
            "performance": performance
        }

    def process_directory(self, directory_path: str) -> List[Dict[str, Any]]:
        """
        Process all supported documents in a directory (non-recursive).

        Files are handled in sorted name order. A file that fails is logged
        and reported instead of aborting the run.

        Args:
            directory_path: Path to the directory

        Returns:
            One dictionary per supported file: the process_document result,
            or ``{"file_path", "error"}`` when processing raised.
        """
        results = []

        # os.listdir returns entries in arbitrary order; sort for repeatable runs.
        for entry in sorted(os.listdir(directory_path)):
            file_path = os.path.join(directory_path, entry)
            if not os.path.isfile(file_path):
                continue
            if os.path.splitext(file_path)[1].lower() not in self.SUPPORTED_EXTENSIONS:
                continue

            try:
                results.append(self.process_document(file_path))
            except Exception as e:
                logger.error(f"Error processing {file_path}: {str(e)}")
                results.append({
                    "file_path": file_path,
                    "error": str(e)
                })

        return results

    def answer_question(self, question: str, n_results: int = 5) -> Dict[str, Any]:
        """
        Answer a question using the RAG system.

        Retrieves ``n_results`` chunks, re-ranks them with the cross-encoder
        (falling back to the retrieval order if re-ranking fails) and asks
        the language model to answer from the resulting context.

        Args:
            question: The question to answer
            n_results: Number of chunks to retrieve from the vector database

        Returns:
            Dictionary with ``question``, ``answer``, ``source_documents``,
            ``source_metadata`` and ``performance``
        """
        documents, metadatas = self.vector_db.query(question, n_results)

        try:
            relevant_documents, relevant_indices = self.rerank_documents(question, documents)
            relevant_metadata = [metadatas[i] for i in relevant_indices]
        except Exception:
            # Re-ranking is best effort, but only ordinary errors are
            # absorbed (not KeyboardInterrupt/SystemExit) and the traceback
            # is kept in the log.
            logger.warning("Cross-encoder reranking failed, using original ranking", exc_info=True)
            relevant_documents = documents
            relevant_metadata = metadatas

        context = "\n\n".join(relevant_documents)

        answer, performance = self.language_model.generate_answer(context, question)
        self._record_performance("answer_question", performance)

        return {
            "question": question,
            "answer": answer,
            "source_documents": relevant_documents,
            "source_metadata": relevant_metadata,
            "performance": performance
        }

    def rerank_documents(self, question: str, documents: List[str], top_k: int = 3) -> Tuple[List[str], List[int]]:
        """
        Re-rank documents using CrossEncoder for more accurate relevance.

        The cross-encoder is created on the first call and reused afterwards.

        Args:
            question: The question
            documents: List of documents to rank
            top_k: Number of top documents to return

        Returns:
            Tuple containing the re-ranked documents (best first) and their
            indices in ``documents``. Both lists are empty when there are no
            documents or ``top_k`` is not positive.
        """
        # top_k <= 0 must be handled here: a slice of [-0:] would select
        # every document instead of none.
        if not documents or top_k <= 0:
            return [], []

        pairs = [[question, doc] for doc in documents]

        if self._cross_encoder is None:
            self._cross_encoder = CrossEncoder(
                self.CROSS_ENCODER_MODEL,
                device=self.language_model.device
            )

        scores = self._cross_encoder.predict(pairs)

        # argsort is ascending: take the last top_k and reverse for best-first.
        top_indices = scores.argsort()[-top_k:][::-1]
        top_documents = [documents[i] for i in top_indices]

        return top_documents, top_indices.tolist()

    def translate_document(self, file_path: str, source_language: str, target_language: str) -> Dict[str, Any]:
        """
        Translate a document from one language to another, page by page.

        Args:
            file_path: Path to the document
            source_language: Source language
            target_language: Target language

        Returns:
            Dictionary with ``original_file``, ``source_language``,
            ``target_language``, ``translated_pages`` and the ``performance``
            accumulated over all pages
        """
        document = self.text_extractor.extract_text(file_path)

        translated_pages = []
        total_performance = {"elapsed_time_seconds": 0, "total_tokens": 0, "tokens_per_second": 0}

        for page in document["pages"]:
            content = page["content"]

            # A page without text has nothing to translate; keep its slot so
            # page numbering stays aligned, but do not call the model.
            if not content or not content.strip():
                translated_pages.append({
                    "page_number": page["page_number"],
                    "content": content or ""
                })
                continue

            translated_text, performance = self.language_model.translate_text(
                content, source_language, target_language
            )

            translated_pages.append({
                "page_number": page["page_number"],
                "content": translated_text
            })

            total_performance["elapsed_time_seconds"] += performance["elapsed_time_seconds"]
            total_performance["total_tokens"] += performance["total_tokens"]

        # Overall rate across the pages that were translated.
        if total_performance["elapsed_time_seconds"] > 0:
            total_performance["tokens_per_second"] = (
                total_performance["total_tokens"] / total_performance["elapsed_time_seconds"]
            )

        self._record_performance("translate_document", total_performance)

        return {
            "original_file": file_path,
            "source_language": source_language,
            "target_language": target_language,
            "translated_pages": translated_pages,
            "performance": total_performance
        }

    def summarize_document(self, file_path: str, summary_ratio: float = 0.3) -> Dict[str, Any]:
        """
        Summarize a document.

        Args:
            file_path: Path to the document
            summary_ratio: Ratio of summary length to original text

        Returns:
            Dictionary with ``original_file``, ``summary``, ``rouge_scores``
            (summary scored against the full text; ``None`` if scoring
            failed) and ``performance``
        """
        document = self.text_extractor.extract_text(file_path)

        # Treat pages without text as empty strings so the join cannot fail.
        full_text = "\n\n".join((page["content"] or "") for page in document["pages"])

        summary, performance = self.language_model.summarize_text(full_text, summary_ratio)
        self._record_performance("summarize_document", performance)

        # Scoring is only an evaluation step; a failure there should not
        # discard the summary that was already generated.
        # NOTE(review): the rouge package is understood to raise ValueError
        # for empty input and RecursionError on very long texts — confirm.
        try:
            rouge_scores = self.rouge.get_scores(summary, full_text)[0]
        except (ValueError, RecursionError):
            logger.warning("ROUGE evaluation failed; returning summary without scores", exc_info=True)
            rouge_scores = None

        return {
            "original_file": file_path,
            "summary": summary,
            "rouge_scores": rouge_scores,
            "performance": performance
        }

    def reset_conversation(self):
        """Reset the conversation context in the language model."""
        self.language_model.reset_conversation_context()


## Initialize the analyzer

In [9]:

# Build the analyzer with its defaults: a persistent Chroma store under
# ./vector_db, the "dr_x_publications" collection and the llama3.2:latest model.
analyzer = DocumentAnalyzer()

2025-04-13 18:47:00,865 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


## Process a single document

In [10]:

# Replace with your own file path; TextExtractor.extract_text accepts
# .docx, .pdf, .csv, .xlsx, .xls and .xlsm files.
document_path = "Files/The_Plan_of_the_Giza_Pyramids.pdf"

In [11]:
# Keep this cell's output under its own name: the question, translation and
# summary cells each build a dictionary with different keys, and sharing one
# generic variable makes out-of-order re-runs print from the wrong result.
process_result = analyzer.process_document(document_path)
print(f"Processed document: {document_path}")
print(f"Created {process_result['chunks_created']} chunks")
print(f"Performance: {process_result['performance']}")

2025-04-13 18:47:14,139 - INFO - Processing document: Files/The_Plan_of_the_Giza_Pyramids.pdf
2025-04-13 18:47:17,431 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-04-13 18:47:17,455 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-04-13 18:47:17,476 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-04-13 18:47:17,501 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-04-13 18:47:17,529 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-04-13 18:47:17,557 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-04-13 18:47:17,577 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-04-13 18:47:17,598 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-04-13 18:47:17,616 - INFO - HTTP Requ

Processed document: Files/The_Plan_of_the_Giza_Pyramids.pdf
Created 133 chunks
Performance: {'elapsed_time_seconds': 6.028315544128418, 'total_tokens': 11439, 'tokens_per_second': 1897.5449968178243}


## Process multiple documents from a directory

In [1]:

# Replace with your own directory path
directory_path = "Files"


# One entry per supported file; failed files carry an "error" key instead of
# "chunks_created". Dedicated names keep the loop from overwriting the
# result variables of other cells.
directory_results = analyzer.process_directory(directory_path)
print(f"Processed {len(directory_results)} documents")
for directory_result in directory_results:
    if "error" in directory_result:
        print(f"Error processing {directory_result['file_path']}: {directory_result['error']}")
    else:
        print(f"Processed {directory_result['file_path']}: {directory_result['chunks_created']} chunks")

NameError: name 'analyzer' is not defined

## Ask questions about the processed documents

In [23]:

question = "can glen get caught up in  stuff ?"


# Retrieve 5 chunks, re-rank them and answer from the best ones.
qa_result = analyzer.answer_question(question, n_results=5)
print("\nQuestion:", qa_result["question"])
print("\nAnswer:", qa_result["answer"])
print("\nSource documents:")
for i, (doc, meta) in enumerate(zip(qa_result["source_documents"], qa_result["source_metadata"]), start=1):
    print(f"\n--- Document {i} ---")
    print(f"Source: {meta['source']}, Page: {meta['page']}, Chunk: {meta['chunk']}")
    # Show at most the first 150 characters of each source chunk.
    preview = (doc[:150] + "...") if len(doc) > 150 else doc
    print(preview)
print("\nPerformance:", qa_result["performance"])

2025-04-13 19:09:07,056 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-04-13 19:09:07,068 - INFO - Query performance: {'elapsed_time_seconds': 2.1565732955932617, 'total_tokens': 10, 'tokens_per_second': 4.636985916701271}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-04-13 19:09:12,549 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"



Question: can glen get caught up in  stuff ?

Answer: Based on the provided context, it appears that there is no direct information about Glen getting caught up in "stupid stuff" or any other topic related to personal behavior or interests. The text only discusses the surveys conducted by Mark Lehner and David Goodman as part of the Giza Plateau Mapping Project.

Given this lack of relevant information, I can suggest a few possibilities:

1. The question may be unrelated to the provided context, which focuses on historical archaeological surveys.
2. The context does not provide enough information to answer the question, and further clarification or context would be needed to provide a meaningful response.
3. It is possible that Glen Dash is an individual involved in the surveys or research being discussed, but without more context, it's difficult to say whether this is relevant to the question.

In light of these possibilities, I must conclude that the context does not contain suffici

## Translate a document

In [14]:

# Replace with your own file path
translate_file_path = "Files/The_Plan_of_the_Giza_Pyramids.pdf"
source_language = "English"
target_language = "Arabic"


# Pages are translated one at a time; this issues one model call per page.
translation_result = analyzer.translate_document(translate_file_path, source_language, target_language)
print(f"\nTranslation from {source_language} to {target_language}:")
for page in translation_result["translated_pages"]:
    print(f"\n--- Page {page['page_number']} ---")
    # Show at most the first 300 characters of each translated page.
    page_text = page["content"]
    preview = (page_text[:300] + "...") if len(page_text) > 300 else page_text
    print(preview)
print("\nPerformance:", translation_result["performance"])


2025-04-13 18:49:42,782 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-04-13 18:49:49,830 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-04-13 18:50:02,182 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-04-13 18:50:15,654 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-04-13 18:50:30,633 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-04-13 18:50:41,989 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-04-13 18:50:50,528 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-04-13 18:50:56,788 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-04-13 18:51:05,245 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-04-13 18:51:10,638 - INFO - HTTP Request:


Translation from English to Arabic:

--- Page 1 ---
Here is the translation of the text from English to Arabic:


الวางة المعمارية للقصر الصخري

1 


الวางة المعمارية للقصر الصخري 

كتاب 
من قبل John A.R. Legon 
تاريخ التعديل: 22-11-2019 
تم إجراء تقييمات مبكرة على وازن القصر الصخري للثلاثة piramides في وقت مبكر، وتمเผجتها في كتاب صدر عن مجلة جمعية S...

--- Page 2 ---
The translation is as follows:


معالجة النماذج السابقة لضمان الحفاظ على الشكل الأصلي للنص، كما يُشjet الخصائص الفنية مثل الكلمات التكنولوجية.  


يُقترحوجود نظام ضمني متجاوز للتنظيم الوضعي للممorialات الثلاث على طبة الجيزة في الأوامر الأولى من خلال المنظمة العادية لذلك الركاز الثلاث على الطفاح الجي...

--- Page 3 ---
Here is the translation of the text from English to Arabic:

النص السابق هو: 
الخططة للقلاع الجيزية 3
يُقدم أبعاد القاعدة الثلاثة للقلاع المشبعة، التي تم تحديدها من قبل بارتي ، في طbl I ، مع الاضطرابات المتوسطة في الأطوال والاتجاهات الثلاثة للمشبعة بالحرف الصحيح، مع استعمال البيانات المتوفرة في طbl ...

--

## Summarize a document

In [12]:

# Replace with your own file path
summarize_file_path = "Files/The_Plan_of_the_Giza_Pyramids.pdf"
summary_ratio = 0.3  # 30% of original length


# The whole document is summarized in a single model call and the summary is
# scored against the full text with ROUGE.
summary_result = analyzer.summarize_document(summarize_file_path, summary_ratio)
print("\nSummary:")
print(summary_result["summary"])
print("\nROUGE Scores:")
print(json.dumps(summary_result["rouge_scores"], indent=2))
print("\nPerformance:", summary_result["performance"])

2025-04-12 21:09:35,928 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"



Summary:
This text is a lengthy and detailed discussion about the author's theories regarding the construction and purpose of the Great Pyramid of Giza, as well as the broader context of ancient Egyptian history.

The author presents several key points:

1. **Challenging conventional wisdom**: The author disputes the traditional view that the Great Pyramid was built during the reign of Pharaoh Khufu of the Fourth Dynasty. Instead, they propose that the pyramid may predate this dynasty.
2. **Alternative purpose**: The author suggests that the pyramids were not primarily tombs for pharaohs but rather a center for the initiation of priest-neophytes into the occult wisdom of ancient Egypt.
3. **Evidence from Egyptian records**: The author cites records preserved by the Egyptians themselves, such as the Turin King List and the history of Manetho, which mention an era known as Zep Tepi (the First Time), when Egypt was ruled by priest-initiates who founded the divine dynasties.
4. **The Grea

## Reset conversation context

In [None]:

# Forget the previous question/answer pair so the next answer_question call
# is not conditioned on an earlier turn.
analyzer.reset_conversation()
print("Conversation context reset")

## Display performance metrics

In [17]:

# Report recorded metrics only when the analyzer exposes a non-empty
# performance_metrics attribute; otherwise this cell prints nothing.
recorded_metrics = getattr(analyzer, "performance_metrics", None)
if recorded_metrics:
    print("\nPerformance Metrics:")
    metrics_df = pd.DataFrame(recorded_metrics)
    display(metrics_df)

    # Plot tokens per second over time
    if "tokens_per_second" in metrics_df.columns:
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(metrics_df["tokens_per_second"])
        ax.set_title("Tokens Per Second")
        ax.set_xlabel("Operation")
        ax.set_ylabel("Tokens/Second")
        ax.grid(True)
        plt.show()