In [1]:
import wat

In [2]:
import pathlib

# Root folder of the sample PDFs, relative to the notebook's working directory.
data_dir = pathlib.Path("../data/")

# Document kind -> absolute path (as a string). Every PDF sits in a sub-folder
# of `data_dir` that carries the same name as its kind.
document_paths = {
    kind: str((data_dir / kind / filename).resolve())
    for kind, filename in (
        ('journal', 'JOURNAL_2017_AerationCostsInStirredTankAndBubbleColumnBioreactors.pdf'),
        ('textbook', 'TEXT_DairyProcessingHandbook_WheyProcessingChapter15.pdf'),
        ('report', 'REPORT_ConsultancyOnLargeScaleSubmergedAerobicCultivationProcessDesignNRELGenomatica.pdf'),
        ('article', 'REPORT_2003_OptimizePowerConsumptionInAerobicFermenters.pdf'),
    )
}

In [3]:
from pyzerox import zerox
import os
import json
import asyncio

### Model setup -- use vision models only.
### Provider overview: https://docs.litellm.ai/docs/providers

# Extra keyword arguments that some models may require; `extract_markdown`
# forwards them to `zerox` via **kwargs.
kwargs = dict()

# System prompt handed to the vision model; None means no custom prompt is supplied.
# Example override:
# custom_system_prompt = "For the below pdf page, do something..something..."
custom_system_prompt = None

###################### Supported models and ENV vars ######################
# Short provider alias -> litellm model name (required environment variable
# noted where known). For other providers see https://docs.litellm.ai/docs/providers
models = dict(
    openai="gpt-4o-mini",                    # OPENAI_API_KEY
    gemini="gemini/gemini-1.5-flash",        # GEMINI_API_KEY
    anthropic="claude-3-5-sonnet-20241022",  # ANTHROPIC_API_KEY
    haiku='claude-3-haiku-20240307',
    sonnet='claude-3-5-sonnet-20241022',     # same model string as 'anthropic'
)

# Module-level default model; `extract_markdown` selects its own model from
# its `provider` argument instead of reading this name.
model = models['gemini']

###### Define main async entrypoint
async def extract_markdown(filepath, provider: str = 'haiku', select_pages=None, output_dir=None):
    """Convert a document to markdown with `zerox`, using the model registered for `provider`.

    Args:
        filepath: Local path or http(s) URL of the document. Local paths are
            resolved to an absolute path; URLs are passed through unchanged.
        provider: Key of the module-level `models` mapping that selects the model.
        select_pages: Forwarded to `zerox`. None processes all pages; an int or
            a list of ints selects page numbers (1-indexed).
        output_dir: Forwarded to `zerox` as the directory for the consolidated
            markdown file. Defaults to None.

    Returns:
        The object returned by `zerox` (callers in this notebook read its `.pages`).

    Raises:
        KeyError: If `provider` is not a key of `models`.
    """
    if provider not in models:
        raise KeyError(f"provider not supported, choose from {list(models.keys())}")

    # Path.resolve() would turn "https://host/x.pdf" into a bogus local path
    # under the current directory, so only local paths are resolved.
    raw_path = str(filepath)
    if raw_path.startswith(("http://", "https://")):
        file_path = raw_path
    else:
        file_path = str(pathlib.Path(raw_path).resolve())

    # KWARGS docs: https://docs.litellm.ai/docs/completion/input
    # The module-level `kwargs` dict is forwarded as additional keyword arguments.
    return await zerox(
        file_path=file_path,
        model=models[provider],
        output_dir=output_dir,
        custom_system_prompt=custom_system_prompt,
        select_pages=select_pages,
        **kwargs
    )




* 'fields' has been removed


In [None]:
# Run the extraction for a single document (the 'article' PDF), using the
# default `provider` of `extract_markdown`.
document_path = document_paths['article']
result = await extract_markdown(filepath=document_path)

# Print the markdown content of the entry at index 1 of `result.pages`.
# NOTE(review): index 1 is the second entry -- presumably page 2; confirm
# whether index 0 (the first page) was intended.
print(result.pages[1].content)

# Extract all of them

## Anthropic haiku

In [58]:
import pymupdf  # import package PyMuPDF
import time

def get_elapsed_time(start, end):
    """Return the duration between two timestamps as ``(seconds, minutes)``.

    ``seconds`` is ``end - start``; ``minutes`` is that duration divided by 60
    and rounded to two decimal places.
    """
    elapsed = end - start
    return elapsed, round(elapsed / 60, 2)

provider='haiku'
time_log = {}
time_log['total'] = {
    'start' : time.time()
}
results = {}
for doc_kind, path in document_paths.items():
    start = time.time()
    print(f"{doc_kind} : {path}")
    
    doc_result = {}
    doc=pymupdf.open(path)
    print(f"- page count: {doc.page_count}")
    try:
        result = await extract_markdown(filepath=path, provider=provider)
        doc_result['result'] = result
    except Exception as e:
        print(f"- failed to extract")
        doc_result['error'] = e
    end = time.time()
    elapsed_s, elapsed_min = get_elapsed_time(start, end)
    time_log['doc_kind'] = {
        'start' : start,
        'end' : end,
        'elapsed' : elapsed_min,
    }
    doc_result['minutes'] = elapsed_min
    results[doc_kind] = doc_result
time_log['total']['end'] = time.time()
elapsed_s, elapsed_min = get_elapsed_time(start=time_log['total']['start'], end=time_log['total']['end'])
time_log['total']['elapsed'] = elapsed_min
display(time_log)


def pages_to_markdown(pages: list, output_file: str = "output.md") -> str:
    """Render extracted pages as one markdown document, write it to disk and return it.

    For every page the output contains a ``## Page <n>`` header (from
    ``page.page``), the page content with each newline turned into a markdown
    hard line break (two trailing spaces), and a horizontal-rule separator.

    Args:
        pages: Iterable of objects exposing ``.page`` and ``.content`` (a string).
        output_file: Path of the markdown file to (over)write.

    Returns:
        The complete markdown text that was written to ``output_file``.
    """
    separator = "\n\n---\n\n"
    chunks = []
    for page in pages:
        # Page number as header.
        chunks.append(f"## Page {page.page}\n\n")
        # Two trailing spaces force a line break in rendered markdown.
        chunks.append(page.content.replace("\n", "  \n"))
        chunks.append(separator)
    markdown_string = ''.join(chunks)

    # Explicit UTF-8: extracted text contains non-ASCII characters, which the
    # platform default encoding cannot always represent.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(markdown_string)
    return markdown_string

# Write one "<doc_kind>-<provider>.md" file per document that has a stored
# extraction result; documents without one are skipped.
for doc_kind, doc_result in results.items():
    r = doc_result.get('result')
    if r is not None:
        pages = r.pages
        markdown = pages_to_markdown(
            pages=pages,
            output_file=f"{doc_kind}-{provider}.md",
        )



journal : /Users/nicholasgrundl/projects/ragnostic/data/journal/JOURNAL_2017_AerationCostsInStirredTankAndBubbleColumnBioreactors.pdf
- page count: 6
SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'max_tokens': 10}
_is_function_call: False
RAW RESPONSE:
{"id":"msg_01XMjgY5vTtjahoE1sgZecyN","type":"message","role":"assistant","model":"claude-3-haiku-20240307","content":[{"type":"text","text":"I'm doing well, thanks for asking! As"}],"stop_reason":"max_tokens","stop_sequence":null,"usage":{"input_tokens":14,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":10}}






ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x131ae70b0>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x131ae7680>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x131ae6ae0>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x131ae7870>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): 

ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthropic.com/en/api/rate-limits; see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase."}}
    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthrop

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthropic.com/en/api/rate-limits; see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase."}}
    Please check the status of your model provider API status.
    


RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.





RAW RESPONSE:
{"id":"msg_01Mf4RVsmjzSZ4szvbhEqC7o","type":"message","role":"assistant","model":"claude-3-haiku-20240307","content":[{"type":"text","text":"# Aeration costs in stirred-tank and bubble column bioreactors\n\n## ABSTRACT\n\nTo overcome knowledge gaps in the economics of large-scale aeration for production of commodity products, Aspen Plus is used to stimulate steady-state oxygen delivery in both stirred-tank and bubble column bioreactors, using published engineering correlations for oxygen mass transfer as a function of power input, coupled with new equipment cost estimates developed in Aspen Capital Cost Estimator and validated against vendor quotations. These simulations describe the cost efficiency of oxygen delivery as a function of oxygen uptake rate and vessel size, and show that capital and operating costs for oxygen delivery drop considerably moving from small-scale (200 m³) to world-class size (500 m³) reactors. This analysis suggests bubble-column reactor systems 



RAW RESPONSE:
{"id":"msg_01PCCGNrQCSCDu2LwT6nhU4p","type":"message","role":"assistant","model":"claude-3-haiku-20240307","content":[{"type":"text","text":"# Biochemical Engineering Journal xxx (2017) xxx-xxx\n\nFig. 2. Capital costs (uninstalled, 2014$) of STRs in 316SS as a function of agitator power.\n\nCompression power is required for larger volumes of air. The power scales slightly more favorably with volume at higher O2R, with a 10% power reduction between 500 m^3 and 1000 m^3. In either case, for large reactors at a given O2R, the total bioreactor system power consumption scales approximately linearly with total liquid volume. This implies that any beneficial economies of scale realized at larger vessel sizes are more strongly a result of reduced capital costs for the vessels and agitators than reduced operating costs from differences in power demand or cost efficiency.\n\nTo investigate the economies of scale possible for aeration, capital and operating costs were incorporated 



RAW RESPONSE:
{"id":"msg_01Adk68zAEmPTtPa8gN1Pcbw","type":"message","role":"assistant","model":"claude-3-haiku-20240307","content":[{"type":"text","text":"```\nBiochemical Engineering Journal xxx (2017) xxx-xxx\n\n3\n\ndetermine the total system power demand for most of the users shown\nin Fig. 1, the compression, air cooler, chiller for FCR, circulation\npump, and chiller (scaled by cooling duty). The cooling tower was not\nincluded because its power consumption is insignificant compared to\nthe chiller.\nThe independent variable determining total system power was\ntaken to be the oxygen uptake rate (OUR), in an operating bioreactor,\nthe submersed culture provides some OUR, which, at steady-state, is\nequal to an oxygen transfer rate (OTR); the product of a mass transfer\ncoefficient, k\na (m h−1), times the volumetric available surface area, a, and\nan oxygen concentration driving force, (C∗ − C).\nOUR = OTR = k\na a (C∗ − C)\n\n(1)\n\nwhere k\na is usually lumped together and C∗ an



ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x131ae70b0>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x131ae7a60>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x131ae7870>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x131ae72a0>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): 

ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthropic.com/en/api/rate-limits; see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase."}}
    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthrop

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x132005590>


RAW RESPONSE:
Client error '429 Too Many Requests' for url '

ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthropic.com/en/api/rate-limits; see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase."}}
    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthrop

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x132004800>


RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.ge

ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthropic.com/en/api/rate-limits; see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase."}}
    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthrop

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If y

ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthropic.com/en/api/rate-limits; see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase."}}
    Please check the status of your model provider API status.
    


RAW RESPONSE:
{"id":"msg_01Ds5zuFubAfoXJZeESCYS2g","type":"message","role":"assistant","model":"claude-3-haiku-20240307","content":[{"type":"text","text":"# Figure 15.4: Process for defatting of whey protein concentrate (WPC)\n\n1. Pasteurizer\n2. Whey cream separator\n3. Holding tank\n4. First UF plant\n5. MF plant\n6. Second UF plant"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":1066,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":65}}


Async Wrapper: Completed Call, calling async_success_handler: <bound method Logging.async_success_handler of <litellm.litellm_core_utils.litellm_logging.Logging object at 0x131b40410>>
Logging Details LiteLLM-Async Success Call, cache_hit=None
RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litell



RAW RESPONSE:
{"id":"msg_01DamrQb7JjsKbvUr98ufYwR","type":"message","role":"assistant","model":"claude-3-haiku-20240307","content":[{"type":"text","text":"# Chapter 15\n\n## Whey processing\n\nWhey, the liquid residue of cheese and casein production, is one of the biggest reservoirs of food protein still remaining largely outside human consumption channels. World whey output, at approximately 120 million tonnes in 1980, contains some 6.7 million tonnes of relatively high-value protein, equal to the protein contents of almost 2 million tonnes of soya beans. Yet, despite the chronic protein shortage in large parts of the world, a very considerable proportion of the total whey output is still wasted - the proportion of wastage was roughly 50% in 1989-1990.\n\nWhey comprises 80-90% of the total volume of milk entering the process and contains about 50% of the nutrients in the original milk: soluble"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":1066,"cache_creatio



RAW RESPONSE:
{"id":"msg_01SJiJ7RYVXt6cjh7ngPg7AJ","type":"message","role":"assistant","model":"claude-3-haiku-20240307","content":[{"type":"text","text":"# % protein in dry matter according to the values in table 15.3:\n100 x 0.55 = 35\n\nIn concentration most of the true protein, typically > 99%, is retained together with almost 100% of the fat. The concentrations of lactose, NPN and ash are generally the same in the retentate serum and permeate as in the original whey, but a slight retention of these components is reported. The overall retention figures, however, depend very much on:\n* The type of membrane\n* The flux\n* The character of the feed (prediluted with water, pre-concentrated after demineralization, etc.)\n\n![Process for recovery of dried protein concentrate using UF.](image.png)\n\nTo obtain an 85% protein concentrate the liquid whey is first concentrated 20 – 30-fold by direct ultrafiltration to a solids content of approximately 25%; this is regarded as the maximum fo



ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x131ae7a60>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x131ae70b0>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x132004040>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x132005f40>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): 

ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthropic.com/en/api/rate-limits; see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase."}}
    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthrop

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If y

ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthropic.com/en/api/rate-limits; see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase."}}
    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthrop

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x131ae7490>


RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

RAW RESPONSE:
Client error '429 Too Many Requests' for url '

ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthropic.com/en/api/rate-limits; see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase."}}
    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthrop

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If y

ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthropic.com/en/api/rate-limits; see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase."}}
    Please check the status of your model provider API status.
    


RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.





RAW RESPONSE:
{"id":"msg_016Ecjf4PxkYoqWBYZVVp4NB","type":"message","role":"assistant","model":"claude-3-haiku-20240307","content":[{"type":"text","text":"# Contact Information\n\nGenomatica, Inc.\n4757 Nexus Center Drive\nSan Diego, CA 92121\n\nJason Crater (Primary Contact)\nManager, Scale-up & Technology Transfer\nEmail: jcrater@genomatica.com\nPhone: (858) 784-1922\n\nConnor Galleher\nBioprocess Development Engineer\nEmail: cgalleher@genomatica.com\nPhone: (858) 210-4413\n\nJeff Lievense, Ph.D.\nSenior Advisor to the CEO\nEmail: jlievense@genomatica.com\nPhone: (858) 210-4451"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":1180,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":149}}


Async Wrapper: Completed Call, calling async_success_handler: <bound method Logging.async_success_handler of <litellm.litellm_core_utils.litellm_logging.Logging object at 0x131b94530>>
Logging Details LiteLLM-Async Success Call, cache_hit=None




RAW RESPONSE:
{"id":"msg_01FZBFCSHJ6R9xWKNgNgPDqE","type":"message","role":"assistant","model":"claude-3-haiku-20240307","content":[{"type":"text","text":"Online source: http://www.fool.com/investing/general/2014/01/18/5-unbelievable-but-real-technologies-made-possible.aspx\n\nJ. J. Heijnen, \"Scale up/Scale down,\" presented at the Advanced Course Bioprocess Design, Delft, Netherlands, May 2014.\n\nP. M. Doran. Bioprocess Engineering Principles, 2nd ed.,United Kingdom: Elsevier, 2013, ch. 2, pp. 160-161."}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":1180,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":127}}


Async Wrapper: Completed Call, calling async_success_handler: <bound method Logging.async_success_handler of <litellm.litellm_core_utils.litellm_logging.Logging object at 0x131b6a810>>
Logging Details LiteLLM-Async Success Call, cache_hit=None




RAW RESPONSE:
{"id":"msg_011tDGMPXwW13pf995k8rJZQ","type":"message","role":"assistant","model":"claude-3-haiku-20240307","content":[{"type":"text","text":"# Executive Summary\n\nNREL is developing an advanced aerobic bubble column model using Aspen Custom Modeler (ACM). The objective of this work is to integrate the bench-top fermentor model with existing techno-economic models in Aspen Plus and Excel to establish a new methodology for guiding process design. To assist this effort, NREL has contracted Genomatica to critique and make recommendations for improving NREL's bioreactor model and large scale aerobic bioreactor design for biologically producing lipids at commercial scale.\n\nWhile acknowledging the great work NREL has done to this point in developing a bioreactor model, Genomatica has highlighted a few areas for improving the functionality and effectiveness of the model. Genomatica recommends using a compartment model approach with an integrated black-box kinetic model of the 



ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x131ae70b0>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x132005f40>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x132004420>


ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {}
_is_function_call: False
RAW RESPONSE:
<coroutine object AnthropicChatCompletion.acompletion_function at 0x132006320>




ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: AnthropicException - {"type":"error","error":{"type":"rate_limit_error","message":"This request would exceed your organization’s rate limit of 10,000 output tokens per minute. For details, refer to: https://docs.anthropic.com/en/api/rate-limits; see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase."}}
    Please check the status of your model provider API status.
    


RAW RESPONSE:
Client error '429 Too Many Requests' for url 'https://api.anthropic.com/v1/messages'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.





RAW RESPONSE:
{"id":"msg_01Ma4NpQTZAqPv4EToP4X7M2","type":"message","role":"assistant","model":"claude-3-haiku-20240307","content":[{"type":"text","text":"# Optimize Power Consumption in Aerobic Fermenters\n\nBy performing the necessary pilot work, and rigorously calculating the full-scale performance instead of using simple rules-of-thumb for scale-up, significant energy savings can be achieved in fermenters.\n\nAerobic fermentation was put to commercial use in the 1940s to make penicillin. Many other antibiotics were made via this method. Over the years, it has become a route to economically produce a variety of compounds, including enzymes, amino acids, vitamins, flavors, biofuels, think kening agents, and cleaning compounds. The list keeps growing.\n\nOne reason for its widespread use is its specificity for producing compounds—that is, some species are difficult to make by other means, and others may produce high concentrations of byproducts. Another reason is that fermentation req



RAW RESPONSE:
{"id":"msg_01LiHNaxmGCHhpa4JSE9UJGf","type":"message","role":"assistant","model":"claude-3-haiku-20240307","content":[{"type":"text","text":"# Reactions and Separations\n\n## Step 5. Calculate the required k₂a.\n## Step 6. Calculate the actual volumetric air flow at the agitator impeller tip. Use the volumetric flow rate to account for temperature, composition, backpressure and liquid head. Use the average flowrates for the inlet and outlet. \n## Step 7. Calculate the superficial gas velocity at the impeller using the above as volumetric.\n## Step 8. Using the mass-transfer relationship developed from pilot scale data, solve for the agitator power. Add about 5% to account for the gear drive and seal losses to get the required motor power draw.\n## Step 9. Calculate compressor power. Include the backstage, liquid head, pressure, and losses from the sparging, piping and filtration systems. Also include the manufacturer's compression efficiency to calculate motor power draw 



RAW RESPONSE:
{"id":"msg_01X9f3tmUUzK6qkoMGr8Stw1","type":"message","role":"assistant","model":"claude-3-haiku-20240307","content":[{"type":"text","text":"# Step 2. Normally, the iteration process would start at a value 20% higher than the literature value, but for this case, it was started at 0.02, 0.06, and 0.06, respectively, based on units of g, kW/m^3, and kw/m^2. This strategy was used to converge faster. The remaining steps in this process should be derived from values stated in the actual broth and impeller system data that will be used. Based on a required k_a of 0.0649/s and a superficial gas velocity of 0.0204 m/s, the required agitator power/V = 349 W/m^3. For a batch size of 114 M^3, this gives a total installed power of 39.8 kW. Allowing for 70% power transmission efficiency through the gear drive and seal, the motor power draw required is about 41.9 kW. This figure seems low, it is because the required OTR for this problem is low, so the \"easy\" fermentation (step 9) to

{'total': {'start': 1737687084.4622562,
  'end': 1737687118.2099662,
  'elapsed': 0.56},
 'doc_kind': {'start': 1737687107.529393,
  'end': 1737687118.209742,
  'elapsed': 0.18}}

## OpenAI gpt-4o-mini

In [59]:
import pymupdf  # import package PyMuPDF
import time

def get_elapsed_time(start, end):
    """Return the span between two timestamps as ``(seconds, minutes)``.

    Args:
        start: Timestamp taken at the beginning of the measured interval.
        end: Timestamp taken at the end of the measured interval.

    Returns:
        A 2-tuple: the raw difference ``end - start`` and that same
        difference divided by 60 and rounded to two decimal places.
    """
    duration_s = end - start
    return duration_s, round(duration_s / 60, 2)

provider='openai'
time_log = {}
time_log['total'] = {
    'start' : time.time()
}
results = {}
for doc_kind, path in document_paths.items():
    start = time.time()
    print(f"{doc_kind} : {path}")
    
    doc_result = {}
    doc=pymupdf.open(path)
    print(f"- page count: {doc.page_count}")
    try:
        result = await extract_markdown(filepath=path, provider=provider)
        doc_result['result'] = result
    except Exception as e:
        print(f"- failed to extract")
        doc_result['error'] = e
    end = time.time()
    elapsed_s, elapsed_min = get_elapsed_time(start, end)
    time_log['doc_kind'] = {
        'start' : start,
        'end' : end,
        'elapsed' : elapsed_min,
    }
    doc_result['minutes'] = elapsed_min
    results[doc_kind] = doc_result
time_log['total']['end'] = time.time()
elapsed_s, elapsed_min = get_elapsed_time(start=time_log['total']['start'], end=time_log['total']['end'])
time_log['total']['elapsed'] = elapsed_min
display(time_log)


def pages_to_markdown(pages: list, output_file: str = "output.md"):
    """Combine extracted pages into one markdown document and save it.

    Every page contributes, in order: a ``## Page <n>`` header, its content
    with each newline turned into a markdown hard line break (two trailing
    spaces before the newline), and a horizontal-rule separator.

    Args:
        pages: Objects exposing a ``page`` attribute (used in the header) and
            a string ``content`` attribute.
        output_file: Path of the markdown file to create or overwrite.

    Returns:
        The markdown text that was written to ``output_file``.
    """
    parts = []
    for page in pages:
        # Page number as a header.
        parts.append(f"## Page {page.page}\n\n")
        # Two trailing spaces force a line break in rendered markdown.
        parts.append(page.content.replace("\n", "  \n"))
        # Page separator.
        parts.append("\n\n---\n\n")
    markdown_string = ''.join(parts)

    # Explicit UTF-8: page content routinely contains non-ASCII characters
    # (dashes, arrows, superscripts), which the platform default encoding
    # may be unable to represent.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(markdown_string)
    return markdown_string

# Write one markdown file per successfully extracted document, named
# "<doc_kind>-<provider>.md"; documents without a 'result' entry are skipped.
for doc_kind, doc_result in results.items():
    r = doc_result.get('result')
    if r is not None:
        pages = r.pages
        markdown = pages_to_markdown(
            pages=pages,
            output_file=f"{doc_kind}-{provider}.md",
        )



journal : /Users/nicholasgrundl/projects/ragnostic/data/journal/JOURNAL_2017_AerationCostsInStirredTankAndBubbleColumnBioreactors.pdf
- page count: 6
SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'max_tokens': 10, 'extra_body': {}}
RAW RESPONSE:
{"id": "chatcmpl-At45ip7p0cDcorlMo9wSqNT3UJhOe", "choices": [{"finish_reason": "length", "index": 0, "logprobs": null, "message": {"content": "I'm just a program, but I'm here and ready", "refusal": null, "role": "assistant", "audio": null, "function_call": null, "tool_calls": null}}], "created": 1737687130, "model": "gpt-4o-mini-2024-07-18", "object": "chat.completion", "service_tier": "default", "system_fingerprint": "fp_72ed7ab54c", "usage": {"completion_tokens": 10, "prompt_tokens": 14, "total_tokens": 24, "completion_tokens_details": {"accepted_prediction_tokens": 0, "audio_tokens": 0, "reasoning_tokens": 0, "rejected_prediction_tokens": 0}, "prompt_tokens_details"



ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
RAW RESPONSE:
{"id": "chatcmpl-At45lRzPfoJV7oVHpQExmPkEyKPKi", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n3. Results and discussion\n\n3.1. Bioreactor capital costs\n\nTa



RAW RESPONSE:
{"id": "chatcmpl-At45lXgauNRVBC5OYY2uMt6Ge8Jfv", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n\\documentclass{article}\n\\begin{document}\n\n\\section*{Biochemical Engineering Journal xxx (2017) xxx-xxx}\n\nproduct. For chemical or fuel precursors produced aerobically at high productivity, the OTR requirements can be significant. The techno-economic trends show here are encouraging, as they indicate that high-OTR processes are more cost-efficient. Nonetheless, substantial challenges remain to making biofuels cost effectively via aerobic routes [21].\n\nThe trends established from the analysis and methodology performed here will help to define optimal operating conditions for future detailed models of aerobic processes, e.g., favoring BCRs over STRs where possible, and assessing bioreactor sizes of 500-1000 m\u00b3. For low-margin, commodity fuels and chemicals processes, BCRs are likely more cost-effective than S



ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional par



RAW RESPONSE:
{"id": "chatcmpl-At46Nq2o02XA4gFKurpOcjSIkeP7v", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Chapter 15\n\n## Whey processing\n\nWhey, the liquid residue of cheese and casein production, is one of the biggest reservoirs of food protein still remaining largely outside human consumption channels. World whey output, at approximately 120 million tonnes in 1990, contains some 0.7 million tonnes of relatively high-value protein, equal to the protein contents of almost 2 million tonnes of soya beans. Yet, despite the chronic protein shortage in large parts of the world, a very considerable proportion of the total whey output is still wasted - the proportion of wastage was roughly 50% in 1989-1990.\n\nWhey comprises 80\u201390% of the total volume of milk entering the process and contains about 50% of the nutrients in the original milk: soluble\n```\n", "refusal": null, "role": "assistant", "audio": null, "function_ca



RAW RESPONSE:
{"id": "chatcmpl-At46NH8c1BCXM1Sh2XG1wKxkaPAkn", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Whey Processing Alternatives\n\nWhey is very often diluted with water. The figures above relate to undiluted whey. As to the composition of the NPN fraction, about 30% consists of urea. The rest is amino acids and peptides (gluco macro peptide from renneting action on casein). Table 15.2 lists some fields of application for whey and whey products.\n\n## Whey\n\n- Separation\n  - Concentration of total solids\n    - Reverse osmosis\n    - Evaporation\n    - Drying\n      - Whey cream\n      - Dried whey powder\n      - Whey protein concentrates (MPC)\n      - Sweetened condensed whey\n  - Fractionation of total solids\n    - Protein recovery\n    - Lactose recovery\n    - Desalination\n    - Fermentation\n    - Lactose conversion\n      - Lactose\n      - Lactose hydrolysis\n  - Chemical reaction\n    - Lactose/galactos



RAW RESPONSE:
{"id": "chatcmpl-At46NahwhfKslcKQlofIOMaNHTSRF", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Fig. 15.7 Process line for lactose manufacture.\n1. Evaporator  \n2. Crystallisation tanks  \n3. Decanter centrifuges  \n4. Fluidised-bed dryer  \n5. Packing  \n\n## Crystallisation\nThe crystallisation cycle is determined by the following factors:\n- Crystal surface available for growth\n- Purity of the solution\n- Degree of saturation\n- Temperature\n- Viscosity\n- Agitation of the crystals in the solution\n\nSeveral of these factors are mutually related to each other, for example degree of saturation and viscosity.\n\nFigure 15.7 shows a production line for manufacture of lactose. The whey is first concentrated by evaporation to 60 \u2013 62% DM and then transferred to crystallisation tanks (2) where seed crystals are added. Crystallisation takes place slowly according to a predetermined time/temperature programme. 



RAW RESPONSE:
{"id": "chatcmpl-At46NEtwJ1Qb0DWeqtxdvx0rLgCx4", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Recovery of denatured whey protein\n\nIn general, serum protein or whey proteins cannot be precipitated by rennet or acid. It is however possible to precipitate whey proteins with acid if they are first denatured by heat. The process is divided into two stages:\n- Precipitation (denaturing) of the protein by a combination of heat treatment and pH adjustment,\n- Concentration of proteins by centrifugal separation.\n\nDenatured whey proteins can be mixed with cheese milk prior to renneting; they are then retained in the lattice structure formed by the casein molecules during coagulation. This discovery led to intensive efforts to find a method of precipitating and separating whey proteins as well as a technique for optimising the yield while retaining the characteristic aroma and texture of the cheese in question.\n\n![F



RAW RESPONSE:
{"id": "chatcmpl-At46NrxVP3qbR0mW24zyoYlcX06km", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\nash and raise the concentration of protein relative to the total dry matter. Dialfiltration is a procedure in which water is added to the feed as filtration proceeds in order to wash out low molecular components which will pass through the membranes, basically lactose and minerals.\n\nTable 15.4 shows the compositions of some typical whey protein concentrate (WPC) powders. A process line for production of dried protein using UF is shown in figure 15.3. About 95% of the whey is collected as permeate, and protein concentrations as high as 80 \u2013 85% (calculated on the DM content) can be obtained in the dried product. For further details about UF see chapter 6.4, membrane filters.\n\nDefatting of whey protein concentrate (WPC)\n\nDefatted WPC powder containing 80 \u2013 85% protein dry matter is a very interesting option



RAW RESPONSE:
{"id": "chatcmpl-At46N3HNb4diOraHKU0yKrIDed4XT", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Fig. 15.2 Fines and fat separation from whey.\n\nFat is recovered in centrifugal separators. The fines are often pressed in the same way as cheese, after which they can be used in processed cheese and, after a period of ripening, also in cooking. The whey cream, often with a fat content of 25 \u2013 30%, can be re-used in cheesemaking to standardise the cheese milk; this enables a corresponding quantity of fresh cream to be utilised for special cream products.\n\n## Cooling and pasteurisation\nWhey which is to be stored before processing must either be chilled or pasteurised as soon as the fat has been removed. For short-time storage, 10 \u2013 15 hours, cooling is usually sufficient to reduce bacterial activity. Longer periods of storage require pasteurisation of the whey.\n\n## Concentration of total solids\n\n### Co



RAW RESPONSE:
{"id": "chatcmpl-At46Nk1GL2YCYM5OnY65QyOGebuCg", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n% protein in dry matter according to the values in table 15.3:\n\n\\[\n\\frac{100 \\times 0.55}{1.57} = 35\n\\]\n\nIn concentration most of the true protein, typically > 99%, is retained together with almost 100% of the fat. The concentrations of lactose, NPN and ash are generally the same in the retentate serum and permeate as in the original whey, but a slight retention of these components is reported. The overall retention figures, however, depend very much on:\n- The type of membrane\n- The flux\n- The character of the feed (prediluted with water, pre-concentrated after demineralisation, etc.)\n\n```\n**Fig. 15.3** Process for recovery of a dried protein concentrate using UF.\n\nTo obtain an 85% protein concentrate the liquid whey is first concentrated 20 \u2013 30-fold by direct ultrafiltration to a solids content o



RAW RESPONSE:
{"id": "chatcmpl-At46NuG3HpDK0PuK4vgOuaCv7wMVo", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\nFractionation of total solids\n\nProtein recovery  \nWhey proteins were originally isolated through the use of various precipitation techniques, but nowadays membrane separation (fractionation) and chromatographic processes are used in addition to both precipitation and complexing techniques. The process that has been most extensively used for separation of whey proteins from whey serum is heat denaturation. The precipitated protein formed by this process is either insoluble or sparingly soluble depending on the conditions prevailing at denaturation; it is called heat-precipitated whey protein (HPWP).  \n\nFink and Kessler (1988) state that a maximum whey protein denaturation rate of 90% is possible for all denaturable fractions. Proteose peptone, comprising some 10% of the fraction, is considered undenaturable.  \n\nNat



RAW RESPONSE:
{"id": "chatcmpl-At46Ny2cwRFibsbOVL4UqfYQWTcL6", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Chromatographic isolation of lactoperoxidase and lactoferrin\n\nGenerally speaking, use of natural bioactive agents is of very great interest in products like infant formulas, health foods, skin creams and toothpaste. Examples of such components are the bioactive proteins lactoperoxidase (LP) and lactoferrin (LF) existing at low contents in whey, typically 20 mg/l of LP and 35 mg/l of LF. The Swedish Dairies Association (SMR) has developed a patented process based on chromatography for isolation of these proteins from cheese whey on an industrial scale.\n\nThe basic principle underlying the process is the fact that both LP and LF have isoelectric points in the alkaline pH area, 9.0 \u2013 9.5, which means that these proteins are positively charged at the normal pH of sweet whey, 6.2 \u2013 6.6, while the rest of the wh



RAW RESPONSE:
{"id": "chatcmpl-At46RZaipB3ZT4orRKsbSNaTZKp83", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n![Fig. 15.8 Decanter centrifuge.](attachment://fig_15_8_decanter_centrifuge.png)\n\n1. Feed  \n2. Outlet for liquid phase  \n3. Outlet for solids phase  \n\nDrying a thin layer of amorphous (shapeless, non-crystalline) lactose tends to form on the \u03b1-hydrate crystal, and this may later result in formation of lumps. Drying usually takes place in a fluidised bed drier. The temperature is maintained at 92\u00b0C and the drying time is 15 \u2013 20 minutes. The dried sugar is transported by air at a temperature of 30\u00b0C, which also cools the sugar. The crystals are normally ground to a powder immediately after drying and are then packed.\n\n### Refining of lactose\nA higher degree of purity is required for some applications, e.g., pharmaceutical manufacturing processes. Lactose for such use must therefore be further 



RAW RESPONSE:
{"id": "chatcmpl-At46ZMUjUAboymKueQtbBlQXwtJpt", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\nsimple example of this reaction is shown for sodium chloride removal, where R is the exchange group bound to the insoluble resin.\n\nCation exchange  \nR \u2013 H + Na+  \u21cc R \u2013 Na + H+   resin in H+ form  \n\nAnion exchange  \nR \u2013 OH + Cl\u2013 \u21cc R \u2013 Cl + OH\u2013   resin in OH\u2013 form  \n\nThe reaction above is deliberately written as an equilibrium, because the direction in which the reaction goes depends on the ion concentration in the liquid and in the solids phase of the resin. The equilibrium is characterised by a constant. On regeneration the reaction is reversed when the sodiumladen ion exchange resin is treated with, say, a 4% hydrochloric acid solution. The high concentration of hydrogen ions in the acid drives the equilibrium to the left.\n\nThe equilibrium constant varies depending 



RAW RESPONSE:
{"id": "chatcmpl-At46YzabqZRYHXqEoWicJmOZ43H3I", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Power supply and automation\nDirect current is used in the electrodialysis plant, which should have facilities for regulating current in the range of 0 \u2013 185 A and voltage in the range of 0 \u2013 400 V. Flow rates, temperatures, conductivity, pH of process water and product, product inlet pressure, pressure difference between the stacks and current, as well as voltage over each membrane stack, are monitored and controlled during production.\n\n## Limiting factors in electrodialysis\nA major limiting factor for using electrodialysis in dairy processing is the cost of replacing membranes, spacers and electrodes, which constitute 35 \u2013 40% of the total running costs in the plant. Replacement is necessary due to fouling of the membranes, which in turn is caused by:\n- Precipitation of calcium phosphate on the cat



ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional par



RAW RESPONSE:
{"id": "chatcmpl-At4743inBBufgTlfDAKtz65P4nX1s", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Contents\n\n- Executive Summary ........................................................................................................................................ 4\n- Contact Information ..................................................................................................................................... 5\n- Introduction .................................................................................................................................................. 7\n- Feedback ......................................................................................................................................................... 8\n  - Modeling Methodology ............................................................................................................................ 8\n  - Model Ass



RAW RESPONSE:
{"id": "chatcmpl-At474yh60y1h88xv4U08xvzS4Wa8D", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Contact Information\n\nGenomatica, Inc.  \n4757 Nexus Center Drive  \nSan Diego, CA 92121  \n\n**Jason Crater (Primary Contact)**  \nManager, Scale-up & Technology Transfer  \nEmail: jcrater@genomatica.com  \nPhone: (858) 784-1922  \n\n**Connor Galleher**  \nBioprocess Development Engineer  \nEmail: cgalleher@genomatica.com  \nPhone: (858) 210-4413  \n\n**Jeff Lievense, Ph.D.**  \nSenior Advisor to the CEO  \nEmail: jlievense@genomatica.com  \nPhone: (858) 210-4451  \n\n---\n\nThis report is available at no cost from the National Renewable Energy Laboratory (NREL) at www.nrel.gov/publications.\n```", "refusal": null, "role": "assistant", "audio": null, "function_call": null, "tool_calls": null}}], "created": 1737687214, "model": "gpt-4o-mini-2024-07-18", "object": "chat.completion", "service_tier": "default", "system_f



RAW RESPONSE:
{"id": "chatcmpl-At474gLgdCe6XCqlgAyyENQJTHIIH", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Consultancy on Large-Scale Submerged Aerobic Cultivation Process Design \u2013 Final Technical Report\n\n**February 1, 2016 \u2014 June 30, 2016**\n\nJason Crater, Connor Galleher,  \nand Jeff Lievense  \nGenomatica, Inc.  \nSan Diego, California  \n\nNREL Technical Monitor: James McMillan  \nPrepared under Subcontract No. AFC-6-62032-01  \n\n---\n\nNREL is a national laboratory of the U.S. Department of Energy  \nOffice of Energy Efficiency & Renewable Energy  \nOperated by the Alliance for Sustainable Energy, LLC  \n\nThis report is available at no cost from the National Renewable Energy Laboratory (NREL) at www.nrel.gov/publications.  \n\n**National Renewable Energy Laboratory**  \n15013 Denver West Parkway  \nGolden, CO 80401  \n303-275-3000 \u2022 www.nrel.gov  \n\nSubcontract Report  \nNREL/SR-5100-67963  \nMay 2



RAW RESPONSE:
{"id": "chatcmpl-At474h99Wdbleh5IxnK2oXTXrZhiL", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Consultancy on Large-Scale Submerged Aerobic Cultivation Process Design \u2013 Final Technical Report\n\n**February 1, 2016 \u2014 June 30, 2016**\n\n**Jason Crater, Connor Galleher, and Jeff Lievense**  \nGenomatica, Inc.  \nSan Diego, California  \n\nNREL Technical Monitor: James McMillan\n\nNREL is a national laboratory of the U.S. Department of Energy  \nOffice of Energy Efficiency & Renewable Energy  \nOperated by the Alliance for Sustainable Energy, LLC  \n\nThis report is available at no cost from the National Renewable Energy Laboratory (NREL) at www.nrel.gov/publications.\n\n**Subcontract Report**  \nNREL/SR-5100-67963  \nMay 2017  \n\nContract No. DE-AC36-08GO28308\n```", "refusal": null, "role": "assistant", "audio": null, "function_call": null, "tool_calls": null}}], "created": 1737687214, "model": "gpt-4o-



RAW RESPONSE:
{"id": "chatcmpl-At474XMWtmwGLJF5QB6iCILS1tsbh", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Introduction\n\nThe National Renewable Energy Laboratory\u2019s (NREL\u2019s) Biochemical Platform is developing processing strategies for producing biofuels and bio-based products from lignocellulosic feedstocks. One approach is based on using pretreatment followed by enzymatic hydrolysis to deconstruct the major plant carbohydrates, cellulose and hemicellulose, into monomeric sugars. These biomass-derived sugars are then clarified using solid-liquid separation processes prior to being concentrated and converted to products. Submerged aerobic fermentation production of intracellular lipids from biomass-derived sugars using oleaginous yeast is one of several sugar upgrading conversion routes being considered. Once recovered, the lipids can then be hydro-treated and isomerized to produce a hydrocarbon biofuel (1).\n\nNR



RAW RESPONSE:
{"id": "chatcmpl-At474U8VqXOmxpAHja6ZIbs74AcZ8", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Executive Summary\n\nNREL is developing an advanced aerobic bubble column model using Aspen Custom Modeler (ACM). The objective of this work is to integrate the new fermentor model with existing techno-economic models in Aspen Plus and Excel to establish a new methodology for guiding process design. To assist this effort, NREL has contracted Genomatica to critique and make recommendations for improving NREL\u2019s bioreactor model and large scale aerobic bioreactor design for biologically producing lipids at commercial scale.\n\nWhile acknowledging the great work NREL has done to this point in developing a bioreactor model, Genomatica has highlighted a few areas for improving the functionality and effectiveness of the model. Genomatica recommends using a compartment model approach with an integrated black-box kinetic m



RAW RESPONSE:
{"id": "chatcmpl-At474Pt6PDiGJa9WkItIQ5gPDDs2I", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n| Parameter               | Decreases Cost               | Increases Cost                        | Considerations                                                                                           |\n|------------------------|------------------------------|---------------------------------------|----------------------------------------------------------------------------------------------------------|\n| Oxygen                 | Anaerobic                    | Aerobic                               | Anaerobic fermentation eliminates oxygen transfer costs, and may require stirred tank reactors to mix.   |\n| Fermentor Volume       | Larger, fewer                | Smaller, more                        | Fabrication costs, operating mode, gradients, mixing time, process/facility complexity are all impacted. |\n| Fermen



RAW RESPONSE:
{"id": "chatcmpl-At4747BPs6knx4nFaVlkaG7ve5Xd8", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```\nThis publication was reproduced from the best available copy\nsubmitted by the subcontractor and received no editorial review at NREL.\n\nNOTICE\n\nThis report was prepared as an account of work sponsored by an agency of the United States government. Neither the United States government nor any agency thereof, nor any of their employees, makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, completeness, or usefulness of any information, apparatus, product, or process disclosed, or represents that its use would not infringe privately owned rights. Reference herein to any specific commercial product, process, or service by trade name, trademark, manufacturer, or otherwise does not necessarily constitute or imply its endorsement, recommendation, or favoring by the United States 



RAW RESPONSE:
{"id": "chatcmpl-At474LvBGsrGjRPdPOj0n4GVnfB5P", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\naid in the initial bioreactor design, but it also facilitates the design of process scale-down studies to mimic the anticipated large-scale conditions in the laboratory. See the Bioreactor Scale section (page 15) for more details. Klaas van't Riet's and Johannes Tramper's Basic Bioreactor Design provides a detailed example (chapter 18, example 18.1) on how to set up a compartment model for bubble column reactors (3). Klaas van\u2019t Riet\u2019s and Rob van der Laans\u2019 \u201cMixing in bioreactor vessels\u201d provides useful information on broth mixing and compartmentalization in bubble column and stirred tank reactors (4).\n\nIn order to maximize the effectiveness of the compartment model, a black-box kinetic model of the production microbe should also be incorporated. Linking the kinetics of the host microbe\u2019s



RAW RESPONSE:
{"id": "chatcmpl-At478oalZkTDntfMdocaKYgbyVnJe", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\nrelation. Typically, the qp(\u03bc) function takes the form of a complex, non-linear relation. Example qp(\u03bc) functions include (6):\n\nEquation 2: qp,1 = qp,max * \u03bc\n                 \u03b1 + \u03bc\n\nEquation 3: qp,2 = qp,max\n                 1 + \u03be\n                 \u03b1\n\nEquation 4: qp,3 = qp,max * \u03bc\n                 \u03b1 + \u03bc * \u03be\n                 \u03b2\n\nWhere qp,max is the maximum specific product formation rate (mmol product/g dcw/hr), and \u03b1/\u03b2 are constants used to fit actual fermentation data. See Figure 1 below for example plots of the qp(\u03bc) functions outlined above in Equations 2-4. Because it is assumed that ammonium limitation is being used to downregulate growth and upregulate TAG production, Equations 3 and 4 provide more realistic functions for NREL\u20



RAW RESPONSE:
{"id": "chatcmpl-At4741u1ygzBIzRB2ZRQpJbwhsHlD", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Feedback\n\n## Modeling Methodology\n\nNREL is developing an advanced aerobic bubble column model using ACM that will be integrated with existing techno-economic models in Aspen Plus and Excel. The fermentation model in ACM is used to dynamically simulate a single batch from inoculation to harvest. The time-dependent results from the fermentation simulation are subsequently exported to Excel for integration and calculation of steady-state rates, which are then imported into techno-economic models in Aspen Plus (2). Genomatica has employed a similar methodology in which multiple software platforms (e.g. Mathematica, Excel, Aspen) are used to assess both steady state and dynamic processes. When modeling complete processes using multiple software platforms, Genomatica prefers to use an Excel interface with all other progr



RAW RESPONSE:
{"id": "chatcmpl-At47AZCk6hI3TJ3TkHhL2jprpyNmW", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```\nThe coupling of aeration and mixing ultimately limits the practical application of bubble columns to aerobic fermentation processes with minimum levels of aerobicity around 15 mol/m\u00b3/hr. Processes requiring anaerobicity or low levels of oxygenation (<15 mol/m\u00b3/hr) would result in inadequate mixing and significant broth heterogeneity, unless an external broth recycle loop was added. There are also upper limits to the levels of oxygenation that can be achieved in bubble column reactors. The maximum achievable oxygen transfer rate is limited by the maximum operating superficial gas velocity, as liquid entrainment in the gas phase occurs with gas velocities above 0.3 m/s (17). Oxygen enrichment is another strategy that may be employed to increase oxygen transfer rates; however, the cost benefits should be carefully evaluat



RAW RESPONSE:
{"id": "chatcmpl-At47CwN3WusJ6WRf9ENjVhRoHg2Dy", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# OTR Capacity vs. Viscosity\u00b9  H\u2082O Viscosity vs. Temperature\n\n![Figure 4: Effect of broth viscosity on oxygen transfer rate capacity for bubble columns and impact of temperature on viscosity (21)](https://www.nrel.gov/publications)\n\n\u00b9 Maximum achievable OTR for 1,000 m\u00b3 bubble column gassed with air, L/D = 4, T = 35\u00b0C, P\u2091\u209c = 0.34 atm, U\u2092 = 0.3 m/s, O\u2082 depletion = 0.55% m, U* used for kLa calculation, log mean concentration driving force used for OTR calculation.\n\nAnother challenge associated with bubble column reactors is the difficulty in simulating the industrial process at lab scale (process scale-down), as the hydrodynamics of bubble columns are largely scale-dependent (17). Column geometry and hydrostatic pressure have a significant impact on mass transfer and broth



RAW RESPONSE:
{"id": "chatcmpl-At479IxbLMo7CMP4aDG4spJjo6siQ", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n# Strain Selection\n\nStrain selection will greatly influence bioreactor design, so bioreactor design/costs should guide strain selection rather than the other way around. For example, a thermotolerant yeast can dramatically increase the efficiency of bioreactor cooling, reducing associated costs (see also Bioreactor Cooling, pp16-18). On this specific point, a modification that increases yeast thermotolerance has been discovered recently; see Caspeta et al. (14). Sensitivities to carbon dioxide levels and oxygen gradients can influence the choice of fermentor aspect ratios and the practical limitations of the fermentor volume. Selecting a strain that has demonstrated robust performance under these conditions will allow for larger fermentors and realization of the associated cost advantages. Genetic stability is another 



RAW RESPONSE:
{"id": "chatcmpl-At47NHtWA0doDSol2jqjktbICYej2", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\n## References\n\n1. National Renewable Energy Laboratory, Statement of Work, \u201cConsultancy on Large-Scale Submerged Aerobic Cultivation Process Design,\u201d Dec 21, 2015.\n2. D. Humbird, R. Davis, J.D. McMillan, Aeration Costs in Stirred-Tank and Bubble Column Bioreactors, Biochemical Engineering Journal, submitted.\n3. K. van 't Riet, J. Tramper, in Basic Bioreactor Design, New York: Marcel Dekker, Inc., 1991, ch. 2.3, pp. 245-250, 294.\n4. K. van 't Riet, R. G. J. M. van der Laans, \u201cMixing in bioreactor vessels,\u201d in Comprehensive Biotechnology, 2nd ed., Amsterdam, Netherlands: Elsevier, 2011, ch. 2.07, pp. 63-80.\n5. Online source: https://en.wikipedia.org/wiki/Monod_equation\n6. J. J. Heijnen, \u201cThe process reaction for bioprocess design: a thermodynamic approach,\u201d presented at the Advanced Cou



ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Final returned optional params: {'extra_body': {}}
RAW RESPONSE:
{"id": "chatcmpl-At47hDIOMvyGHFyBpE451zrRuY7LG", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "```markdown\nReactions and Separations\n\n# Optimize Power Consumption in Aerobic Fermenters\n\n**GREGORY T. BENZ**  \nBenz Technology International, Inc.\n\nBy performing the necessary pilot work, and rigorously calculating the full-scale performance instead of using simple rules-of-thumb for scale-up, significant energy savings c

{'total': {'start': 1737687130.017079,
  'end': 1737687287.782578,
  'elapsed': 2.63},
 'doc_kind': {'start': 1737687251.0485508,
  'end': 1737687287.782393,
  'elapsed': 0.61}}

## Gemini billed

In [4]:
import pymupdf  # import package PyMuPDF
import time

def get_elapsed_time(start, end):
    """Return the span between two timestamps as ``(seconds, minutes)``.

    Args:
        start: Timestamp at the beginning of the interval, in seconds
            (the notebook passes ``time.time()`` values).
        end: Timestamp at the end of the interval, in the same unit.

    Returns:
        A 2-tuple: the raw difference ``end - start`` in seconds, and the
        same difference converted to minutes and rounded to two decimals.
    """
    elapsed_seconds = end - start
    return elapsed_seconds, round(elapsed_seconds / 60, 2)

provider='gemini'
time_log = {}
time_log['total'] = {
    'start' : time.time()
}
results = {}
for doc_kind, path in document_paths.items():
    start = time.time()
    print(f"{doc_kind} : {path}")
    
    doc_result = {}
    doc=pymupdf.open(path)
    print(f"- page count: {doc.page_count}")
    try:
        result = await extract_markdown(filepath=path, provider=provider)
        doc_result['result'] = result
    except Exception as e:
        print(f"- failed to extract")
        doc_result['error'] = e
    end = time.time()
    elapsed_s, elapsed_min = get_elapsed_time(start, end)
    time_log['doc_kind'] = {
        'start' : start,
        'end' : end,
        'elapsed' : elapsed_min,
    }
    doc_result['minutes'] = elapsed_min
    results[doc_kind] = doc_result
time_log['total']['end'] = time.time()
elapsed_s, elapsed_min = get_elapsed_time(start=time_log['total']['start'], end=time_log['total']['end'])
time_log['total']['elapsed'] = elapsed_min
display(time_log)


def pages_to_markdown(pages: list, output_file: str = "output.md"):
    """Render extracted pages as one markdown document and write it to disk.

    Each page contributes a ``## Page <n>`` header, its content with every
    newline turned into a markdown hard line break (two trailing spaces), and
    a horizontal-rule separator.

    Args:
        pages: Iterable of page objects exposing ``.page`` (used in the
            header) and ``.content`` (the page text). A ``content`` of
            ``None`` is treated as an empty page instead of raising.
        output_file: Path of the markdown file to create or overwrite.

    Returns:
        The full markdown string that was written to ``output_file``.
    """
    separator = "\n\n---\n\n"
    chunks = []
    for page in pages:
        # Page number as a section header.
        chunks.append(f"## Page {page.page}\n\n")

        # A page whose extraction produced no text has content None; fall back
        # to an empty body so one bad page does not abort the whole export.
        body = page.content or ""
        # Two trailing spaces force a markdown line break on every newline.
        chunks.append(body.replace("\n", "  \n"))

        chunks.append(separator)

    markdown_string = "".join(chunks)
    # The extracted text contains non-ASCII characters (typographic quotes,
    # Greek letters, superscripts); pin UTF-8 so the write does not depend on
    # the platform's default locale encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(markdown_string)
    return markdown_string

# Write one "<doc_kind>-<provider>.md" file per document that produced a
# result; documents with no stored result (or a None result) are skipped.
for doc_kind, doc_result in results.items():
    r = doc_result.get('result')
    if r is not None:
        pages = r.pages
        markdown = pages_to_markdown(
            pages=pages,
            output_file=f"{doc_kind}-{provider}.md",
        )

journal : /Users/nicholasgrundl/projects/ragnostic/data/journal/JOURNAL_2017_AerationCostsInStirredTankAndBubbleColumnBioreactors.pdf
- page count: 6




textbook : /Users/nicholasgrundl/projects/ragnostic/data/textbook/TEXT_DairyProcessingHandbook_WheyProcessingChapter15.pdf
- page count: 22


ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED"
  }
}

    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED"
  }
}

    Please check the status of your model provider API status.
    



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED"
  }
}

    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED"
  }
}

    Please check the status of your model provider API status.
    



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED"
  }
}

    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED"
  }
}

    Please check the status of your model provider API status.
    



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED"
  }
}

    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED"
  }
}

    Please check the status of your model provider API status.
    



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED"
  }
}

    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED"
  }
}

    Please check the status of your model provider API status.
    



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED"
  }
}

    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED"
  }
}

    Please check the status of your model provider API status.
    
ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



ERROR:root:Failed to process image Error:
    Error in Completion Response. Error: litellm.RateLimitError: litellm.RateLimitError: VertexAIException - {
  "error": {
    "code": 429,
    "message": "Resource has been exhausted (e.g. check quota).",
    "status": "RESOURCE_EXHAUSTED"
  }
}

    Please check the status of your model provider API status.
    



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



ERROR:root:Failed to process image Error:expected string or bytes-like object, got 'NoneType'


report : /Users/nicholasgrundl/projects/ragnostic/data/report/REPORT_ConsultancyOnLargeScaleSubmergedAerobicCultivationProcessDesignNRELGenomatica.pdf
- page count: 27

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

- failed to extract
article : /Users/nicholasgrundl/projects/ragnostic/data/article/REPORT_2003_OptimizePowerConsumptionInAerobicFermenters.pdf
- page count: 4

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

- failed to extract


{'total': {'start': 1737687481.858078,
  'end': 1737687511.871719,
  'elapsed': 0.5},
 'doc_kind': {'start': 1737687511.658027,
  'end': 1737687511.87159,
  'elapsed': 0.0}}