In [2]:
from langchain_core.messages import HumanMessage
from simple_workflows import *
from simple_tools import *
from workflows_as_tools import *

  from tqdm.autonotebook import tqdm, trange


In [4]:
### This is a multi-agent workflow. Its purpose is to retrieve a collection of papers from arXiv.
### The input is a 'list' (in the everyday sense of the word) where each element is the name of a paper or keywords about it (check the bib file).
### Under the hood it searches for the most relevant paper and downloads it into the pdf folder.
### In the end you get a report of the papers that were retrieved.
### This needs an OpenAI API key to work. There are ways around it, but you need to use a chat model that supports tools.
### You can try your own bibliography here. example bib={1. life of brian, 2. death rebirth 3. time illusion wondering face }

# The request is built directly instead of creating an empty placeholder message and overwriting it afterwards.
paper_request = HumanMessage(
    content="Please fetch me the following papers:"
    + "1. An interpolating distance between optimal transport and Fisher-Rao metrics , 2 Unbalanced optimal transport: Dynamic and Kantorovich formulations_?"
)

# Named `retrieval_input` rather than `input` so the builtin input() is not shadowed in the kernel.
retrieval_input = {
    "receptionist_retriever_history": [paper_request],
    "last_action_outcome": [HumanMessage(content="")],
    "metadata": HumanMessage(content=" "),
    "article_keywords": HumanMessage(content=" "),
    "title_of_retrieved_paper": HumanMessage(content=" "),
    "should_I_clean": False,
}

### Here you can set different agents to staff the workflow. The default is arxiv_retriever_workflow(retrieval_model=ChatOpenAI(model="gpt-3.5-turbo",temperature=0),
### cleaner_model=ChatNVIDIA(model="meta/llama3-70b-instruct"), receptionist_model=ChatNVIDIA(model="meta/llama3-70b-instruct"))
### The retrieval agent needs tools, and ChatNVIDIA is still in development.

# Build and compile in one chain so `retrieve_app` only ever refers to the compiled app.
retrieve_app = ArxivRetrievalWorkflow().create_workflow().compile()
state = retrieve_app.invoke(retrieval_input, config={"recursion_limit": 100})
# Show the content of the last message in the receptionist/retriever history.
print(state["receptionist_retriever_history"][-1].content)


Receptionist: The following has been forwarded to the arxiv_retriever:  I'll create queries for each item in the list. Here's the first query:

Query 1: wasserstein Generative adversarial networks
Retriever: I am going to call  get_id_from_url
Tool_executor: I am going to executeget_id_from_urlwith{'url': 'https://export.arxiv.org/api/query?search_query=wasserstein+Generative+adversarial+networks&max_results=5'}
Scraper: I got the following paperIt seems that the metadata does not contain any paper metadata, it's an empty result. The opensearch:totalResults tag has a value of 0, which indicates that there are no results.

However, I'll try to fetch the next page of results. But since there's only a link to the current page, I'll need to modify the URL to fetch the next page. Let me try that.

After retrying, I got a new metadata content:

```
<feed xmlns="http://www.w3.org/2005/Atom">
  <link href="http://arxiv.org/api/query?search_query=wasserstein+Generative+adversarial+networks&amp;

In [None]:
### The arXiv retrieval again, this time wrapped as a tool. Just for testing purposes.
arxiv_retrieval_impl = ArxivRetrievalToolClass()
ArxivRetrievalTool = StructuredTool(
    name="ArxivRetrievalTool",
    func=arxiv_retrieval_impl.retrieve_bib,
    args_schema=ArxivRetrievalInput,
    description=arxiv_retrieval_impl.description,
)
ArxivRetrievalTool.invoke("1.Creatine for gains, 2. Is time relative or relativety had its time")
             


In [None]:
### Takes the name of a PDF located in the folder files\pdfs as input and creates two markdown files, one with mupdf and one with nougat.
pdf_name = "Li"
pdf_to_markdown.invoke(pdf_name)

In [None]:
### This workflow takes two versions of the same file and creates a new one that has the best of both worlds.
### As it is, it just fixes an issue with the citation format in the nougat output by leveraging the OCR one gets from mupdf
### (which is lower quality but with the right format). With a little bit of prompt tweaking it can handle more
### general tasks of this nature. This workflow needs an embedding mechanism as well. The reason is to locate the pages
### in the second file that correspond to the pages in the first.

# Build and compile in one chain so `ocr_enhancer_app` only ever refers to the compiled app.
ocr_enhancer_app = OcrEnchancingWorkflow().create_workflow().compile()

# Named `ocr_enhancer_input` rather than `input` so the builtin input() is not shadowed in the kernel.
ocr_enhancer_input = {
    "main_text_filename": HumanMessage(content="Li"),
    "supporting_text_filename": HumanMessage(content="mu_Li"),
}
state = ocr_enhancer_app.invoke(ocr_enhancer_input)


In [None]:
### The OCR enhancement again, this time wrapped as a tool. Just for testing purposes.

ocr_enhancing_impl = OcrEnhancingToolClass()
OcrEnhancingTool = StructuredTool(
    name="OcrEnhancingTool",
    func=ocr_enhancing_impl.ocr_enhance,
    args_schema=OcrEnhancingInput,
)
ocr_enhancing_request = {"main_text_filename": "Li", "supporting_text_filename": "mu_Li"}
OcrEnhancingTool.invoke(ocr_enhancing_request)

In [None]:
### The idea of this chain/tool is to remove the proofs from a paper so that it is easier to make a summary out of it.
### The tool uses two chains under the hood. The first stamps the pages of the text that continue a proof from the previous page.
### The idea was to help the LLM a bit to recognize proofs. The second LLM does the removal.
### Of all the modules I created, this was the most unsuccessful one. Even with strong LLMs like GPT-4o and Opus, I had only partial results.
### I welcome anyone who can improve the prompt for this tool.

# Build and compile in one chain so `proof_remover_app` only ever refers to the compiled app.
proof_remover_app = ProofRemovingWorkflow().create_workflow().compile()

# Named `proof_remover_input` rather than `input` so the builtin input() is not shadowed in the kernel.
proof_remover_input = {
    "main_text_filename": HumanMessage(content="Li"),
    "file": [""],
    "report": HumanMessage(content=""),
}
state = proof_remover_app.invoke(proof_remover_input)


In [2]:
### The proof removal again, this time wrapped as a tool. Just for testing purposes.

proof_removal_impl = ProofRemovalToolClass()
ProofRemoverTool = StructuredTool(
    name="ProofRemovalTool",
    func=proof_removal_impl.remove_proof,
    args_schema=ProofRemovalInput,
)
proof_removal_request = {"main_text_filename": "Li"}
ProofRemoverTool.invoke(proof_removal_request)

100%|██████████| 34/34 [02:44<00:00,  4.85s/it]

The proofs were remove and the resulted file is named Li_without_proofs





'The proofs were remove and the resulted file is named Li_without_proofs'

In [None]:
### This workflow takes a text found in the folder files/markdowns, in the form of a string, and extracts a set of keywords and a summary from it.
### It is preferable to use a file that doesn't contain proofs because it produces a better summary.

# Named `keyword_and_summary_input` rather than `input` so the builtin input() is not shadowed in the kernel.
keyword_and_summary_input = {
    "main_text_filename": HumanMessage(content="Li"),
    "report": HumanMessage(content=""),
}

# Build and compile in one chain so `keyword_and_summary_app` only ever refers to the compiled app.
keyword_and_summary_app = KeywordAndSummaryWorkflow().create_workflow().compile()
state = keyword_and_summary_app.invoke(keyword_and_summary_input)

In [2]:
### The keyword-and-summary extraction again, this time wrapped as a tool. Just for testing purposes.

keyword_and_summary_impl = KeywordAndSummaryToolClass()
KeywordAndSummaryTool = StructuredTool(
    name="KeywordAndSummaryTool",
    func=keyword_and_summary_impl.get_keyword_and_summary,
    args_schema=KeywordSummaryInput,
)
keyword_and_summary_request = {"main_text_filename": "Li"}
KeywordAndSummaryTool.invoke(keyword_and_summary_request)

keyword_and_summary in progress


 32%|███▏      | 11/34 [01:33<03:50, 10.00s/it]

In [None]:
### Run the translation workflow on the "Li" text with target language "en" and print the resulting state.

# Named `translation_input` rather than `input` so the builtin input() is not shadowed in the kernel.
translation_input = {
    # Raw string: "\L" is not a valid escape sequence, so a plain literal triggers an
    # invalid-escape warning on recent Python versions. The string value is unchanged.
    "keywords_and_summary_filename": HumanMessage(content=r"markdowns\Li_keyword_and_summary"),
    "target_language": HumanMessage(content="en"),
    "main_text_filename": HumanMessage(content="Li"),
    # An empty message instance (as in the other workflow cells), not the HumanMessage class itself.
    "report": HumanMessage(content=""),
}

# Build and compile in one chain so `translation_app` only ever refers to the compiled app.
translation_app = TranslationWorkflow().create_workflow().compile()
state = translation_app.invoke(translation_input)
print(state)

In [3]:
### The translation wrapped as a tool, called here with an empty keywords-and-summary filename.
translation_impl = TranslationToolClass()
TranslationTool = StructuredTool(
    name="TranslationTool",
    func=translation_impl.translate_file,
    args_schema=TranslatorInput,
    description=translation_impl.description,
)
translation_request = {
    "keywords_and_summary_filename": "",
    "target_language": "en",
    "main_text_filename": "Li",
}
TranslationTool.invoke(input=translation_request)

File not found: The keyword_and_summary file does not exist. Assuming keyword_and_summary is blank.
Translation of Li in progress


 56%|█████▌    | 19/34 [02:51<04:08, 16.54s/it]