In [1]:
from langchain_core.messages import HumanMessage
from simple_workflows import *
from simple_tools import *
from workflows_as_tools import *
from dotenv import load_dotenv
import os
load_dotenv()


  from tqdm.autonotebook import tqdm, trange


True

In [2]:
### Multi-agent workflow whose purpose is to retrieve a collection of papers from arXiv.
### The input is a "list" (in the everyday sense of the word): each element is the name of a paper
### or a few keywords around it (check the bib file).
### Under the hood it searches for the most relevant paper and downloads it into the pdf folder.
### At the end you get a report of the papers that were retrieved.
### This needs an OpenAI API key to work. There are ways around it, but you need a chat model that supports tools.
### You can try your own bibliography here, e.g. bib={1. life of brian, 2. death rebirth, 3. time illusion wondering face}

# Initial workflow state. The first history entry carries the bibliography request;
# the remaining fields start out blank.
input = {
    "receptionist_retriever_history": [
        HumanMessage(
            content="Please fetch me the following papers:"
            + "1. An interpolating distance between optimal transport and Fisher-Rao metrics , 2 Unbalanced optimal transport: Dynamic and Kantorovich formulations_?"
        )
    ],
    "last_action_outcome": [HumanMessage(content="")],
    "metadata": HumanMessage(content=" "),
    "article_keywords": HumanMessage(content=" "),
    "title_of_retrieved_paper": HumanMessage(content=" "),
    "should_I_clean": False,
}

### Here you can set different agents to staff the workflow. The default is
### arxiv_retriever_workflow(retrieval_model=ChatOpenAI(model="gpt-3.5-turbo",temperature=0),
### cleaner_model=ChatNVIDIA(model="meta/llama3-70b-instruct"), receptionist_model=ChatNVIDIA(model="meta/llama3-70b-instruct")).
### The retrieval agent needs tools, and ChatNVIDIA is still in development.

# Build the graph, compile it, and run it with a raised recursion limit.
retrieve_app = ArxivRetrievalWorkflow().create_workflow().compile()
state = retrieve_app.invoke(input, {"recursion_limit": 100})

# Show the last message recorded in the receptionist/retriever history.
print(state["receptionist_retriever_history"][-1].content)


Receptionist: The following has been forwarded to the arxiv_retriever:  I'll create queries for each paper and send them to the retriever.

Here's the first query:

`"An interpolating distance between optimal transport and Fisher-Rao metrics"`

Please wait for the response before I proceed to the next query.
Retriever: I am going to call  get_id_from_url
Tool_executor: I am going to executeget_id_from_urlwith{'url': 'https://export.arxiv.org/api/query?search_query=An_interpolating_distance_between_optimal_transport_and_Fisher-Rao_metrics&max_results=5'}
Scraper: I got an error, going back to the arxiv_retriever
Retriever:I am reporting back to the arxiv_receptionist withNo paper found for the query. Let's try with a different query.
Reporting to receptionist
Receptionist: The following has been forwarded to the arxiv_retriever:  Here's a revised query:

`"interpolating distance between optimal transport Fisher-Rao metrics"`

Please wait for the response.
Retriever: I am going to call  

In [None]:
### Same as before, but in the form of a tool. Just for testing purposes.
ArxivRetrievalTool = ArxivRetrievalToolClass()
# Rebind the name to a StructuredTool that wraps the instance's retrieve_bib method.
ArxivRetrievalTool = StructuredTool(
    name="ArxivRetrievalTool",
    func=ArxivRetrievalTool.retrieve_bib,
    args_schema=ArxivRetrievalInput,
    description=ArxivRetrievalTool.description,
)
ArxivRetrievalTool.invoke(
    "1.Creatine for gains, 2. Is time relative or relativety had its time"
)
             


In [None]:
### Takes the name of a PDF located in the folder files\pdfs as input and creates two markdown files, one with mupdf and one with nougat.
pdf_to_markdown.invoke("Li")

In [None]:
### This workflow takes two versions of the same file and creates a new one that has the best of both worlds.
### As it is, it only fixes an issue with nougat's citation format by leveraging the OCR one gets from mupdf
### (which is lower quality but has the right format). With a little prompt tweaking it can handle more
### general tasks of this nature. It needs an embedding mechanism as well, in order to locate the pages
### in the second file that correspond to the pages in the first.

# Main text plus the supporting text used to repair it.
input = {
    "main_text_filename": HumanMessage(content="Li"),
    "supporting_text_filename": HumanMessage(content="mu_Li"),
}

# Build, compile, and run the workflow.
ocr_enhancer_app = OcrEnchancingWorkflow().create_workflow().compile()
state = ocr_enhancer_app.invoke(input)


In [None]:
### Same as before, but in the form of a tool. Just for testing purposes.

OcrEnhancingTool = OcrEnhancingToolClass()
# Rebind the name to a StructuredTool that wraps the instance's ocr_enhance method.
OcrEnhancingTool = StructuredTool(
    name="OcrEnhancingTool",
    func=OcrEnhancingTool.ocr_enhance,
    args_schema=OcrEnhancingInput,
)
OcrEnhancingTool.invoke(
    {
        "main_text_filename": "Li",
        "supporting_text_filename": "mu_Li",
    }
)

In [None]:
### The idea of this chain/tool is to remove the proofs from a paper so that it is easier to summarize.
### It uses two chains under the hood. The first stamps the pages of the text that continue a proof from the previous page;
### the idea is to help the LLM a bit in recognizing proofs. The second LLM does the removal.
### Of all the modules I created, this was the most unsuccessful one. Even with strong LLMs like GPT-4o and Opus, I got only partial results.
### I welcome anyone who can improve the prompt for this tool.

# Initial state: the file to process, a one-element placeholder list, and a blank report.
input = {
    "main_text_filename": HumanMessage(content="Li"),
    "file": [""],
    "report": HumanMessage(content=""),
}

# Build, compile, and run the workflow.
proof_remover_app = ProofRemovingWorkflow().create_workflow().compile()
state = proof_remover_app.invoke(input)


In [None]:
### Same as before, but in the form of a tool. Just for testing purposes.

ProofRemoverTool = ProofRemovalToolClass()
# Rebind the name to a StructuredTool that wraps the instance's remove_proof method.
ProofRemoverTool = StructuredTool(
    name="ProofRemovalTool",
    func=ProofRemoverTool.remove_proof,
    args_schema=ProofRemovalInput,
)
ProofRemoverTool.invoke(
    {"main_text_filename": "Li"}
)

100%|██████████| 34/34 [02:44<00:00,  4.85s/it]

The proofs were remove and the resulted file is named Li_without_proofs





'The proofs were remove and the resulted file is named Li_without_proofs'

In [None]:
### This workflow takes a text found in the folder files/markdowns and extracts a set of keywords and a summary, in the form of a string.
### It is preferable to use a file that doesn't contain proofs, because that produces a better summary.

# Initial state: the file to summarize and a blank report.
input = {
    "main_text_filename": HumanMessage(content="Li"),
    "report": HumanMessage(content=""),
}

# Build, compile, and run the workflow.
keyword_and_summary_app = KeywordAndSummaryWorkflow().create_workflow().compile()
state = keyword_and_summary_app.invoke(input)

In [None]:
### Same as before, but in the form of a tool. Just for testing purposes.

KeywordAndSummaryTool = KeywordAndSummaryToolClass()
# Rebind the name to a StructuredTool that wraps the instance's get_keyword_and_summary method.
KeywordAndSummaryTool = StructuredTool(
    name="KeywordAndSummaryTool",
    func=KeywordAndSummaryTool.get_keyword_and_summary,
    args_schema=KeywordSummaryInput,
)
KeywordAndSummaryTool.invoke(
    {"main_text_filename": "Li"}
)

keyword_and_summary in progress


 32%|███▏      | 11/34 [01:33<03:50, 10.00s/it]

In [4]:
### This workflow translates the text found in files/markdowns/main_text_filename to the target language.
### It uses an auxiliary file for context-based translation.

input = {
    "auxilary_text_filename": HumanMessage(content="Li_keyword_and_summary"),
    "target_language": HumanMessage(content="en"),
    "main_text_filename": HumanMessage(content="Li"),
    # `report` must be a message instance, not the HumanMessage class itself;
    # it starts out blank, as in the other workflow cells.
    "report": HumanMessage(content=""),
}

# Build, compile, and run the workflow, then dump the final state.
translation_app = TranslationWorkflow().create_workflow().compile()
state = translation_app.invoke(input)
print(state)

AttributeError: 'NoneType' object has no attribute 'content'

In [None]:
### Translation in the form of a tool: a StructuredTool wrapping the instance's translate_file method.
TranslationTool = TranslationToolClass()

TranslationTool = StructuredTool(
    name="TranslationTool",
    func=TranslationTool.translate_file,
    args_schema=TranslatorInput,
    description=TranslationTool.description,
)
# NOTE(review): the workflow cell uses the key "auxilary_text_filename", while this call passes
# "keywords_and_summary_filename" (empty) -- confirm which key TranslatorInput expects.
TranslationTool.invoke(
    input={
        "keywords_and_summary_filename": "",
        "target_language": "en",
        "main_text_filename": "Li",
    }
)

In [None]:
### This workflow takes a text found in the folder files/markdowns and creates a set of citations.
### An extraction type should be provided (all of them / most important).
### An auxiliary text can be provided, which helps judge whether a citation matches the extraction type.

input = {
    "auxilary_text_filename": HumanMessage(content="Li_keyword_and_summary"),
    "extraction_type": HumanMessage(content="All of them"),
    "main_text_filename": HumanMessage(content="Li"),
    "report": HumanMessage(content=""),
}

# Build, compile, and run the workflow, then dump the final state.
citation_extraction_app = CitationExtractionWorkflow().create_workflow().compile()
state = citation_extraction_app.invoke(input)
print(state)

In [None]:
### Citation extraction in the form of a tool: a StructuredTool wrapping the instance's extract_citations method.
CitationExtractionTool = CitationExtractionToolClass()

CitationExtractionTool = StructuredTool(
    name="CitationExtractionTool",
    func=CitationExtractionTool.extract_citations,
    args_schema=CitationExtractorInput,
    description=CitationExtractionTool.description,
)

CitationExtractionTool.invoke(
    input={
        "main_text_filename": "Li",
        "extraction_type": "All of them",
        "auxilary_text_filename": "Li_keyword_and_summary",
    }
)

In [None]:
### Skims a text and brings back a very quick report. This will only be used as a tool for the manager, so that it does not
### request a tool that corresponds to a heavy workflow when this is not necessary. It takes a peek at the file to get an idea.
### Also useful for taking citation files and sending them to the arxiv extractor.

# Initial state: the file to skim and a blank report.
input = {
    "main_text_filename": HumanMessage(content="Li"),
    "report": HumanMessage(content=""),
}

# Build, compile, and run the workflow, then dump the final state.
take_a_peak_app = TakeAPeakWorkflow().create_workflow().compile()
state = take_a_peak_app.invoke(input)
print(state)