In [None]:
import os
import json
import time
from dotenv import load_dotenv
from langchain_mistralai import ChatMistralAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
load_dotenv()

from langchain_core.rate_limiters import InMemoryRateLimiter

# Client-side throttle that is passed to the chat model constructed below.
rate_limiter = InMemoryRateLimiter(
    requests_per_second=0.3,  # <-- Slow: 0.3 requests/second is roughly one request every ~3.3 seconds.
    check_every_n_seconds=0.1,  # Wake up every 100 ms to check whether a request is allowed.
    max_bucket_size=10,  # Controls the maximum burst size.
)

# Pull the Mistral credential from the process environment and fail fast
# when it is missing or empty, before any model object is built.
api_key = os.getenv("MISTRAL_API_KEY")
if not api_key:
    raise ValueError("MISTRAL_API_KEY is not set or could not be loaded!")

# Chat model used by every chain in this notebook; requests go through the
# shared rate limiter defined above.
model = ChatMistralAI(
    model="mistral-large-latest",
    api_key=api_key,
    rate_limiter=rate_limiter,
)

from typing import List
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function

from langchain.document_loaders import WebBaseLoader
# Load the blog post at this URL; `load()` returns a list of documents.
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
documents = loader.load()

# Keep only the first loaded document; later cells read its `page_content`.
doc = documents[0]
#page_content = doc.page_content
#page_content = doc.page_content[:10000]

from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter configured with no overlap between consecutive chunks; the chunk
# size is left at the library default.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)
# NOTE(review): `splits` is not referenced again in the visible cells — the
# `prep` runnable below re-splits the text itself. Kept for inspection only?
splits = text_splitter.split_text(doc.page_content)

def flatten(matrix):
    """Concatenate the items of every row of ``matrix`` into one new list.

    Each row is iterated exactly once, in order, so a row may be any
    iterable (list, tuple, string, dict keys, ...). Only one level of
    nesting is removed.

    Args:
        matrix: An iterable whose elements are themselves iterable.

    Returns:
        A new list holding every item of every row, in encounter order.

    Raises:
        TypeError: If a row is not iterable.
    """
    return [item for row in matrix for item in row]

# Each chunk is plain text, but the chain expects a dict, so we have to wrap each chunk in a dict.
# The input text is split, and every chunk becomes one {"input": ...} dict in a list.

from langchain.schema.runnable import RunnableLambda
# Runnable that splits the incoming text with `text_splitter` and wraps every
# chunk as {"input": chunk}, the shape the prompt template expects.
prep = RunnableLambda(
    lambda text: [{"input": chunk} for chunk in text_splitter.split_text(text)]
)


from typing import Optional
class Paper(BaseModel):
    """Information about papers mentioned."""
    # NOTE(review): the docstring above is kept verbatim on purpose — it is
    # presumably forwarded to the model as the schema description by
    # convert_pydantic_to_openai_function; confirm before rewording it.

    # Title of the paper as it appears in the text (always required).
    title: str
    # Author(s), when the text names them. The explicit `None` default makes
    # the field genuinely optional under pydantic v2, where `Optional[str]`
    # without a default is "required but nullable"; pydantic v1 already
    # behaved this way implicitly, so this is backward-compatible.
    author: Optional[str] = None


class Info(BaseModel):
    """Information to extract"""
    # Top-level extraction schema: this is the model converted into the
    # function/tool definition bound to the chat model below.
    # NOTE(review): the docstring is left untouched because it is presumably
    # used as the tool description sent to the model — confirm before editing.

    # All papers found in one chunk of text; each entry follows `Paper`.
    papers: List[Paper]

# Function/tool definitions derived from the `Info` pydantic schema
# (a one-element list, since only one extraction schema is used).
paper_extraction_function = [
    convert_pydantic_to_openai_function(Info)
]

# Chat model with the extraction tool attached, so responses can carry
# tool calls whose arguments follow the `Info` schema.
extraction_model = model.bind_tools(
    tools=paper_extraction_function, 
)

# System prompt for the extraction step. It is sent to the model verbatim, so
# any wording change here alters the request and may change the results.
template = """A article will be passed to you. Extract from it all papers that are mentioned by this article. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

# Two-message prompt: the fixed system instructions from `template`, followed
# by a human message filled from the "input" key of each chain input.
prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])


In [37]:
# Single-chunk pipeline: format the prompt, then call the tool-bound model.
extraction_chain = prompt | extraction_model


In [38]:
# Full pipeline: split the text into per-chunk inputs (`prep`), run the
# extraction chain on every chunk (`.map()`), then merge the per-chunk results
# with `flatten`.
# NOTE(review): `flatten` iterates each per-chunk result; the parsing cell
# further down indexes the resulting elements as 2-item tuples, so each model
# message appears to be expanded into (field_name, value) pairs — confirm this
# is intended before changing either cell.
chain = prep | extraction_chain.map() | flatten


In [39]:
# Run the whole pipeline on the article text. This issues one model request
# per chunk, each throttled by the rate limiter attached to the model.
ai_response = chain.invoke(doc.page_content)

In [43]:
# Sanity check on the container type returned by the chain (a list, per the
# output recorded below).
print(type(ai_response))

<class 'list'>


In [47]:
# Collect every paper reported through tool calls across all responses.
all_papers = []

# Each element of `ai_response` is treated as a 2-item sequence whose payload
# sits at index 1.
# NOTE(review): these look like (field_name, value) pairs produced when
# `flatten` iterates each model message — confirm against the chain definition.
for output_tuple in ai_response:
    output = output_tuple[1]

    # Only a dict payload can carry a 'tool_calls' entry. The explicit type
    # check matters: a plain string that merely contains the text "tool_calls"
    # would pass a bare `in` test and then fail on the subscript below.
    if not isinstance(output, dict):
        continue

    # `or []` covers both a missing key and an explicit None value.
    for tool_call in output.get('tool_calls') or []:
        # The function arguments arrive as a JSON-encoded string holding the
        # `Info` payload; malformed JSON still raises so it is not hidden.
        arguments = json.loads(tool_call['function']['arguments'])
        # `or []` also tolerates an explicit `"papers": null` in the payload.
        papers = arguments.get('papers') or []
        all_papers.extend(papers)

In [48]:
# Optional: drop repeated papers, keeping the first entry seen for each title.
seen_titles = set()
unique_papers = []

for paper in all_papers:
    title = paper.get('title')
    if title in seen_titles:
        # A paper with this exact title was already kept; skip the repeat.
        continue
    seen_titles.add(title)
    unique_papers.append(paper)

# Show the de-duplicated papers as pretty-printed JSON.
print(json.dumps(unique_papers, indent=2))

[
  {
    "title": "Chain of thought",
    "author": "Wei et al"
  },
  {
    "title": "Tree of Thoughts",
    "author": "Yao et al"
  },
  {
    "title": "LLM+P",
    "author": "Liu et al"
  },
  {
    "title": "ReAct",
    "author": "Yao et al"
  },
  {
    "title": "Reflexion",
    "author": "Shinn & Labash"
  },
  {
    "title": "Chain of Hindsight",
    "author": "Liu et al."
  },
  {
    "title": "Algorithm Distillation",
    "author": "Laskin et al."
  },
  {
    "title": "Laskin et al. 2023",
    "author": null
  },
  {
    "title": "Duan et al. 2017",
    "author": null
  },
  {
    "title": "Miller 1956",
    "author": null
  },
  {
    "title": "MRKL",
    "author": "Karpas et al. 2022"
  },
  {
    "title": "TALM",
    "author": "Parisi et al. 2022"
  },
  {
    "title": "Toolformer",
    "author": "Schick et al. 2023"
  },
  {
    "title": "HuggingGPT",
    "author": "Shen et al. 2023"
  },
  {
    "title": "API-Bank",
    "author": "Li et al. 2023"
  },
  {
    "title": "

Note: Instead of parsing the responses by hand, we could ask the model itself to extract the information from `ai_response`.