# Tagging and Extraction Using OpenAI functions

## Тэгирование

In [157]:
import os
from dotenv import load_dotenv, find_dotenv

from gigachat.models import Chat, Messages, MessagesRole, chat_completion

from langchain_gigachat.chat_models import GigaChat

_ = load_dotenv(find_dotenv())

api_key  = os.getenv('GIGACHAT_API_KEY')

model = GigaChat(credentials=api_key, verify_ssl_certs=False, temperature = 0)

In [3]:
from typing import List
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function

In [4]:
class Tagging(BaseModel):
    """Пометьте фрагмент текста определенной информацией"""
    sentiment: str = Field(description="тональность текста должна быть 'положительной', 'отрицательной' или 'нейтральной'")
    language: str = Field(description="язык текста (должен соответствовать коду ISO 639-1)")

In [5]:
convert_pydantic_to_openai_function(Tagging)

  convert_pydantic_to_openai_function(Tagging)


{'name': 'Tagging',
 'description': 'Пометьте фрагмент текста определенной информацией',
 'parameters': {'properties': {'sentiment': {'description': "тональность текста должна быть 'положительной', 'отрицательной' или 'нейтральной'",
    'type': 'string'},
   'language': {'description': 'язык текста (должен соответствовать коду ISO 639-1)',
    'type': 'string'}},
  'required': ['sentiment', 'language'],
  'type': 'object'}}

In [6]:
from langchain.prompts import ChatPromptTemplate

In [7]:
tagging_functions = [convert_pydantic_to_openai_function(Tagging)]

In [8]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Подумайте хорошенько, а затем пометьте текст в соответствии с инструкциями"),
    ("user", "{input}")
])

In [9]:
model_with_functions = model.bind(
    functions=tagging_functions,
    function_call={"name": "Tagging"}
)

In [10]:
tagging_chain = prompt | model_with_functions

In [11]:
tagging_chain.invoke({"input": "Я люблю LangChain"})

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Tagging', 'arguments': {'language': 'ru', 'sentiment': 'положительная'}}, 'functions_state_id': 'e7b303c4-8bc2-47b8-a362-6c70b239cacc'}, response_metadata={'token_usage': {'prompt_tokens': 140, 'completion_tokens': 37, 'total_tokens': 177}, 'model_name': 'GigaChat:1.0.26.20', 'finish_reason': 'function_call'}, id='run-0598fe1d-7f03-4ea1-8a38-eb793faecf10-0', tool_calls=[{'name': 'Tagging', 'args': {'language': 'ru', 'sentiment': 'положительная'}, 'id': '6f20e66f-086d-477b-b07a-930de6b4f141', 'type': 'tool_call'}])

In [12]:
#я не люблю эту еду
tagging_chain.invoke({"input": "non mi piace questo cibo"})

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Tagging', 'arguments': {'language': 'it', 'sentiment': 'negative'}}, 'functions_state_id': '4fc070f4-14d8-455a-8d93-420f08ff96d7'}, response_metadata={'token_usage': {'prompt_tokens': 144, 'completion_tokens': 36, 'total_tokens': 180}, 'model_name': 'GigaChat:1.0.26.20', 'finish_reason': 'function_call'}, id='run-d1197622-b6f6-4c1d-97f1-5f40444dbac6-0', tool_calls=[{'name': 'Tagging', 'args': {'language': 'it', 'sentiment': 'negative'}, 'id': '3e970ae9-b618-404c-b074-d2ae7959df46', 'type': 'tool_call'}])

In [13]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser, OutputFunctionsParser
from langchain.schema.output_parser import StrOutputParser

ImportError: cannot import name 'OutputFunctionsParser' from 'langchain.output_parsers.openai_functions' (c:\Users\Maksim\AppData\Local\Programs\Python\Python39\lib\site-packages\langchain\output_parsers\openai_functions.py)

In [None]:
#tagging_chain = prompt | model_with_functions | JsonOutputFunctionsParser()
tagging_chain = prompt | model_with_functions

In [15]:
tagging_chain.invoke({"input": "non mi piace questo cibo"})

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Tagging', 'arguments': {'language': 'it', 'sentiment': 'negative'}}, 'functions_state_id': '8b2aeaf2-eba2-4cf5-bcfd-b40c690fb13b'}, response_metadata={'token_usage': {'prompt_tokens': 144, 'completion_tokens': 36, 'total_tokens': 180}, 'model_name': 'GigaChat:1.0.26.20', 'finish_reason': 'function_call'}, id='run-628de4bc-9c35-4836-94e2-3f88802c6d9c-0', tool_calls=[{'name': 'Tagging', 'args': {'language': 'it', 'sentiment': 'negative'}, 'id': '3797aa1c-6de5-4364-8f61-03684ac5c8be', 'type': 'tool_call'}])

В дву последних ячейках предлагалось парсить аргументы от функции с помощью метода .JsonOutputFunctionsParser, но т.к. GigaChat возвращает сразу dict, парсинг не требуется (он и крашится), но нужно обращаться к аргументам вручную

## Extraction

Extraction is similar to tagging, but used for extracting multiple pieces of information.

In [62]:
from typing import Optional
class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="Имя человека")
    age: Optional[int] = Field(description="Возраст человека")

In [63]:
class Information(BaseModel):
    """Information to extract."""
    people: List[Person] = Field(description="Список информации о людях")

In [64]:
convert_pydantic_to_openai_function(Information)

{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'properties': {'people': {'description': 'Список информации о людях',
    'items': {'description': 'Information about a person.',
     'properties': {'name': {'description': 'Имя человека', 'type': 'string'},
      'age': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
       'description': "person's age"}},
     'required': ['name', 'age'],
     'type': 'object'},
    'type': 'array'}},
  'required': ['people'],
  'type': 'object'}}

In [65]:
extraction_functions = [{
    "name": "Information",
    "description": "List of people and their details",
    "parameters": {
        "type": "object",
        "properties": {
            "people": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string", "description": "Person's name"},
                        "age": {"type": "integer", "description": "Person's age"}
                    },
                    "required": ["name"]
                }
            }
        },
        "required": ["people"]
    }
}]

In [66]:
extraction_model = model.bind(functions=extraction_functions, function_call={"name": "Information"})

In [67]:
extraction_model.invoke("Джо 30 лет, его мама - Марта")

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Information', 'arguments': {'people': [{'age': 30, 'name': 'Джо'}, {'name': 'Марта'}]}}, 'functions_state_id': '1e6bbd44-d7bd-447f-8298-39c5864f7e1d'}, response_metadata={'token_usage': {'prompt_tokens': 106, 'completion_tokens': 67, 'total_tokens': 173}, 'model_name': 'GigaChat:1.0.26.20', 'finish_reason': 'function_call'}, id='run-1095a593-3fe1-4371-94c6-067e3f2c3b8e-0', tool_calls=[{'name': 'Information', 'args': {'people': [{'age': 30, 'name': 'Джо'}, {'name': 'Марта'}]}, 'id': 'b53c5734-f4e3-4978-b214-dd62f3ea13d2', 'type': 'tool_call'}])

Метод convert_pydantic_to_openai_function неправильно сформировал JSON (по крайней мере, GigaChat "не понимает" генерируемую структуру) для класса Pydantic со списком также класса Pydantic. Так что, видимо, такие функции нужно писать вручную

In [68]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Извлеките соответствующую информацию, если она не указана явно, не пытайтесь угадать. Извлеките частичную информацию"),
    ("human", "{input}")
])

In [69]:
extraction_chain = prompt | extraction_model

In [70]:
extraction_chain.invoke({"input": "Джо 30 лет, его мама - Марта"})

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Information', 'arguments': {'people': [{'age': 30, 'name': 'Джо'}, {'name': 'Марта'}]}}, 'functions_state_id': 'd9eefc4f-cd3c-47b8-a8ed-3317aebee73e'}, response_metadata={'token_usage': {'prompt_tokens': 138, 'completion_tokens': 67, 'total_tokens': 205}, 'model_name': 'GigaChat:1.0.26.20', 'finish_reason': 'function_call'}, id='run-968b1742-0e43-4e83-8d50-b913fff52dd3-0', tool_calls=[{'name': 'Information', 'args': {'people': [{'age': 30, 'name': 'Джо'}, {'name': 'Марта'}]}, 'id': '0664f46b-74db-42f4-9961-3e74ad084ffc', 'type': 'tool_call'}])

In [71]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

Метод .JsonKeyOutputFunctionsParser, по идее, должен выделить параметры по ключу, но в данном случае это вновь не работает. Всё по тем же причинам, что и ранее

In [72]:
#extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

## Doing it for real

Мы можем применить теги к большому объему текста.

Например, давайте загрузим эту запись в блоге и извлекем информацию о тегах из подмножества текста.

<span style='color:gray'>
We can apply tagging to a larger body of text.

For example, let's load this blog post and extract tag information from a sub-set of the text.
</span>

In [76]:
from langchain.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
documents = loader.load()

In [77]:
doc = documents[0]

In [78]:
page_content = doc.page_content[:10000]

In [79]:
print(page_content[:1000])







LLM Powered Autonomous Agents | Lil'Log







































Lil'Log

















|






Posts




Archive




Search




Tags




FAQ




emojisearch.app









      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


 


Table of Contents



Agent System Overview

Component One: Planning

Task Decomposition

Self-Reflection


Component Two: Memory

Types of Memory

Maximum Inner Product Search (MIPS)


Component Three: Tool Use

Case Studies

Scientific Discovery Agent

Generative Agents Simulation

Proof-of-Concept Examples


Challenges

Citation

References





Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful gene

In [80]:
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Предоставьте краткое изложение содержания.")
    language: str = Field(description="Укажите язык, на котором написан контент.")
    keywords: str = Field(description="Укажите ключевые слова, относящиеся к контенту.")

In [84]:
overview_tagging_function = [
    convert_pydantic_to_openai_function(Overview)
]
tagging_model = model.bind(
    functions=overview_tagging_function,
    function_call={"name":"Overview"}
)
tagging_chain = prompt | tagging_model

In [137]:
resp = tagging_chain.invoke({"input": page_content})

In [138]:
resp.additional_kwargs['function_call']

{'name': 'Overview',
 'arguments': {'keywords': "LLM Powered Autonomous Agents, Lil'Log",
  'language': 'en-US',
  'summary': 'Overview of a section of text.'}}

In [139]:
class Paper(BaseModel):
    """Информация об упомянутых документах."""
    title: str
    author: Optional[str]


class Info(BaseModel):
    """Информация для извлечения"""
    papers: List[Paper]

In [140]:
convert_pydantic_to_openai_function(Info)

{'name': 'Info',
 'description': 'Информация для извлечения',
 'parameters': {'properties': {'papers': {'items': {'description': 'Информация об упомянутых документах.',
     'properties': {'title': {'type': 'string'},
      'author': {'anyOf': [{'type': 'string'}, {'type': 'null'}]}},
     'required': ['title', 'author'],
     'type': 'object'},
    'type': 'array'}},
  'required': ['papers'],
  'type': 'object'}}

In [150]:
paper_extraction_function = [{
    "name": "Info",
    "description": "Список упомянутых документов",
    "parameters": {
        "type": "object",
        "properties": {
            "papers": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "title": {"type": "string"},
                        "author": {"type": "string"}
                    },
                    "required": ["title"]
                }
            }
        },
        "required": ["papers"]
    }
}]

In [154]:
#model = GigaChat(credentials=api_key, verify_ssl_certs=False, temperature = 0,
#                 model="GigaChat-Max")

In [158]:
extraction_model = model.bind(
    functions=paper_extraction_function, 
    function_call={"name":"Info"}
)
extraction_chain = prompt | extraction_model

In [159]:
extraction_chain.invoke({"input": page_content})

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Info', 'arguments': {'papers': []}}, 'functions_state_id': '40389175-6394-4014-b177-1aa3db5e8520'}, response_metadata={'token_usage': {'prompt_tokens': 2796, 'completion_tokens': 20, 'total_tokens': 2816}, 'model_name': 'GigaChat:1.0.26.20', 'finish_reason': 'function_call'}, id='run-694a27fa-8f12-4591-bc38-e38ac929fddf-0', tool_calls=[{'name': 'Info', 'args': {'papers': []}, 'id': '2d688284-8426-48ad-a974-22dcc6e0d57d', 'type': 'tool_call'}])

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Info', 'arguments': {'papers': [{'author': 'Wei et al. 2022', 'title': 'Chain of thought (CoT)'}, {'author': 'Yao et al. 2023', 'title': 'Tree of Thoughts'}, {'author': 'Liu et al. 2023', 'title': 'LLM+P'}, {'author': 'Yao et al. 2023', 'title': 'ReAct'}, {'author': 'Shinn & Labash 2023', 'title': 'Reflexion'}, {'author': 'Liu et al. 2023', 'title': 'Chain of Hindsight (CoH)'}, {'author': 'Laskin et al. 2023', 'title': 'Algorithm Distillation (AD)'}]}}, 'functions_state_id': '5cfb229e-f7b5-457d-94b2-c32a89022574'}, response_metadata={'token_usage': {'prompt_tokens': 2349, 'completion_tokens': 266, 'total_tokens': 2615}, 'model_name': 'GigaChat-Max:1.0.26.20', 'finish_reason': 'function_call'}, id='run-aff0dfff-d8ba-45be-9ae9-5cd6d8ad0176-0', tool_calls=[{'name': 'Info', 'args': {'papers': [{'author': 'Wei et al. 2022', 'title': 'Chain of thought (CoT)'}, {'author': 'Yao et al. 2023', 'title': 'Tree of Thoughts'}, {'author': 'Liu et al. 2023', 'title': 'LLM+P'}, {'author': 'Yao et al. 2023', 'title': 'ReAct'}, {'author': 'Shinn & Labash 2023', 'title': 'Reflexion'}, {'author': 'Liu et al. 2023', 'title': 'Chain of Hindsight (CoH)'}, {'author': 'Laskin et al. 2023', 'title': 'Algorithm Distillation (AD)'}]}, 'id': '10d1138f-a51d-4ad4-b7da-0b847618f6f1', 'type': 'tool_call'}])

Т.е. большая модель - справилась. А Lite - нет

In [95]:
template = """Вам будет передана статья. Извлеките из нее все статьи, которые упоминаются в этой статье, за которыми следует ее автор. 

Не извлекайте название самой статьи. Если статьи не упоминаются, это нормально - вам не нужно их извлекать! Просто верните пустой список.

Не придумывайте и не домысливайте какую-либо дополнительную информацию. Извлекайте только то, что точно содержится в тексте."""

prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

In [96]:
extraction_chain = prompt | extraction_model

In [97]:
extraction_chain.invoke({"input": page_content})

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Info', 'arguments': {'Paper': [{'author': 'Lilian Weng', 'title': 'LLM Powered Autonomous Agents'}]}}, 'functions_state_id': 'cef56e44-ee19-4a20-892f-c6930d6ac6fa'}, response_metadata={'token_usage': {'prompt_tokens': 2797, 'completion_tokens': 59, 'total_tokens': 2856}, 'model_name': 'GigaChat:1.0.26.20', 'finish_reason': 'function_call'}, id='run-c6174749-6be8-4123-adeb-64492ece142b-0', tool_calls=[{'name': 'Info', 'args': {'Paper': [{'author': 'Lilian Weng', 'title': 'LLM Powered Autonomous Agents'}]}, 'id': '29b5ef44-3b9a-4027-bc12-31310249a7d8', 'type': 'tool_call'}])

In [98]:
extraction_chain.invoke({"input": "hi"})

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Info', 'arguments': {}}, 'functions_state_id': 'bf50ec6e-2ba5-4f31-8926-0009fe3d7183'}, response_metadata={'token_usage': {'prompt_tokens': 176, 'completion_tokens': 11, 'total_tokens': 187}, 'model_name': 'GigaChat:1.0.26.20', 'finish_reason': 'function_call'}, id='run-10e78dfd-ac30-440d-a5e7-34cdf051ed03-0', tool_calls=[{'name': 'Info', 'args': {}, 'id': '56aad9ed-6424-40ec-a5b6-0e5324f54e2e', 'type': 'tool_call'}])

In [99]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

In [100]:
splits = text_splitter.split_text(doc.page_content)

In [101]:
len(splits)

15

In [102]:
def flatten(matrix):
    flat_list = []
    for row in matrix:
        flat_list += row
    return flat_list

In [103]:
flatten([[1, 2], [3, 4]])

[1, 2, 3, 4]

In [105]:
from langchain.schema.runnable import RunnableLambda

In [106]:
prep = RunnableLambda(
    lambda x: [{"input": doc} for doc in text_splitter.split_text(x)]
)

In [107]:
prep.invoke("hi")

[{'input': 'hi'}]

In [119]:
prep.invoke(splits[1])

[{'input': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or

In [171]:
chain = prep | extraction_chain.map() | flatten

In [173]:
chain.invoke(doc.page_content[:100])

[('content', ''),
 ('additional_kwargs',
  {'function_call': {'name': 'Info', 'arguments': {}},
   'functions_state_id': '93f6a69c-c625-4a3b-b0ea-bf72b22569ac'}),
 ('response_metadata',
  {'token_usage': {'prompt_tokens': 231,
    'completion_tokens': 11,
    'total_tokens': 242},
   'model_name': 'GigaChat:1.0.26.20',
   'finish_reason': 'function_call'}),
 ('type', 'ai'),
 ('name', None),
 ('id', 'run-669ac8de-7e7d-4e91-9ff3-bed0a0f0b27d-0'),
 ('example', False),
 ('tool_calls',
  [{'name': 'Info',
    'args': {},
    'id': 'd73e0df3-732c-4530-bc7f-7d025e157d1a',
    'type': 'tool_call'}]),
 ('invalid_tool_calls', []),
 ('usage_metadata', None)]

Хотелось бы верить, что более мощная модель справится с этим пайплайном, но GigaChat-Lite не справился :(
+ ограничение на кол-во запросов