In [146]:
!pip install langchain-openai langchain langchain-anthropic scikit-learn



In [133]:
import json
with open("../message.txt", "r") as file:
    eval_messages = json.load(file)

In [139]:
import json
import langchain

from typing import Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic


from typing import Optional

from langchain_core.pydantic_v1 import BaseModel, Field
import uuid
from typing import Dict, List, TypedDict

from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    SystemMessage,
    ToolMessage,
)
from langchain_core.pydantic_v1 import BaseModel, Field


class RealEstateParameters(BaseModel):
    rent: Optional[int] = Field(..., description="The monthly rent (Nájem) in CZK. If not specified, return null.")
    deposit: Optional[float] = Field(..., description="The deposit (Kauce) in CZK. If not specified, return null.")
    services: Optional[int] = Field(..., description="The monthly services (Služby) in CZK. If not specified, return null.")
    energy_included: Optional[bool] = Field(..., description="Whether the energy (Energie) is included in the rent. If not specified, return null.")
    agency_fee: Optional[bool] = Field(..., description="Whether the real estate agent fee (Poplatek Realitní kanceláři) must be paid. If not specified, return null.")


examples = [
    (
        '+ 750 Kč poplatky na osobu, el. a plyn se převádí na nájemce, kauce a provize',
        RealEstateParameters(rent=38000, deposit=38000, services=750, energy_included=False, agency_fee=True)
    ),
    (
        '+ 3 800 Kč poplatky pro dvě osoby, elektřina se převádí na nájemce, kauce 34 000 Kč, provize',
        RealEstateParameters(rent=16000, deposit=34000, services=3800, energy_included=False, agency_fee=True)
    ),
    (
        '+popl. 4000, Cena za všechny služby a energie',
        RealEstateParameters(rent=15000, deposit=None, services=4000, energy_included=True, agency_fee=False)
    )
]
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are prefessional real estate agent."
            "You will be given a real estate ad and you will need to extract the following information: "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value."
        ),
        MessagesPlaceholder("examples"),
        ("human", "{text}"),
    ]
)



class Example(TypedDict):
    """A representation of an example consisting of text input and expected tool calls.

    For extraction, the tool calls are represented as instances of pydantic model.
    """

    input: str  # This is the example text
    tool_calls: List[BaseModel]  # Instances of pydantic model that should be extracted


def tool_example_to_messages(example: Example) -> List[BaseMessage]:
    """Convert an example into a list of messages that can be fed into an LLM.

    This code is an adapter that converts our example to a list of messages
    that can be fed into a chat model.

    The list of messages per example corresponds to:

    1) HumanMessage: contains the content from which content should be extracted.
    2) AIMessage: contains the extracted information from the model
    3) ToolMessage: contains confirmation to the model that the model requested a tool correctly.

    The ToolMessage is required because some of the chat models are hyper-optimized for agents
    rather than for an extraction use case.
    """
    messages: List[BaseMessage] = [HumanMessage(content=example["input"])]
    openai_tool_calls = []
    for tool_call in example["tool_calls"]:
        openai_tool_calls.append(
            {
                "id": str(uuid.uuid4()),
                "type": "function",
                "function": {
                    # The name of the function right now corresponds
                    # to the name of the pydantic model
                    # This is implicit in the API right now,
                    # and will be improved over time.
                    "name": tool_call.__class__.__name__,
                    "arguments": tool_call.json(),
                },
            }
        )
    messages.append(
        AIMessage(content="", additional_kwargs={"tool_calls": openai_tool_calls})
    )
    tool_outputs = example.get("tool_outputs") or [
        "You have correctly called this tool."
    ] * len(openai_tool_calls)
    for output, tool_call in zip(tool_outputs, openai_tool_calls):
        messages.append(ToolMessage(content=output, tool_call_id=tool_call["id"]))
    return messages

messages = []

for text, tool_call in examples:
    messages.extend(
        tool_example_to_messages({"input": text, "tool_calls": [tool_call]})
    )


llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, api_key="YOUR_API_KEY_HERE")

runnable = prompt | llm.with_structured_output(schema=RealEstateParameters)

In [138]:

runnable.invoke({"text": eval_messages[0], "examples": messages})

ParametryNemovitosti(rent=28000, deposit=None, services=6000, energy_included=False, agency_fee=False)

In [141]:
def extract_kauce(kauce:str, rent: int) -> Optional[float]:
    """Extract the deposit (Kauce) from the given text.

    The deposit is expected to be a number, but it can also be a string
    that represents a number (e.g. "1.5" for 1.5x rent).

    If the deposit is not found, return None.
    """

    if kauce is not None:
        if isinstance(kauce, str):
            if kauce.endswith("rent"):
                try:
                    multiplier = float(kauce.replace("*rent", ""))
                    return rent * multiplier
                except ValueError:
                    return None
            else:
                try:
                    return float(kauce)
                except ValueError:
                    return None
        else:
            return None
    else:
        return None


eval_offers = [
    RealEstateParameters(
        rent=sample["rent"],
        deposit=extract_kauce(sample["deposit"], sample["rent"]),
        services=sample["services"],
        energy_included=sample["energy"],
        agency_fee=sample["rk"],
    )
    for sample in eval_messages
]

eval_ads = [
    sample["add"]
    for sample in eval_messages
]

In [142]:
predicted_offers = [runnable.invoke({"text": message, "examples": messages}) for message in eval_messages]


In [None]:
from sklearn.metrics import precision_recall_fscore_support

def evaluate(y_pred, y_true):
    y_true = [{k: -1 if v is None else v for k, v in sample.items()} for sample in y_true]
    y_pred = [{k: -1 if v is None else v for k, v in sample.items()} for sample in y_pred]

    # Calculate precision, recall, and f1-score for each label
    precision_list = []
    recall_list = []
    f1_list = []
    
    precision_list = []
    recall_list = []
    f1_list = []

    for label in y_true[0].keys():
        true_label_values = [sample[label] for sample in y_true]
        pred_label_values = [sample[label] for sample in y_pred]
        precision, recall, f1, _ = precision_recall_fscore_support(true_label_values, pred_label_values, average='weighted')
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    avg_precision = sum(precision_list) / len(precision_list)
    avg_recall = sum(recall_list) / len(recall_list)
    avg_f1 = sum(f1_list) / len(f1_list)
    
    # Calculate and return the average precision, recall, and f1-score
    avg_precision = sum(precision_list) / len(precision_list)
    avg_recall = sum(recall_list) / len(recall_list)
    avg_f1 = sum(f1_list) / len(f1_list)
    
    return {"average_precision": avg_precision, "average_recall": avg_recall, "average_f1": avg_f1}

In [143]:
predicted_offers

[RealEstateParameters(rent=28000, deposit=None, services=6000, energy_included=False, agency_fee=False),
 RealEstateParameters(rent=25000, deposit=None, services=4500, energy_included=False, agency_fee=False),
 RealEstateParameters(rent=38000, deposit=None, services=750, energy_included=False, agency_fee=True),
 RealEstateParameters(rent=16000, deposit=34000.0, services=3800, energy_included=False, agency_fee=True),
 RealEstateParameters(rent=15500, deposit=31000.0, services=950, energy_included=False, agency_fee=True),
 RealEstateParameters(rent=17900, deposit=None, services=3033, energy_included=False, agency_fee=True),
 RealEstateParameters(rent=5500, deposit=None, services=1800, energy_included=False, agency_fee=False),
 RealEstateParameters(rent=37500, deposit=None, services=8000, energy_included=True, agency_fee=True),
 RealEstateParameters(rent=36000, deposit=None, services=8323, energy_included=False, agency_fee=True),
 RealEstateParameters(rent=11500, deposit=None, services=43

In [144]:
eval_offers

[RealEstateParameters(rent=28000, deposit=None, services=6000, energy_included=False, agency_fee=False),
 RealEstateParameters(rent=25000, deposit=None, services=4500, energy_included=False, agency_fee=False),
 RealEstateParameters(rent=38000, deposit=None, services=750, energy_included=False, agency_fee=True),
 RealEstateParameters(rent=16000, deposit=None, services=3800, energy_included=False, agency_fee=True),
 RealEstateParameters(rent=15500, deposit=31000.0, services=950, energy_included=False, agency_fee=True),
 RealEstateParameters(rent=17900, deposit=None, services=3033, energy_included=False, agency_fee=True),
 RealEstateParameters(rent=5500, deposit=None, services=1800, energy_included=False, agency_fee=False),
 RealEstateParameters(rent=37500, deposit=None, services=8000, energy_included=True, agency_fee=True),
 RealEstateParameters(rent=36000, deposit=None, services=8323, energy_included=False, agency_fee=True),
 RealEstateParameters(rent=11500, deposit=None, services=4300,

In [145]:
evaluate([
    x.dict() for x in predicted_offers
], [
    x.dict() for x in eval_offers
])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'average_precision': 1.0,
 'average_recall': 0.967741935483871,
 'average_f1': 0.9819954988747186}

In [22]:
!pip install 

Collecting scikit-learn
  Downloading scikit_learn-1.4.1.post1-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.12.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (217 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m217.9/217.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.3.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.4.1.post1-cp312-cp312-macosx_12_0_arm64.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hUsing cached joblib-1.3.2-py3-none-any.whl (302 kB)
Downloading scipy-1.12.0-cp312-cp312-macosx_12_0_arm64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [24]:
!pip install pandas

Collecting pandas
  Using cached pandas-2.2.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.1-cp312-cp312-macosx_11_0_arm64.whl (11.3 MB)
Using cached pytz-2024.1-py2.py3-none-any.whl (505 kB)
Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.1 pytz-2024.1 tzdata-2024.1


In [34]:
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd

precision_recall_fscore_support([1,2], [0,0], average='weighted')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.0, 0.0, 0.0, None)

In [25]:
examples

[('+ 750 Kč poplatky na osobu, el. a plyn se převádí na nájemce, kauce a provize',
  RealEstateAd(nájem=38000, kauce=1.0, služby=750, včetně_energií=False, poplatek_realitní_kanceláři=True)),
 ('+ 3 800 Kč poplatky pro dvě osoby, elektřina se převádí na nájemce, kauce 34 000 Kč, provize',
  RealEstateAd(nájem=16000, kauce=2.125, služby=3800, včetně_energií=False, poplatek_realitní_kanceláři=True)),
 ('+popl. 4000, Cena za všechny služby a energie',
  RealEstateAd(nájem=15000, kauce=None, služby=4000, včetně_energií=True, poplatek_realitní_kanceláři=False))]

In [70]:
examples[2]["rent"]

34900

In [42]:
from langchain.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)
import json

def create_example():
    # This is a prompt template used to format each individual example.
    import random
    examples = random.sample(eval_messages, 3)

    def example_to_message(example):
        return (
f"""\
Input: {json.dumps({ "nájem": example["nájem"], "kauce": example["kauce"], "služby": example["služby"], "včetně_energií": example["včetně_energií"], "poplatek_realitní_kanceláři": example["poplatek_realitní_kanceláři"] })}
Output: {example["ad"]}
"""
)
    example_messages = "\n\n".join(example_to_message(example) for example in examples)
    print(example_messages)
    final_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", 
"""\
You are an profesional real estate agent. 

# Task
You will be given a real estate parameters and your task is to create a real estate ad in Czech.

# Input Format
You will be given a JSON object with the following parameters:
- nájem: The monthly rent (Nájem) in CZK.
- kauce: The deposit (Kauce) in terms of multiple of rent.
- služby: The monthly services (Služby) in CZK.
- včetně_energií: Whether the energy (Energie) is included in the rent.
- poplatek_realitní_kanceláři: Whether the real estate agent fee (Poplatek Realitní kanceláři) must be paid.

# Output Format
You must output the ad in Czech. The ad must be in the the similar format as examples.

# Examples
{examples}


You must exactly follow the parameters and include all of them that are filled. You must output the ad in Czech. The ad must be in the same format as Examples.\
"""),
            ("human", "{input}"),
        ]
    )
    from langchain_community.chat_models import ChatOpenAI, ChatAnthropic

    llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0.7, api_key="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")
    chain = final_prompt | llm


    def generate_random_real_estate_parameters():
        rent_options = list(range(15000, 35001, 500))
        deposit_options = [None, "1*rent", "2*rent", "3*rent"] + list(range(15000, 35001, 500))
        services_options = list(range(500, 15001, 250))
        energy_options = [True, False]
        rk_options = [True, False]
        
        rent = random.choice(rent_options)
        deposit = random.choice(deposit_options)
        services = random.choice(services_options)
        energy = random.choice(energy_options)
        rk = random.choice(rk_options)
        
        real_estate_parameters = {
            "nájem": rent,
            "kauce": deposit,
            "služby": services,
            "včetně_energií": energy,
            "poplatek_realitní_kanceláři": rk
        }
        
        return json.dumps(real_estate_parameters)
    query = generate_random_real_estate_parameters()
    return query, chain.invoke({"input": query, "examples": example_messages}).content


generated_pairs = []
for i in range(1):
    generated_pairs.append(create_example())
    print(generated_pairs[-1])


Input: {"n\u00e1jem": 28000, "kauce": null, "slu\u017eby": 6000, "v\u010detn\u011b_energi\u00ed": false, "poplatek_realitn\u00ed_kancel\u00e1\u0159i": false}
Output: Záloha na společné služby, vytápění, vodu a údržbu garáže: 6.000 Kč/měs. Elektřina se platí zvlášť.


Input: {"n\u00e1jem": 25000, "kauce": null, "slu\u017eby": 4500, "v\u010detn\u011b_energi\u00ed": false, "poplatek_realitn\u00ed_kancel\u00e1\u0159i": false}
Output: Zálohy na poplatky za společné domovní služby, vodu a vytápění 4.500 Kč/měs. pro 2 osoby. Elektřina se hradí zvlášť.


Input: {"n\u00e1jem": 34900, "kauce": "2*rent", "slu\u017eby": 6750, "v\u010detn\u011b_energi\u00ed": false, "poplatek_realitn\u00ed_kancel\u00e1\u0159i": true}
Output: Poplatky fix 900 Kč/os./měs. + zál. voda 350 Kč/os./měs. + zál. plyn 5 500 Kč/byt/měs. + převod elektřiny. Kauce 2 nájmy. RK provize 1 nájem, neplatít

('{"n\\u00e1jem": 28000, "kauce": 24500, "slu\\u017eby": 8500, "v\\u010detn\\u011b_energi\\u00ed": true, "poplatek_realitn\\u0

In [43]:
generated_pairs[0]

('{"n\\u00e1jem": 28000, "kauce": 24500, "slu\\u017eby": 8500, "v\\u010detn\\u011b_energi\\u00ed": true, "poplatek_realitn\\u00ed_kancel\\u00e1\\u0159i": false}',
 'Nájem: 28.000 Kč/měs., kauce: 24.500 Kč. Záloha na služby včetně energií: 8.500 Kč/měs. Poplatek realitní kanceláři neplatíte.')

In [47]:
tunning_messages = [
    {"messages": [
        {
            "role": "user",
            "content": m[1]
        },
        {
            "role": "assistant",
            "function_call": {
                "name": "parametry_nemovitosti",
                "arguments": json.dumps(json.loads(m[0]))
            }
        }
    ],
    "functions": [
        {
        "name": "parametry_nemovitosti",
        "parameters": RealEstateParameters.schema()
        }
    ]

}
for m in generated_pairs]

print(json.dumps(tunning_messages[0]))

with open('generated_lines.jsonl', 'w') as file:
    for pair in tunning_messages:
        file.write(json.dumps(pair) + '\n')


{"messages": [{"role": "user", "content": "N\u00e1jem: 28.000 K\u010d/m\u011bs., kauce: 24.500 K\u010d. Z\u00e1loha na slu\u017eby v\u010detn\u011b energi\u00ed: 8.500 K\u010d/m\u011bs. Poplatek realitn\u00ed kancel\u00e1\u0159i neplat\u00edte."}, {"role": "assistant", "function_call": {"name": "parametry_nemovitosti", "arguments": "{\"n\\u00e1jem\": 28000, \"kauce\": 24500, \"slu\\u017eby\": 8500, \"v\\u010detn\\u011b_energi\\u00ed\": true, \"poplatek_realitn\\u00ed_kancel\\u00e1\\u0159i\": false}"}}], "functions": [{"name": "parametry_nemovitosti", "parameters": {"title": "ParametryNemovitosti", "type": "object", "properties": {"n\u00e1jem": {"title": "N\u00e1jem", "description": "The monthly rent (N\u00e1jem) in CZK. If not specified, return null.", "type": "integer"}, "kauce": {"title": "Kauce", "description": "The deposit (Kauce) in terms of multiple of rent. If not specified, return null.", "type": "number"}, "slu\u017eby": {"title": "Slu\u017eby", "description": "The monthly ser

AIMessage(content='Pronájem bytu za 24 500 Kč/měsíc včetně služeb ve výši 14 500 Kč/měsíc. Kauce ve výši 21 000 Kč. Pro více informací nás kontaktujte.', response_metadata={'finish_reason': 'stop', 'logprobs': None})

In [45]:
runnable = prompt | llm.with_structured_output(schema=RealEstateParameters)

In [46]:
from langchain.callbacks import get_openai_callback
text = "+ 8.000,- Kč zálohy na služby a energie + kauce + provize RK"

with get_openai_callback() as cb:
    print(runnable.invoke({"text": text, "examples":messages}))
    print(cb.total_cost)

nájem=None kauce=2.0 služby=8000 včetně_energií=True poplatek_realitní_kanceláři=True
0.0010665


In [19]:
cb.total_cost

0.00558

ModuleNotFoundError: No module named 'sklearn'