ollama 异步协程的API调用

In [23]:
from typing import List
import asyncio
from tqdm.asyncio import tqdm_asyncio
from langchain.schema import SystemMessage, HumanMessage

In [None]:
!pip install langchain-community

In [24]:
from langchain_community.llms import Ollama

In [25]:
# Model handle shared by the cells below (sync `invoke` and async `ainvoke`).
# The 7B variant is kept as a commented-out alternative to the 3B model in use.
# llm = Ollama(model="qwen2.5:7b")
llm = Ollama(model="qwen2.5:3b")

In [28]:
# Smoke test: send a single prompt with the blocking API and display the reply.
llm.invoke("Hello, who are you?")

'Hello! I am Qwen, designed by Alibaba Cloud to assist and engage with users like you. How can I help you today?'

In [29]:
# Four sample prompts (English greeting, Chinese greeting, verse continuation,
# arithmetic); repeated 10 times below to form a 40-item workload.
_BASE_QUERIES = [
    "Hello, who are you?",
    "你好，请问你是谁？",
    "续写：练的身形似鹤形，千株松下两函经。",
    "计算：10 + 33 x 2 = ?",
]
queries = _BASE_QUERIES * 10

In [None]:
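# # Initialize the model (alternative: ChatOpenAI against an OpenAI-compatible endpoint)
# # NOTE(review): port 11434 looks like a local Ollama server, which normally serves plain http — confirm the https scheme
# llm = ChatOpenAI(
#     model_name="qwen2.5:7b",
#     # model_name="deepseek-reasoner",
#     # openai_api_key=api_key,
#     openai_api_base="https://127.0.0.1:11434/v1",
# )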

In [None]:
# results = await tqdm_asyncio.gather(*tasks)

In [30]:
async def call_llm_async(query: str) -> str:
    """Send one user query to the shared ``llm`` and return its reply.

    The query is sent after a fixed system prompt, and whatever
    ``llm.ainvoke`` yields is returned unchanged.
    """
    system_prompt = SystemMessage(content="You are a helpful assistant")
    user_prompt = HumanMessage(content=query)
    # NOTE: an earlier draft returned ``.content`` (presumably for a chat
    # model); the result is returned as-is here.
    return await llm.ainvoke([system_prompt, user_prompt])


async def batch_call_llm(queries: List[str], concurrency: int = 5) -> List[str]:
    """Run ``call_llm_async`` over every query with bounded concurrency.

    Args:
        queries: Prompts to send; one request is issued per entry.
        concurrency: Maximum number of requests in flight at once (>= 1).

    Returns:
        The values returned by ``tqdm_asyncio.gather`` for the per-query
        tasks, one per entry in ``queries``.

    Raises:
        ValueError: If ``concurrency`` is less than 1.
    """
    # Semaphore(0) would make every task wait forever, so fail fast instead
    # of hanging. Negative values already raise ValueError in Semaphore.
    if concurrency < 1:
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")

    semaphore = asyncio.Semaphore(concurrency)

    async def limited_call(query: str) -> str:
        # Hold a semaphore slot for the full duration of the request.
        async with semaphore:
            return await call_llm_async(query)

    tasks = [limited_call(query) for query in queries]
    # tqdm's gather is used instead of asyncio.gather to show a progress bar.
    return await tqdm_asyncio.gather(*tasks)

In [31]:
# In a plain Python script, drive the coroutine with asyncio.run:
# responses = asyncio.run(batch_call_llm(queries, concurrency=10))

# In Jupyter, await the coroutine directly at the top level of the cell.
response = await batch_call_llm(queries, concurrency=3)

100%|██████████| 40/40 [00:58<00:00,  1.45s/it]


In [None]:
# Display the replies collected by the async batch run above.
response

In [32]:
from tqdm import tqdm

# Sequential baseline: send the same queries one at a time with the blocking
# API, for comparison against the async batch run.
res = []
for prompt in tqdm(queries):
    reply = llm.invoke(prompt)
    res.append(reply)

100%|██████████| 40/40 [01:37<00:00,  2.43s/it]


In [None]:
# NOTE(review): `seed_res` is not defined in any visible cell; this was probably meant to be `res` — confirm.
seed_res