In [None]:
 
import requests
import time
from datetime import datetime
import pytz
 
def get_bj_time():
    beijing_tz = pytz.timezone('Asia/Shanghai')
    return datetime.now(beijing_tz).strftime("%Y-%m-%d %H:%M:%S")
 
while True:
 
    data = {"model": "llamafamily/llama3-chinese-8b-instruct", "keep_alive": "5m"}
    headers = {'Content-Type': 'application/json'}
    high_precision_time = time.perf_counter()
    response = requests.post('http://localhost:11434/api/generate', json=data, headers=headers)
    high_precision_time_end = time.perf_counter()
    time1 = high_precision_time_end-high_precision_time
    print(f"高精度時間（精确到微秒）: {time1*1000:.6f}")
    jsonResponse = response.content.decode('utf-8')  # 将 bytes 转换为字符串以便打印
    print(jsonResponse)
    print(f"當前時間：{get_bj_time()}")
    time.sleep(280)  # 暂停280秒后再次执行
 
    # '''
    # 7b初次加载模型时间：3.867187177s， 第二次加载模型时间：0.766666ms
    # 14b初次加载模型时间：5.180146173s , 第二次加载模型时间：0.753414ms
    # 72b初次加载模型时间：16.991763358s，第二次加载模型时间：1.358505ms

In [2]:
from langchain_community.llms import Ollama

model_name = 'llama3.1:8b'

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use five sentences maximum and keep the "
    "answer concise. Answer in Chinese."
    "\n\n"
    "{context}"
)

llm = Ollama(model=model_name, temperature=0, top_p=0.3)

In [4]:
import psutil
import GPUtil
import platform
import time
import subprocess

# 获取 GPU 名称（只保留最后部分，例如 "3090"）
gpus = GPUtil.getGPUs()
gpu_name = gpus[0].name.split()[-1] if gpus else "Unknown_GPU"

# 获取 CPU 型号
cmd = 'wmic cpu get Name'
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
output = proc.communicate()[0].decode('utf-8').strip()

# 分割输出并去掉可能存在的多余空白行
lines = [line.strip() for line in output.splitlines() if line.strip()]
cpu_name_line = lines[1] if len(lines) > 1 else "Unknown_CPU"

# 初始化变量
cpu_name = "Unknown_CPU"
cpu_model = "Unknown_Model"

# 提取 CPU 型号中的关键部分，例如 "i7" 和 "12700"
cpu_name_parts = cpu_name_line.split()
for part in cpu_name_parts:
    if "i7" in part or "i9" in part or "i5" in part:
        cpu_name = part.split('-')[0]  # 只保留 "i7"
        cpu_model = part.split('-')[-1]  # 只保留 "12700"
        break

# 如果 CPU 型号没有直接分割到，尝试在其他部分提取
if cpu_model == "Unknown_Model":
    for part in cpu_name_parts:
        if any(char.isdigit() for char in part):
            cpu_model = part
            break

cpu_name = f"{cpu_name}_{cpu_model}"

# 自动生成文件名
report_file_name = f"ollama_test_report_{gpu_name}_{cpu_name}.txt"
history_file_name = f"ollama_test_history_{gpu_name}_{cpu_name}.txt"

# 打印文件名以验证
print(f"Report file: {report_file_name}")
print(f"History file: {history_file_name}")


# Initialize lists to store results
execution_times = []
cpu_usages = []
memory_usages = []
gpu_usages_list = []
gpu_memory_usages_list = []
responses = []

# Define the number of iterations
num_iterations = 10

# Open the history file to append data
with open(history_file_name, "w") as history_file:
    history_file.write(f"file_name: {history_file_name}")
    for i in range(num_iterations):
        # Start measuring
        process = psutil.Process()
        gpus = GPUtil.getGPUs()
        start_time = time.time()

        cpu_start = process.cpu_percent(interval=None)
        memory_start = process.memory_info().rss

        # Your code here
        question = "于某，男，62岁。患冠心病两年，服西药治疗，一日三次，从未有断，然胸憋心悸，一直不止。近月余，每至夜则咳嗽哮喘，痰涎清稀如水，倚息不能平卧，胸憋心悸尤甚。白昼则症状减轻。询知腰脊酸困，背畏风寒，时眩晕，手足心微热，口渴欲饮，但不多饮，亦不思冷，纳便尚可，舌尖略红，苔白腻，脉沉缓。给出中医诊断和处方建议"
        response = llm.invoke(question)

        # End measuring
        cpu_end = process.cpu_percent(interval=None)
        memory_end = process.memory_info().rss
        end_time = time.time()

        # Calculate metrics
        cpu_usage = cpu_end - cpu_start
        memory_usage = (memory_end - memory_start) / (1024 * 1024)  # Convert to MB
        execution_time = end_time - start_time

        gpu_usages = [gpu.load * 100 for gpu in gpus]
        gpu_memory_usages = [gpu.memoryUsed for gpu in gpus]

        # Store results
        execution_times.append(execution_time)
        cpu_usages.append(cpu_usage)
        memory_usages.append(memory_usage)
        gpu_usages_list.append(gpu_usages)
        gpu_memory_usages_list.append(gpu_memory_usages)
        responses.append(response)

        # Write each iteration's result to the history file
        history_file.write(f">>> Iteration {i+1} <<<\n")
        history_file.write(f"Execution Time  : {execution_times[i]:.2f} seconds\n")
        history_file.write(f"CPU Usage       : {cpu_usages[i]:.2f} %\n")
        history_file.write(f"Memory Usage    : {memory_usages[i]:.2f} MB\n")
        history_file.write(f"GPU Usage       : {gpu_usages_list[i]} %\n")
        history_file.write(f"GPU Memory Usage: {gpu_memory_usages_list[i]} MB\n")
        history_file.write(f"LLM Response    : {responses[i]}\n")
        history_file.write("-" * 40 + "\n\n")

        # Print progress
        print(f"Iteration {i+1}/{num_iterations} completed.")

print(f"History output saved to {history_file_name}")
# Calculate statistics for the report
average_execution_time = sum(execution_times) / num_iterations
average_cpu_usage = sum(cpu_usages) / num_iterations
average_memory_usage = sum(memory_usages) / num_iterations
average_gpu_usages = [sum(gpu_usages) / num_iterations for gpu_usages in zip(*gpu_usages_list)]
average_gpu_memory_usages = [sum(gpu_mem) / num_iterations for gpu_mem in zip(*gpu_memory_usages_list)]

max_execution_time = max(execution_times)
max_index = execution_times.index(max_execution_time)

min_execution_time = min(execution_times)
min_index = execution_times.index(min_execution_time)


# Open the report file to write the summary
with open(report_file_name, "w") as report_file:
    # Write average results
    report_file.write("=== Performance Measurement Report ===\n\n")
    report_file.write(f"file_name: {report_file_name}\n")
    report_file.write(f"model_name: {model_name}\n")
    report_file.write(f"question: {question}\n\n")
    
    report_file.write(">>> Average Results <<<\n")
    report_file.write(f"Average Execution Time  : {average_execution_time:.2f} seconds\n")
    report_file.write(f"Average CPU Usage       : {average_cpu_usage:.2f} %\n")
    report_file.write(f"Average Memory Usage    : {average_memory_usage:.2f} MB\n")
    report_file.write(f"Average GPU Usage       : {average_gpu_usages} %\n")
    report_file.write(f"Average GPU Memory Usage: {average_gpu_memory_usages} MB\n")
    report_file.write("-" * 40 + "\n\n")

    # Write max results
    report_file.write(">>> Maximum Execution Time <<<\n")
    report_file.write(f"Execution Time  : {execution_times[max_index]:.2f} seconds (Iteration {max_index + 1})\n")
    report_file.write(f"CPU Usage       : {cpu_usages[max_index]:.2f} %\n")
    report_file.write(f"Memory Usage    : {memory_usages[max_index]:.2f} MB\n")
    report_file.write(f"GPU Usage       : {gpu_usages_list[max_index]} %\n")
    report_file.write(f"GPU Memory Usage: {gpu_memory_usages_list[max_index]} MB\n")
    # report_file.write(f"LLM Response    : {responses[max_index]}\n")
    report_file.write("-" * 40 + "\n\n")

    # Write min results
    report_file.write(">>> Minimum Execution Time <<<\n")
    report_file.write(f"Execution Time  : {execution_times[min_index]:.2f} seconds (Iteration {min_index + 1})\n")
    report_file.write(f"CPU Usage       : {cpu_usages[min_index]:.2f} %\n")
    report_file.write(f"Memory Usage    : {memory_usages[min_index]:.2f} MB\n")
    report_file.write(f"GPU Usage       : {gpu_usages_list[min_index]} %\n")
    report_file.write(f"GPU Memory Usage: {gpu_memory_usages_list[min_index]} MB\n")
    # report_file.write(f"LLM Response    : {responses[min_index]}\n")
    report_file.write("-" * 40 + "\n\n")

    report_file.write("=== End of Report ===\n")

print(f"Report output saved to {report_file_name}")

Report file: ollama_test_report_4090_i9_14900.txt
History file: ollama_test_history_4090_i9_14900.txt
Iteration 1/10 completed.
Iteration 2/10 completed.
Iteration 3/10 completed.
Iteration 4/10 completed.
Iteration 5/10 completed.
Iteration 6/10 completed.
Iteration 7/10 completed.
Iteration 8/10 completed.
Iteration 9/10 completed.
Iteration 10/10 completed.
History output saved to ollama_test_history_4090_i9_14900.txt
Report output saved to ollama_test_report_4090_i9_14900.txt


In [5]:
import ollama
response = ollama.chat(model='llama3.1', messages=[
  {
    'role': 'user',
    'content': 'Why is the sky blue?',
  },
])
print(response['message']['content'])

ConnectError: [WinError 10049] 內容中所要求的位址不正確。