In [None]:
from openai import OpenAI
import os
import json
from redis import Redis
from redis.commands.search.query import Query
from redis.commands.search.field import (
    TextField,
    VectorField
)
from IPython.display import clear_output, display

# Constants
VECTOR_DIM = 1024                         # dimensionality of the embedding vectors stored in the index
DISTANCE_METRIC = "COSINE"                # distance metric for the vectors (ex. COSINE, IP, L2)
INDEX_NAME = "AliyunQA"                   # RediSearch index name; also used as the prefix of stored keys

# Redis client with default settings.
r = Redis()

# Prefer the DASHSCOPE_API_KEY environment variable so the real key never has to be
# saved inside the notebook. The placeholder is only a fallback: if the environment
# variable is not configured, replace the placeholder with your API key.
API_KEY = os.getenv("DASHSCOPE_API_KEY", "#此处填写你的APIKEY")
client = OpenAI(
    api_key=API_KEY,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"  # base_url of the Bailian (DashScope) service
)

In [3]:
def deduplicate(data, key):
    """Return the items of ``data`` without duplicates, judged by ``item[key]``.

    For every distinct ``item[key]`` value only the first item carrying it is kept;
    the kept items appear in the same relative order as in ``data``. The input
    sequence itself is not modified.
    """
    first_by_value = {}
    for entry in data:
        # setdefault stores the entry only when this value has not been seen before.
        first_by_value.setdefault(entry[key], entry)
    return list(first_by_value.values())

# Load the scraped pages. The file contains Chinese text, so decode it explicitly as
# UTF-8 instead of relying on the platform's default encoding.
with open('scrapy-prj-aliyunecs/qa.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Report how many records exist before and after removing duplicate URLs.
print(f"{len(data)} -> ", end='')

data = deduplicate(data, 'url')

print(len(data))

# Preview the first records.
data[:20]

335 -> 176


[{'url': 'https://help.aliyun.com/zh/ecs/support/troubleshooting-1/',
  'content': '<section class="aliyun-docs-view" id="aliyun-docs-view">\n<div class="product-detail">\n<section class="aliyun-docs-content">\n<header class="aliyun-docs-view-header">\n<div class="header-topbar">\n<div class="header-topbar-breadcrumb">\n <a href="https://help.aliyun.com/">首页</a>\n <span><a href="/zh/ecs/">云服务器 ECS</a></span>\n <span><a href="/zh/ecs/support/">服务支持</a></span>\n <span>故障排除</span>\n </div>\n</div>\n<div class="header-title">\n<h1>故障排除</h1>\n<div class="header-actionbar">\n<span class="update-time">更新时间: 2023-02-20 11:11:00</span>\n</div>\n</div>\n</header>\n<div class="pc-markdown-container unionContainer" id="pc-markdown-container">\n<!-- 内容 -->\n<div class="markdown-body">\n <div class="directory">\n<ul>\n <li><a href="/zh/ecs/support/examples-of-remote-connection-problems/">实例远程连接问题</a></li>\n <li><a href="/zh/ecs/support/ecs-instances-within-the-website-slow-or-cannot-access-problems/

In [None]:
# Requires `pip install dashscope` before running.
from dashscope import get_tokenizer

# Tokenizer used to report token counts; currently only Qwen-series models are supported.
tokenizer = get_tokenizer('qwen-turbo')
processed = 0
MAX_RETRIES = 3

# Convert every page's HTML into Markdown with the chat model and store it in item['md'].
for item in data:
    n_tokens = len(tokenizer.encode(item['content']))
    print(n_tokens)

    messages = [{
        "role": "user",
        "content": "将如下html中提取文本转为md格式，注意保留换行和编号，直接输出转换后的md文本，不要多说其他的：" + item['content'],
    }]

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            completion = client.chat.completions.create(
                model="qwen-turbo",  # model list: https://help.aliyun.com/zh/model-studio/getting-started/models
                messages=messages,
                temperature=0,
            )
            md_text = completion.choices[0].message.content
            m_tokens = len(tokenizer.encode(md_text))
            # Only record the result once the whole attempt has succeeded.
            item['md'] = md_text
            print(" -> " + str(m_tokens))
            print(md_text[:50])
            processed += 1
            break  # success: leave the retry loop
        except Exception as e:
            print("Exception: " + str(e))
            if attempt == MAX_RETRIES:
                print("Failed to process: " + item['url'])
                # Guarantee the 'md' key exists so later cells can skip this item
                # instead of raising KeyError; keep any value from an earlier run.
                item.setdefault('md', '')
            else:
                print("Retry: " + str(attempt))

print("Processed: " + str(processed))

In [5]:
import numpy as np
try:
    # Module name used by newer redis-py releases.
    from redis.commands.search.index_definition import IndexDefinition, IndexType
except ImportError:
    # Module name used by older redis-py releases.
    from redis.commands.search.indexDefinition import IndexDefinition, IndexType

# Create search index

# Every document of this notebook is stored under this key prefix. Restricting the
# index to the prefix keeps it from covering unrelated hashes in the same database.
KEY_PREFIX = f"{INDEX_NAME}:"

# Character budget for the embedding input: the model accepts at most 8192 tokens,
# assuming ~2 chars per token as general guidance.
MAX_EMBED_CHARS = 8192 * 2

# define RediSearch vector fields to use FLAT index
md_embedding = VectorField("md_embedding",
    "FLAT", {
        "TYPE": "FLOAT32",
        "DIM": VECTOR_DIM,
        "DISTANCE_METRIC": DISTANCE_METRIC
    }
)

# Define RediSearch fields for each of the columns in the dataset
# This is where you should add any additional metadata you want to capture
url = TextField("url")
content = TextField("content")
md = TextField("md")

fields = [url, content, md, md_embedding]

try:
    r.ft(INDEX_NAME).create_index(
        fields=fields,
        definition=IndexDefinition(prefix=[KEY_PREFIX], index_type=IndexType.HASH),
    )
except Exception as e:
    # Best effort: creation fails when the index already exists. NOTE: an existing
    # index keeps its old definition until it is dropped and created again.
    print(e)
print(r.ft(INDEX_NAME).info())

for item in data:
    # Skip items without Markdown (missing key, None or empty string), e.g. pages
    # whose conversion failed.
    md_text = item.get('md')
    if not md_text:
        print(f"Skipping empty content for URL: {item['url']}")
        continue

    truncated_content = md_text[:MAX_EMBED_CHARS]
    try:
        # Create embedding with text-embedding model of aliyun; the requested size
        # must match the DIM of the vector field, so both use VECTOR_DIM.
        embedding_response = client.embeddings.create(input=truncated_content, model="text-embedding-v3", dimensions=VECTOR_DIM, encoding_format="float")
    except Exception as e:
        print(f"Error processing URL: {item['url']}")
        print(f"Content length: {len(md_text)} characters")
        print(f"Error details: {str(e)}")
        continue
    # Prepare embedding vector for RediSearch: raw float32 bytes, matching TYPE FLOAT32.
    embedding_bytes = np.array(embedding_response.data[0].embedding, dtype=np.float32).tobytes()
    key = f"{KEY_PREFIX}{item['url']}"
    r.hset(key, mapping={'md_embedding': embedding_bytes, 'md': md_text, 'url': item['url'], 'content': item['content']})

Index already exists
{'index_name': 'AliyunQA', 'index_options': [], 'index_definition': [b'key_type', b'HASH', b'prefixes', [b''], b'default_score', b'1'], 'attributes': [[b'identifier', b'url', b'attribute', b'url', b'type', b'TEXT', b'WEIGHT', b'1'], [b'identifier', b'content', b'attribute', b'content', b'type', b'TEXT', b'WEIGHT', b'1'], [b'identifier', b'md', b'attribute', b'md', b'type', b'TEXT', b'WEIGHT', b'1'], [b'identifier', b'md_embedding', b'attribute', b'md_embedding', b'type', b'VECTOR', b'algorithm', b'FLAT', b'data_type', b'FLOAT32', b'dim', 1024, b'distance_metric', b'COSINE']], 'num_docs': 175, 'max_doc_id': 175, 'num_terms': 36485, 'num_records': 70381, 'inverted_sz_mb': '4.107590675354004', 'vector_index_sz_mb': '4.015533447265625', 'total_inverted_index_blocks': 36587, 'offset_vectors_sz_mb': '0.35366153717041016', 'doc_table_size_mb': '0.03444099426269531', 'sortable_values_size_mb': '0', 'key_table_size_mb': '0.0074672698974609375', 'tag_overhead_sz_mb': '0', 't

In [7]:
# Search
# Question to answer. The commented-out assignments below are alternative sample
# questions; uncomment one of them to try a different case.
user_question = "HTTP错误500.19 - Internal Server Error可能是什么原因？如何解决？"
# (VNC connects but only shows a black screen; how to fix it?)
# user_question = "VNC能连进去，但是只有黑屏，怎么解决？"
# (VNC connects but ssh login fails; firewall and security group were checked, port 22 is open)
# user_question = "VNC能连进去，但是ssh登不上，防火墙、安全组配置已检查，22端口是放开的"
# (switching user with the su command fails with "Permission Denied")
# user_question = "su命令切换用户失败，提示Permission Denied"

# Helper functions
def _strip_code_fence(text: str) -> str:
    """Remove a surrounding Markdown code fence (``` or ```json) from ``text``, if present."""
    stripped = text.strip()
    if stripped.startswith("```"):
        # Drop the opening fence line (it may carry a language tag such as "json").
        newline_at = stripped.find("\n")
        stripped = stripped[newline_at + 1:] if newline_at != -1 else stripped[3:]
        stripped = stripped.rstrip()
        if stripped.endswith("```"):
            stripped = stripped[:-3]
    return stripped.strip()


def json_gpt(input: str):
    """Send ``input`` to the chat model and return its reply parsed as JSON.

    Args:
        input: The user prompt; it should describe the JSON format expected back.

    Returns:
        The Python object decoded from the model's reply (e.g. a dict).

    Raises:
        json.JSONDecodeError: If the reply is empty or is not valid JSON even after
            a surrounding Markdown code fence has been removed.
    """
    completion = client.chat.completions.create(
        model="qwen-plus",
        messages=[
            {"role": "system", "content": "Output only valid JSON"},
            {"role": "user", "content": input},
        ],
        temperature=0.2,
    )

    # Chat models sometimes wrap JSON in a ```json ... ``` fence despite the system
    # prompt; strip it so such replies still parse.
    text = completion.choices[0].message.content or ""
    return json.loads(_strip_code_fence(text))


# Prompt asking the model to condense the user's question into a keyword search query,
# returned as JSON of the form {"searchQuery": "..."}.
HA_INPUT = f"""
You are an Aliyun operations assistant
You have access to a search API that returns troubleshooting articles.
Generate search query extracting key words from the user's question.

User question: {user_question}

Format: {{"searchQuery": "search query"}}
"""
query_str = json_gpt(HA_INPUT)["searchQuery"]
print(query_str)

# Embed the search query with the same model and dimension used for the stored documents.
query_embedding = client.embeddings.create(input=query_str, model="text-embedding-v3", dimensions=VECTOR_DIM, encoding_format="float")
query_vec = np.array(query_embedding.data[0].embedding, dtype=np.float32).tobytes()

# Prepare the query: the TOP_K nearest neighbours of the query vector, sorted by distance score.
TOP_K = 2
query_base = (
    Query(f"*=>[KNN {TOP_K} @md_embedding $vec as score]")
    .sort_by("score")
    .return_fields("score", "url", "md")
    .dialect(2)
)
query_param = {"vec": query_vec}
query_results = r.ft(INDEX_NAME).search(query_base, query_param).docs

# Fewer than TOP_K documents may come back (e.g. a nearly empty index), so iterate
# over what was returned instead of indexing fixed positions.
if not query_results:
    print("No matching documents found")
print("\n".join(doc.md[:50] for doc in query_results))
result_md = "\n\n".join(doc.md for doc in query_results)

HTTP 500.19 内部服务器错误 原因 解决方法
# IIS Web网站访问故障

## 访问部署在Windows实例上的基于IIS搭建的Web网站时
```
# 访问Windows实例上使用IIS部署的网站时，出现“500.19-xxx中不允许绝对物


In [None]:
# System prompt (Chinese): act as an Aliyun operations assistant, answer from the
# search results first rather than from prior knowledge, and pass on concrete
# operating steps when the search results contain them.
system_prompt = "你是一个阿里云运维助手。请根据搜索结果回答用户提问，注意，请务必首先依赖搜索结果，而不是你自己已有的知识。如果搜索结果中包含了具体操作步骤，也请据此给用户具体操作指引。"
messages = [{"role": "system", "content": system_prompt},
            # Send the user's original question: the generated keyword query
            # (query_str) is only meant for retrieval and drops part of the wording.
            {"role": "user", "content": "用户提问：" + user_question},
            {"role": "user", "content": "搜索结果：" + result_md}]
response = client.chat.completions.create(
                    messages=messages,
                    # NOTE(review): the DashScope model id is believed to use a hyphen
                    # ("deepseek-r1"); confirm against the model list. "qwen-max" is
                    # an alternative.
                    model="deepseek-r1",
                    max_tokens=1000
                )
print(response.choices[0].message.content)
# List the source URL of every retrieved document (there may be fewer than two).
reference_urls = "\n\n".join(doc.url for doc in query_results)
print(f"\n参考文档：\n\n{reference_urls}")

根据搜索结果，HTTP 500.19 内部服务器错误通常是因为 `web.config` 文件中存在格式不正确或无法识别的 XML 元素导致。要解决这个问题，请按照以下步骤操作：

### 解决方案

1. **远程连接到 Windows 实例**：
   - 您可以通过多种方式远程连接到您的 Windows 实例，具体方法请参见[连接方式概述](/zh/ecs/user-guide/connection-methods)。

2. **打开服务器管理器**：
   - 在桌面左下角点击 <img src="https://help-static-aliyun-doc.aliyuncs.com/assets/img/zh-CN/4605256761/p562603.jpg" alt="开始图标" width="25">，选择“服务器管理器”。

3. **打开网站的物理目录**：
   1. 在“服务器管理器”页面，选择“工具” > “Internet Information Services (IIS) 管理器”。
   2. 在 IIS 管理器左侧导航栏找到目标网站，然后在右侧的操作区域单击“浏览”，进入该网站的物理目录。

4. **编辑 web.config 文件**：
   - 打开站点目录下的 `web.config` 配置文件。
   - 删除配置文件中的 `<httpErrors>` 节点内容，示例如下：
     ```xml
     <httpErrors>
         <remove statusCode="500" subStatusCode="-1" />
     </httpErrors>
     ```
   - 保存并关闭 `web.config` 文件。

5. **重启 IIS**：
   1. 在桌面左下角点击 <img src="https://help-static-aliyun-doc.aliyuncs.com/assets/img/zh-CN/8101605761/p553311.jpg" alt="开始图标" width="25">，在搜索框输入 `iisreset`。
   2. 在最佳匹配区域，点击 `iisreset`。
   3. 当 IIS 重启程序运行完毕后，表示 IIS 重启成功。

6. *

In [23]:
#print(result_md)
# DANGER: delete_documents=True drops the INDEX_NAME index AND deletes the documents
# it indexes. The original author observed that it also appeared to wipe data
# belonging to other indexes. Presumably this happens when the index was created
# without a key prefix (the info() output recorded earlier in this notebook shows
# 'prefixes': [b'']), because such an index covers every hash in the database —
# verify on a throwaway Redis instance before running this cell.
r.ft(INDEX_NAME).dropindex(delete_documents=True)

b'OK'