In [1]:
import os

# 定义保存数据的列表
data_list = []

# 设置目录路径
directory = 'kmeans_clustered_sentences'

# 遍历目录下的每个文件
for filename in os.listdir(directory):
    if filename.endswith(".txt"):  # 确保只读取 .txt 文件
        class_name = filename.replace(".txt", "")  # 获取文件名作为 class 值
        file_path = os.path.join(directory, filename)
        
        # 读取文件的每一行
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # 组成字典，包含 data 和 class 两个字段
                data_dict = {
                    "data": line.strip(),  # 去掉行末的换行符
                    "class": class_name
                }
                data_list.append(data_dict)  # 将字典添加到列表中

# 打印部分数据，验证结果
for item in data_list[:5]:  # 只显示前5条
    print(item)


{'data': 'How was the professor Nancy Carlin for TA 10 and Queer Art with lark alder', 'class': 'teacher_query'}
{'data': 'who created rumibot', 'class': 'teacher_query'}
{'data': 'who are the professors for world history and us history', 'class': 'teacher_query'}
{'data': 'Who is the horniest person on campus', 'class': 'teacher_query'}
{'data': 'best general education teachers for required classes?', 'class': 'teacher_query'}


In [2]:
import os
import tiktoken
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode, NodeWithScore
from llama_index.core.retrievers import BaseRetriever
from tqdm import tqdm
from typing import Any, List
from llama_index.core import (
    QueryBundle,
    PromptTemplate,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)

# loads BAAI/bge-small-en-v1.5
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
# 分词器
tokenizer = tiktoken.get_encoding("cl100k_base")

# 全局设
Settings.embed_model = embed_model
Settings.tokenizer = tokenizer
Settings.llm = None

  from .autonotebook import tqdm as notebook_tqdm


LLM is explicitly disabled. Using MockLLM.


In [3]:
from llama_index.core.schema import TextNode

node_list=[]

for item in data_list:
    text = item['data']
    node = TextNode(text=text, metadata={'class':item['class']})
    node_list.append(node)

len(node_list)

790

In [4]:
# check if storage already exists
PERSIST_DIR = "./baseline_index"
if not os.path.exists(PERSIST_DIR):
    # load the documents and create the index
    index = VectorStoreIndex(node_list)
    # store it for later
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # load the existing index
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

In [8]:
query = "I don't have enough credits. Can you help me find out which courses in the school are worth 3 credits?"
vector_retriever = index.as_retriever(similarity_top_k=5)
nodes = vector_retriever.retrieve(query)

nodes

[NodeWithScore(node=TextNode(id_='896646fd-8d14-4a96-b059-5c7f19cccb6f', embedding=None, metadata={'class': 'school_cost'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='how long do I have before my credits expire?', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.6362690899422986),
 NodeWithScore(node=TextNode(id_='4af043d6-341d-465b-8fcd-cffe32909fc6', embedding=None, metadata={'class': 'class_select'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='what classes do I need to complete as a transfer student studying finance and minoring in marketing', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.6232114639331159),
 NodeWithScore(node=TextNode(id_

In [14]:
# 用于累加每个 class 的 score
class_scores = {}

# 累加每个 class 的 score
for node in nodes:
    class_name = node.metadata['class']  # 获取 class 信息
    score = node.score  # 获取该节点的 score
    
    # 如果该 class 已存在于字典中，累加 score，否则初始化
    if class_name in class_scores:
        class_scores[class_name] += score
    else:
        class_scores[class_name] = score

# 找到总分最高的 class
max_class = max(class_scores, key=class_scores.get)
max_score = class_scores[max_class]

# 输出总分最高的 class 及其得分
print(f"得分最高的 class: {max_class}，总得分: {max_score}")

得分最高的 class: class_select，总得分: 2.4863932691104367
