# Machine Learning Knowledge Base System

This notebook combines five Python scripts into a complete system for:
1. Processing a corpus file
2. Vectorizing text chunks
3. Creating and querying a vector database
4. Running inference with a language model
5. A complete knowledge base Q&A system

## 1. Corpus Processing

This cell reads and splits the corpus file into chunks.

In [None]:
# 1.语料库.py
# Read the whole corpus as UTF-8; chunks are separated by blank lines.
with open('语料库.txt', 'r', encoding='utf-8') as fp:
    data = fp.read()

# Trim every chunk and drop the ones that are empty after trimming. A plain
# truthiness test would keep whitespace-only pieces, e.g. the "\n" left over
# when the file ends with extra newlines.
chunk_list = [chunk.strip() for chunk in data.split("\n\n") if chunk.strip()]
print(f"Found {len(chunk_list)} chunks in the corpus.")
print("First chunk:", chunk_list[0] if chunk_list else "No chunks found")

## 2. Text Vectorization

This cell demonstrates how to convert a text chunk into an embedding using Ollama's embeddings API; as an example, only the first corpus chunk is vectorized.

In [None]:
# 2.向量化.py
import requests
import functools

def file_chunk_list():
    """Read ``语料库.txt`` and return its non-blank chunks.

    The file is read as UTF-8 from the current working directory and split
    wherever two consecutive newlines occur. Each piece is stripped of
    surrounding whitespace, and pieces that are empty after stripping are
    discarded, so stray blank lines or trailing newlines never produce a
    blank chunk.

    Returns:
        list[str]: The chunks in file order; empty if the file has no text.

    Raises:
        OSError: If the corpus file cannot be opened.
    """
    with open('语料库.txt', 'r', encoding='utf-8') as fp:
        data = fp.read()

    # Strip first, then filter: a whitespace-only piece is truthy as-is.
    stripped = (chunk.strip() for chunk in data.split('\n\n'))
    return [chunk for chunk in stripped if chunk]

def ollama_api(text):
    """Return the embedding of *text* from the local Ollama server.

    Sends a POST request to the ``/api/embeddings`` endpoint on
    ``127.0.0.1:11434`` using the ``nomic-embed-text`` model.

    Args:
        text: The text to embed; sent as the request's ``prompt`` field.

    Returns:
        The value stored under ``'embedding'`` in the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        KeyError: If a successful response carries no ``'embedding'`` key.
    """
    res = requests.post(
        url="http://127.0.0.1:11434/api/embeddings",
        json={
            "model": "nomic-embed-text",
            "prompt": text,
        },
    )
    # Fail with the HTTP status instead of a misleading KeyError when the
    # server reports an error rather than returning an embedding.
    res.raise_for_status()
    return res.json()['embedding']

def run_vectorization():
    """Embed the first corpus chunk and print a short summary of the vector.

    Loads the chunks with ``file_chunk_list()``. When there are none, a
    notice is printed and nothing else happens. Otherwise the first chunk
    is previewed (cut to 50 characters plus ``...`` when longer), embedded
    through ``ollama_api``, and the vector's length and first five values
    are printed.
    """
    chunks = file_chunk_list()
    if not chunks:
        print("No chunks found in corpus.")
        return

    # Only the first chunk is vectorized; this cell is a demonstration.
    first = chunks[0]
    if len(first) > 50:
        preview = first[:50] + "..."
    else:
        preview = first
    print("Processing chunk:", preview)

    embedding = ollama_api(first)
    print(f"Vector length: {len(embedding)}")
    print(f"First 5 vector values: {embedding[:5]}")


# Run the demonstration as soon as the cell is executed.
run_vectorization()

## 3. Vector Database

This cell demonstrates creating and querying a ChromaDB vector database.

In [None]:
# 3.向量数据库.py
import chromadb
import uuid
import requests

def ollama_api(text):
    """Embed *text* with the ``nomic-embed-text`` model served by Ollama.

    Posts the text as ``prompt`` to
    ``http://127.0.0.1:11434/api/embeddings``.

    Args:
        text: The text to embed.

    Returns:
        The ``'embedding'`` entry of the JSON response.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
        KeyError: If a successful response has no ``'embedding'`` entry.
    """
    res = requests.post(
        url="http://127.0.0.1:11434/api/embeddings",
        json={
            "model": "nomic-embed-text",
            "prompt": text,
        },
    )
    # Report server-side failures by HTTP status; without this check they
    # would only show up as a KeyError on the lookup below.
    res.raise_for_status()
    return res.json()['embedding']

def run_vector_database():
    """Store six sample documents in ChromaDB and run one sample query.

    Opens (or creates) the persistent database at ``db/chroma_demo`` and
    the collection ``collection_v1``, stores the sample documents together
    with embeddings obtained from ``ollama_api``, then queries for the two
    nearest neighbours of "随机森林" and prints the raw result.
    """
    # Initialize ChromaDB
    client = chromadb.PersistentClient(path="db/chroma_demo")
    collection = client.get_or_create_collection("collection_v1")

    # Sample documents
    documents = ["线性回归", "随机森林", "卷积神经网络 (CNN)", "支持向量机 (SVM)", "K-近邻 (KNN)", "梯度提升树 (GBDT)"]
    # Derive each id from the document text and upsert, so re-running the
    # cell overwrites the same six records. Random ids combined with add()
    # stored a fresh copy of every document in the persistent collection on
    # each run. The namespace is arbitrary but must stay fixed.
    # NOTE(review): copies stored by earlier runs under random ids are not
    # removed by this change.
    ids = [str(uuid.uuid5(uuid.NAMESPACE_URL, text)) for text in documents]
    embeddings = [ollama_api(text) for text in documents]

    collection.upsert(
        ids=ids,
        documents=documents,
        embeddings=embeddings,
    )

    # Query example
    qs = "随机森林"
    qs_embeddings = ollama_api(qs)

    # Query by embedding only: ChromaDB documents a ValueError when both
    # query_embeddings and query_texts are supplied, and the stored vectors
    # come from the Ollama model, so the query vector must as well.
    res = collection.query(query_embeddings=[qs_embeddings], n_results=2)
    print("Query results:", res)


# Run the demonstration as soon as the cell is executed.
run_vector_database()

## 4. Inference Model

This cell demonstrates using Ollama's generate API for text generation.

In [None]:
# 4.推理模型.py
import requests

def run_inference():
    """Read one question from standard input and print the model's answer.

    The question is sent, without streaming, to the ``/api/generate``
    endpoint of the local Ollama server using the ``deepseek-r1:1.5b``
    model, and the ``'response'`` field of the JSON reply is printed.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        KeyError: If a successful reply has no ``'response'`` field.
    """
    print("输入问题（示例：'解释一下机器学习的基本概念'）：")
    prompt = input()

    response = requests.post(
        url="http://127.0.0.1:11434/api/generate",
        json={
            "model": "deepseek-r1:1.5b",
            "prompt": prompt,
            "stream": False,
        },
    )
    # Surface server errors by HTTP status rather than as a KeyError on
    # the missing 'response' field.
    response.raise_for_status()

    res = response.json()['response']
    print("模型回答:", res)

# Uncomment to run
# run_inference()

## 5. Complete Knowledge Base System

This cell combines all components into a complete Q&A system with vector database retrieval and LLM generation.

In [None]:
# 5.知识库.py
import chromadb
import uuid
import requests

def file_chunk_list():
    """Load the corpus file and split it into cleaned text chunks.

    Reads ``语料库.txt`` (UTF-8, current working directory), splits the
    text at every pair of consecutive newlines, strips surrounding
    whitespace from each piece, and drops pieces that end up empty. This
    keeps whitespace-only paragraphs and trailing newlines from turning
    into blank chunks.

    Returns:
        list[str]: Non-blank chunks in file order; empty for an empty file.

    Raises:
        OSError: If the corpus file cannot be opened.
    """
    with open('语料库.txt', 'r', encoding='utf-8') as fp:
        data = fp.read()

    clist = []
    for piece in data.split('\n\n'):
        piece = piece.strip()
        # An unstripped whitespace-only piece would pass a truthiness test.
        if piece:
            clist.append(piece)
    return clist

def ollama_api(text):
    """Request an embedding for *text* from the local Ollama server.

    Uses the ``nomic-embed-text`` model through the ``/api/embeddings``
    endpoint at ``127.0.0.1:11434``.

    Args:
        text: The text to embed; passed as the ``prompt`` field.

    Returns:
        Whatever the JSON response holds under ``'embedding'``.

    Raises:
        requests.HTTPError: If the server replies with a 4xx/5xx status.
        KeyError: If a successful reply lacks the ``'embedding'`` key.
    """
    res = requests.post(
        url="http://127.0.0.1:11434/api/embeddings",
        json={
            "model": "nomic-embed-text",
            "prompt": text,
        },
    )
    # An error reply has no embedding to return; raise on the HTTP status
    # so the failure is not reported as a bare KeyError.
    res.raise_for_status()
    return res.json()['embedding']

def ollama_generate_api(prompt):
    """Generate a completion for *prompt* with the local Ollama server.

    Sends a non-streaming request to the ``/api/generate`` endpoint at
    ``127.0.0.1:11434`` using the ``deepseek-llm:7b`` model.

    Args:
        prompt: The full prompt text to send to the model.

    Returns:
        The value stored under ``'response'`` in the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        KeyError: If a successful reply has no ``'response'`` key.
    """
    response = requests.post(
        url="http://127.0.0.1:11434/api/generate",
        json={
            "model": "deepseek-llm:7b",
            "prompt": prompt,
            "stream": False,
        },
    )
    # Raise on HTTP errors so a failed generation is not masked by a
    # KeyError on the lookup below.
    response.raise_for_status()
    return response.json()['response']

def initialize_knowledge_base():
    """Rebuild ``collection_v2`` from scratch with six sample documents.

    Opens the persistent ChromaDB database at ``db/chroma_demo``, deletes
    ``collection_v2`` if the deletion succeeds, recreates it, and stores
    the sample documents with embeddings obtained from ``ollama_api``.
    Prints the number of documents stored.
    """
    client = chromadb.PersistentClient(path="db/chroma_demo")

    # Best-effort removal of a previous collection so the rebuild starts
    # empty. Catch Exception rather than using a bare except, so that
    # KeyboardInterrupt and SystemExit still propagate.
    # NOTE(review): the exception raised for a missing collection appears
    # to vary between ChromaDB versions; narrow this once the installed
    # version is pinned.
    try:
        client.delete_collection("collection_v2")
    except Exception:
        pass

    collection = client.get_or_create_collection("collection_v2")

    # Sample documents (in a real system, you'd use your corpus chunks)
    documents = ["线性回归", "随机森林", "卷积神经网络 (CNN)", "支持向量机 (SVM)", "K-近邻 (KNN)", "梯度提升树 (GBDT)"]
    ids = [str(uuid.uuid4()) for _ in documents]
    embeddings = [ollama_api(text) for text in documents]

    collection.add(
        ids=ids,
        documents=documents,
        embeddings=embeddings,
    )
    print("Knowledge base initialized with", len(documents), "documents.")

def run_knowledge_base():
    """Run an interactive retrieval-augmented Q&A loop on the console.

    Rebuilds the knowledge base with ``initialize_knowledge_base()``, then
    repeatedly reads a question from standard input, embeds it with
    ``ollama_api``, retrieves the two nearest documents from
    ``collection_v2``, and prints the answer returned by
    ``ollama_generate_api``. Entering ``q`` (any case) ends the loop; blank
    input is rejected with a reminder and the loop continues.
    """
    initialize_knowledge_base()

    # The client and collection are the same for every question, so open
    # them once instead of on each loop iteration.
    client = chromadb.PersistentClient(path="db/chroma_demo")
    collection = client.get_or_create_collection("collection_v2")

    while True:
        print("\n输入问题（输入q退出）：")
        qs = input().strip()

        if qs.lower() == 'q':
            print("退出问答系统。")
            break

        if not qs:
            print("请输入有效问题。")
            continue

        qs_embedding = ollama_api(qs)

        # Query by embedding only: ChromaDB documents a ValueError when both
        # query_embeddings and query_texts are supplied, and the stored
        # vectors come from the Ollama model, so the query vector must too.
        res = collection.query(query_embeddings=[qs_embedding], n_results=2)
        # One query was sent, so its matches are the first (only) entry.
        result = res["documents"][0]
        context = "\n".join(result)
        # NOTE(review): the clause "不要回答不在参考信息外的答案" is a double
        # negative and likely says the opposite of what is intended; the
        # prompt text is left unchanged here pending confirmation.
        prompt = f"""你是一个机器学习知识问答机器人，任务是只根据参考信息回答用户问题，不需要除了我给的参考信息之外的信息辅助，如果我给的参考信息没有这方面的知识，请回复"不知道"，不要去杜撰任何信息，不要回答不在参考信息外的答案，所有回答请用中文回答
        参考信息:{context}，来回答问题:{qs}"""

        answer = ollama_generate_api(prompt)
        print("\n回答:", answer)

# Uncomment to run the complete system
# run_knowledge_base()

## How to Use This Notebook

1. Run the cells in order to initialize all components
2. The last cell contains the complete knowledge base system - uncomment `run_knowledge_base()` to use it
3. Make sure you have:
   - Ollama running locally
   - The required models downloaded (`nomic-embed-text` for embeddings, `deepseek-r1:1.5b` for section 4, and `deepseek-llm:7b` for section 5)
   - A UTF-8 encoded `语料库.txt` file in the same directory, with chunks separated by blank lines
   - ChromaDB and Requests installed (`pip install chromadb requests`)