In [None]:
import numpy as np
import pandas as pd
import requests
from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models

# Q1. Embedding the query
print("=== Q1. Embedding the query ===")

# Load the embedding model used for Q1-Q4.
embedding_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")

# The query is the only text passed to embed(), so the first vector produced
# is its embedding.
query = 'I just discovered the course. Can I join now?'
query_embedding = next(iter(embedding_model.embed([query])))

# Report the vector's shape and its smallest component (the Q1 answer).
print(f"嵌入向量大小: {query_embedding.shape}")
print(f"最小值: {query_embedding.min():.3f}")

# Print the L2 norm; a value of 1.000 indicates the vector is normalized,
# which the later cells rely on when they use a dot product as cosine similarity.
print(f"向量長度: {np.linalg.norm(query_embedding):.3f}")

In [None]:
# Q2. Cosine similarity with another vector
print("\n=== Q2. Cosine similarity with another vector ===")

# Embed a second sentence with the same model used for the query.
doc = 'Can I still join the course after the start date?'
doc_embedding = next(iter(embedding_model.embed([doc])))

# The plain dot product is used as the cosine similarity; this is only valid
# for unit-length vectors (Q1 prints the query vector's norm as a check).
cosine_similarity = query_embedding @ doc_embedding
print(f"餘弦相似度: {cosine_similarity:.3f}")


In [None]:
# Q3. Ranking by cosine
print("\n=== Q3. Ranking by cosine ===")

# Five FAQ entries from the data-engineering-zoomcamp course, ranked below
# against the Q1 query.
documents = [
    {
        'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
        'section': 'General course-related questions',
        'question': 'Course - Can I still join the course after the start date?',
        'course': 'data-engineering-zoomcamp',
    },
    {
        'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
        'section': 'General course-related questions',
        'question': 'Course - Can I follow the course after it finishes?',
        'course': 'data-engineering-zoomcamp',
    },
    {
        'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first Office Hours live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon't forget to register in DataTalks.Club's Slack and join the channel.",
        'section': 'General course-related questions',
        'question': 'Course - When will the course start?',
        'course': 'data-engineering-zoomcamp',
    },
    {
        'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
        'section': 'General course-related questions',
        'question': 'Course - What can I do before the course starts?',
        'course': 'data-engineering-zoomcamp',
    },
    {
        'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
        'section': 'General course-related questions',
        'question': 'How can we contribute to the course?',
        'course': 'data-engineering-zoomcamp',
    },
]

# Embed each answer text on its own (one embed() call per document).
text_embeddings = [
    next(iter(embedding_model.embed([faq['text']])))
    for faq in documents
]

# Stack the vectors into a (documents x dimensions) matrix; one matrix-vector
# product then yields the similarity of every document to the query.
V = np.stack(text_embeddings)
similarities = V @ query_embedding

print("各文檔與查詢的相似度:")
for idx, score in enumerate(similarities):
    print(f"文檔 {idx}: {score:.3f}")

# Position of the best-matching document (the Q3 answer).
highest_sim_index = similarities.argmax()
print(f"最高相似度的文檔索引: {highest_sim_index}")


In [None]:

# Q4. Ranking by cosine, version two
print("\n=== Q4. Ranking by cosine, version two ===")

# This time embed the question and the answer text together, joined by a space.
full_text_embeddings = [
    next(iter(embedding_model.embed([f"{faq['question']} {faq['text']}"])))
    for faq in documents
]

# Same ranking procedure as Q3, applied to the combined-text vectors.
V_full = np.stack(full_text_embeddings)
similarities_full = V_full @ query_embedding

print("各文檔 (question+text) 與查詢的相似度:")
for idx, score in enumerate(similarities_full):
    print(f"文檔 {idx}: {score:.3f}")

highest_sim_index_full = similarities_full.argmax()
print(f"最高相似度的文檔索引: {highest_sim_index_full}")

# Compare against the Q3 winner and report whether the ranking changed.
if highest_sim_index == highest_sim_index_full:
    print("結果相同")
else:
    print(f"結果不同！Q3: {highest_sim_index}, Q4: {highest_sim_index_full}")
    print("原因：包含問題文本提供了更多上下文信息，提高了相關性匹配")



In [None]:
# Q5. Selecting the embedding model
print("\n=== Q5. Selecting the embedding model ===")

# list_supported_models() is called on the class itself, so no model instance
# is created here (the original author notes this avoids downloading a model).
supported_models = TextEmbedding.list_supported_models()

# A DataFrame makes the model catalogue easier to read and query.
df = pd.DataFrame(supported_models)

print(f"總共支援 {len(supported_models)} 個模型")

# Show only the core columns: model name, embedding dimension, size on disk.
print("\n模型列表 (前 15 個):")
print(df[['model', 'dim', 'size_in_GB']].head(15))

# Collect the distinct dimensions as plain Python ints. Without .tolist() the
# list holds NumPy scalars, which NumPy >= 2 prints as "np.int64(384)" inside
# a list and makes the output below hard to read.
dimensions = sorted(df['dim'].unique().tolist())
min_dimension = min(dimensions)

print(f"\n所有維度: {dimensions}")
print(f"最小維度: {min_dimension}")

# Answer choices offered by the homework, shown next to the computed answer.
options = [128, 256, 384, 512]
print(f"作業選項: {options}")
print(f"Q5 答案: {min_dimension}")



In [None]:
# Q6. Indexing with qdrant
print("\n=== Q6. Indexing with qdrant ===")

# Download the FAQ dump. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces an HTTP error directly
# instead of letting .json() fail on an error page.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Keep only the Machine Learning Zoomcamp entries and tag each one with its
# course name. Each entry is copied so documents_raw is left unmodified.
ml_documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    if course['course'] == 'machine-learning-zoomcamp'
    for doc in course['documents']
]

print(f"載入了 {len(ml_documents)} 個 ML Zoomcamp 文檔")

# Connect to the Qdrant instance on localhost.
qd_client = QdrantClient("http://localhost:6333")

# Embedding model used for the Qdrant collection.
model_handle = "BAAI/bge-small-en"

# Embed a throwaway string to discover the model's output dimensionality.
small_model = TextEmbedding(model_name=model_handle)
test_emb = next(iter(small_model.embed(["test"])))
EMBEDDING_DIMENSIONALITY = test_emb.shape[0]
print(f"使用模型: {model_handle}")
print(f"模型維度: {EMBEDDING_DIMENSIONALITY}")

collection_name = "ml-zoomcamp-faq"

# Start from a clean slate: drop the collection only when it is actually
# present. An explicit existence check replaces the former bare `except: pass`,
# which hid every failure (including connection errors and Ctrl-C) and could
# report a deletion that never happened.
if qd_client.collection_exists(collection_name=collection_name):
    qd_client.delete_collection(collection_name=collection_name)
    print("刪除了舊的集合")

# Create the collection sized for the embedding model, compared by cosine distance.
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)
print("創建了新集合")

# Build one point per FAQ entry. The vector is given as a Document (raw text
# plus model name) rather than a precomputed embedding. The model name comes
# from model_handle — the same value used to size the collection — so the
# indexed vectors and the collection cannot silently drift apart.
points = [
    models.PointStruct(
        id=idx,
        vector=models.Document(
            text=doc['question'] + ' ' + doc['text'],
            model=model_handle,
        ),
        payload=doc,
    )
    for idx, doc in enumerate(ml_documents)
]

# Insert all points in a single upsert call.
qd_client.upsert(
    collection_name=collection_name,
    points=points
)
print(f"插入了 {len(points)} 個點")

# Vector search helper for the ML Zoomcamp collection.
def vector_search_ml(question, limit=5):
    """Search the FAQ collection for entries similar to ``question``.

    Args:
        question: Free-text query; it is sent to Qdrant as a Document together
            with the embedding model name.
        limit: Maximum number of hits to request (defaults to 5, the value
            that was previously hard-coded).

    Returns:
        A ``(payloads, points)`` tuple: the payload of every returned point,
        and the returned point objects themselves (which carry ``.score``).
    """
    print('使用 vector_search_ml')

    # model_handle is the same model name used when sizing the collection;
    # reusing it here keeps query-time and collection settings in sync.
    search_results = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle
        ),
        limit=limit,
        with_payload=True
    )

    hits = search_results.points
    return [hit.payload for hit in hits], hits

def build_prompt(query, search_results):
    """Render the teaching-assistant prompt for ``query``.

    Args:
        query: The user's question, substituted into the QUESTION slot.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each becomes one entry in the CONTEXT section.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One entry per search hit, each terminated by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    # The final strip() drops the blank line left after the last entry.
    return template.format(question=query, context="".join(entries)).strip()

# Query the Ollama server running on localhost.
def llm(prompt):
    """Send ``prompt`` to the local Ollama ``llama3.2`` model.

    Failures are not raised: the function returns a descriptive error string
    instead, so the caller always receives text.

    Args:
        prompt: The full prompt to send.

    Returns:
        The ``response`` field of Ollama's JSON reply on HTTP 200, otherwise
        a message describing the HTTP status, connection error, or other error.
    """
    payload = {
        'model': 'llama3.2',
        'prompt': prompt,
        'stream': False,  # request a single JSON reply rather than a stream
    }

    try:
        response = requests.post(
            'http://localhost:11434/api/generate',
            json=payload,
            timeout=60,
        )

        if response.status_code != 200:
            return f"Ollama 錯誤：HTTP {response.status_code}"

        return response.json()['response']
    except requests.exceptions.RequestException as err:
        return f"連線錯誤：{err}"
    except Exception as err:
        return f"未知錯誤：{err}"

def rag_ml(query):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves FAQ entries via ``vector_search_ml``, builds a prompt from their
    payloads, and sends it to ``llm``.

    Returns:
        An ``(answer, raw_results)`` tuple: the LLM's text and the raw search
        points returned by ``vector_search_ml``.
    """
    payloads, scored_points = vector_search_ml(query)
    answer = llm(build_prompt(query, payloads))
    return answer, scored_points

# Run the query from Q1 through the complete RAG pipeline (search + LLM).
query_text = 'I just discovered the course. Can I join now?'
print(f"查詢問題: {query_text}")

answer, raw_results = rag_ml(query_text)

print("RAG 系統回答:")
print(answer)
print()

# List every hit with its score, its question, and the first 100 characters
# of its answer text.
print("向量搜尋結果:")
for rank, hit in enumerate(raw_results, start=1):
    print(f"結果 {rank}: 分數 = {hit.score:.3f}")
    print(f"問題: {hit.payload['question']}")
    print(f"文本: {hit.payload['text'][:100]}...")
    print()

# The first hit is taken as the best match (presumably Qdrant returns hits in
# descending score order — confirm); fall back to 0 when nothing came back.
if raw_results:
    highest_score = raw_results[0].score
else:
    highest_score = 0
print(f"最高分數: {highest_score:.3f}")

# Recap of every homework answer, each taken from the cell that computed it.
print("\n=== 作業答案總結 ===")
print(f"Q1. 最小值: {np.min(query_embedding):.3f}")
print(f"Q2. 餘弦相似度: {cosine_similarity:.3f}")
print(f"Q3. 最高相似度文檔索引: {highest_sim_index}")
print(f"Q4. 最高相似度文檔索引: {highest_sim_index_full}")
# Q5 asks for the smallest dimensionality across all supported models, which
# is min_dimension from the Q5 cell. EMBEDDING_DIMENSIONALITY is the size of
# the one model chosen for Q6 and only matches by coincidence.
print(f"Q5. 最小維度: {min_dimension}")
print(f"Q6. 最高分數: {highest_score:.3f}")