In [None]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# 假設我們有三個文檔
documents = [
    "I love machine learning and deep learning.",
    "Machine learning is great for AI applications.",
    "Deep learning improves AI performance."
]

# 建立 TF-IDF 向量器
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# 轉換為可讀的矩陣格式
print(tfidf_matrix.toarray())

# 顯示對應的詞彙
print(vectorizer.get_feature_names_out())

# 顯示了每個單詞在不同文檔中的 TF-IDF 值，值越大代表該詞對該文檔的重要性越高。

[[0.         0.46869865 0.         0.3564574  0.         0.
  0.         0.         0.55364194 0.46869865 0.3564574  0.        ]
 [0.32412354 0.         0.4261835  0.         0.4261835  0.4261835
  0.         0.4261835  0.25171084 0.         0.32412354 0.        ]
 [0.40619178 0.         0.         0.40619178 0.         0.
  0.53409337 0.         0.31544415 0.         0.         0.53409337]]
['ai' 'and' 'applications' 'deep' 'for' 'great' 'improves' 'is' 'learning'
 'love' 'machine' 'performance']


In [26]:
# TF-IDF + 向量搜尋
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
import numpy as np

# 文字數據庫
documents = [
    "機器學習是人工智慧的一部分。",
    "深度學習是一種機器學習方法。",
    "自然語言處理讓機器理解語言。",
    "搜尋引擎依賴於關鍵字匹配和語義搜尋。"
]

# 🔹 TF-IDF 關鍵字搜尋
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(documents)

# 🔹 向量搜尋（BERT）
model = SentenceTransformer('all-MiniLM-L6-v2')  # 輕量級的語意向量（Sentence Embedding）模型
vector_embeddings = model.encode(documents)

# 🔹 用戶查詢
query = "AI學習"
query_embedding = model.encode(query)

# 計算向量相似度
cosine_scores = util.pytorch_cos_sim(query_embedding, vector_embeddings)

# 輸出最相似的結果
top_result = np.argmax(cosine_scores.numpy())
print("最佳匹配結果:", documents[top_result])


最佳匹配結果: 機器學習是人工智慧的一部分。


In [27]:
tfidf_matrix.toarray()

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]])

In [28]:
vector_embeddings.shape

(4, 384)

In [29]:
query_embedding

array([-1.67366024e-02,  2.96204239e-02,  8.43450502e-02, -7.01751560e-03,
        3.63265164e-02,  2.30699070e-02,  1.02762341e-01,  2.63543800e-02,
        6.61706403e-02, -3.70286452e-03,  9.89213288e-02, -8.79850015e-02,
        5.88534474e-02, -4.63246591e-02, -5.71580604e-02,  1.93370543e-02,
       -4.19089608e-02,  3.62043642e-02, -6.08672537e-02, -3.35762314e-02,
       -7.40542412e-02,  1.26944901e-02,  2.06775740e-02,  2.08017677e-02,
       -2.09222101e-02,  7.01760426e-02, -4.59699659e-03, -2.28465255e-02,
        3.40544991e-02, -4.90138792e-02, -1.16077717e-02,  2.63286438e-02,
        4.82034311e-02, -5.11266440e-02,  2.31892671e-02,  3.26707475e-02,
       -4.24242020e-02, -5.91811761e-02,  7.03726113e-02,  4.73642200e-02,
       -2.74939071e-02, -6.43637851e-02,  4.27382365e-02, -1.48509264e-01,
        6.37557283e-02,  9.09825340e-02, -5.74565195e-02, -2.46460214e-02,
        7.51867285e-03,  1.82010327e-02, -1.19300142e-01,  2.09909715e-02,
       -2.63982154e-02,  

In [30]:
cosine_scores

tensor([[0.4542, 0.4536, 0.3935, 0.4256]])

In [31]:
np.random.choice([0, 1, 2])

2

In [2]:
# SentenceTransformer
from sentence_transformers import SentenceTransformer
# 文字轉向量（Embedding）
model = SentenceTransformer("all-MiniLM-L6-v2")  # 載入模型
embeddings = model.encode(["你好", "今天天氣很好"])  # 轉換向量
print(embeddings.shape)  # (2, 384) -> 兩個句子，每個向量 384 維

(2, 384)


In [3]:
# 計算文本相似度 
from sklearn.metrics.pairwise import cosine_similarity

sim_score = cosine_similarity([embeddings[0]], [embeddings[1]])  
print(sim_score)  # 0~1 之間，數值越高表示語義越相似

[[0.5280443]]
