In [5]:
import pickle
from pyvi import ViTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# --- Đọc danh sách stopwords ---
def load_stopwords(filepath="../../../dataset\\sub_model\\vietnamese-stopwords-dash.txt"):
    return set(open(filepath, encoding="utf-8").read().split("\n"))

# --- Tải model ---
def load_model(filename="model_clustering_fac.pkl"):
    with open(filename, "rb") as f:
        return pickle.load(f)

In [6]:
# --- Hàm xử lý token ---
def get_token(text, vietnamese_stopwords):
    text_tokens = ViTokenizer.tokenize(text).split()
    filtered_text = [word for word in text_tokens if word not in vietnamese_stopwords]
    return filtered_text

# --- Tìm cụm phù hợp cho tiện ích ---
def find_best_clusters(user_input, bm25, vectorizer, tfidf_matrix, cluster_texts, vietnamese_stopwords):
    # Tiền xử lý user input
    processed_user_input = [" ".join(get_token(query, vietnamese_stopwords)) for query in user_input]

    # BM25 Scores
    bm25_scores = {cluster: 0 for cluster in cluster_texts.keys()}
    for query in processed_user_input:
        scores = bm25.get_scores(query.split())
        for i, cluster in enumerate(cluster_texts.keys()):
            bm25_scores[cluster] += scores[i]

    # Cosine Similarity Scores
    cosine_scores = {cluster: 0 for cluster in cluster_texts.keys()}
    for query in processed_user_input:
        query_vec = vectorizer.transform([query])
        scores = cosine_similarity(query_vec, tfidf_matrix)[0]

        for i, cluster in enumerate(cluster_texts.keys()):
            cosine_scores[cluster] += scores[i]

    # Kết hợp điểm BM25 + Cosine Similarity
    final_scores = {cluster: bm25_scores[cluster] + cosine_scores[cluster] for cluster in cluster_texts.keys()}

    # Sắp xếp cụm theo độ tương đồng
    sorted_clusters = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_clusters
    
# Tải model đã lưu
bm25, vectorizer, tfidf_matrix, cluster_texts = load_model()

vietnamese_stopwords = load_stopwords()

def find_clusters(user_input): # user_input: list of queries 
    sorted_clusters = find_best_clusters(user_input, bm25, vectorizer, tfidf_matrix, cluster_texts, vietnamese_stopwords)
    return sorted_clusters


In [7]:
find_clusters("TV màn hình phẳng, két an toàn")

[('5', 0.0),
 ('1', 0.0),
 ('12', 0.0),
 ('11', 0.0),
 ('17', 0.0),
 ('7', 0.0),
 ('16', 0.0),
 ('15', 0.0),
 ('2', 0.0),
 ('8', 0.0),
 ('13', 0.0),
 ('3', 0.0),
 ('10', 0.0),
 ('14', 0.0),
 ('6', 0.0),
 ('4', 0.0),
 ('18', 0.0),
 ('9', 0.0)]

In [8]:
import pickle
from pyvi import ViTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# --- Đọc danh sách stopwords ---
def load_stopwords(filepath="D:\\graduate_dissertation\\code\\processing\\vietnamese-stopwords-dash.txt"):
    return set(open(filepath, encoding="utf-8").read().split("\n"))

# --- Tải model ---
def load_model(filename="D:\\graduate_dissertation\\code\\processing\\model_clustering_fac.pkl"):
    with open(filename, "rb") as f:
        return pickle.load(f)

# --- Hàm xử lý token ---
def get_token(text, vietnamese_stopwords):
    text_tokens = ViTokenizer.tokenize(text).split()
    filtered_text = [word for word in text_tokens if word not in vietnamese_stopwords]
    return filtered_text

# --- Tìm cụm phù hợp cho tiện ích ---
def find_best_clusters(user_input, bm25, vectorizer, tfidf_matrix, cluster_texts, vietnamese_stopwords):
    # Tiền xử lý user input
    processed_user_input = [" ".join(get_token(query, vietnamese_stopwords)) for query in user_input]

    # BM25 Scores
    bm25_scores = {cluster: 0 for cluster in cluster_texts.keys()}
    for query in processed_user_input:
        scores = bm25.get_scores(query.split())
        for i, cluster in enumerate(cluster_texts.keys()):
            bm25_scores[cluster] += scores[i]

    # Cosine Similarity Scores
    cosine_scores = {cluster: 0 for cluster in cluster_texts.keys()}
    for query in processed_user_input:
        query_vec = vectorizer.transform([query])
        scores = cosine_similarity(query_vec, tfidf_matrix)[0]

        for i, cluster in enumerate(cluster_texts.keys()):
            cosine_scores[cluster] += scores[i]

    # Kết hợp điểm BM25 + Cosine Similarity
    final_scores = {cluster: bm25_scores[cluster] + cosine_scores[cluster] for cluster in cluster_texts.keys()}

    # Sắp xếp cụm theo độ tương đồng
    sorted_clusters = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_clusters
    
# Tải model đã lưu
bm25, vectorizer, tfidf_matrix, cluster_texts = load_model()

vietnamese_stopwords = load_stopwords()

def fit(user_input): # user_input: list of queries 
    sorted_clusters = find_best_clusters(user_input, bm25, vectorizer, tfidf_matrix, cluster_texts, vietnamese_stopwords)
    return sorted_clusters


In [10]:
find_clusters(['két an toàn'])

[('8', 4.109821587166924),
 ('1', 2.419235866355204),
 ('6', 1.8557057600004014),
 ('5', 0.0),
 ('12', 0.0),
 ('11', 0.0),
 ('17', 0.0),
 ('7', 0.0),
 ('16', 0.0),
 ('15', 0.0),
 ('2', 0.0),
 ('13', 0.0),
 ('3', 0.0),
 ('10', 0.0),
 ('14', 0.0),
 ('4', 0.0),
 ('18', 0.0),
 ('9', 0.0)]