# **Combining Word2Vec and TF-IDF**

In [None]:
import json
import os
import py_vncorenlp
import numpy as np
import re
import string
import joblib
from gensim.models import Word2Vec

from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from sklearn.preprocessing import normalize

In [None]:
# Filesystem layout: the project root is the parent of the current working
# directory.
current_dir = os.getcwd()
PROJECT_DIR = os.path.dirname(current_dir)

# Input data and saved vectorizer artifacts live under the project root.
DATASET_DIR = os.path.join(PROJECT_DIR, "dataset")
VECTORIZER_DIR = os.path.join(PROJECT_DIR, "vectorizer")
data_path = os.path.join(DATASET_DIR, "processed_legal_corpus.json")

# Machine-specific: directory passed to VnCoreNLP as `save_dir`. Change it to
# wherever the VnCoreNLP files are stored on your machine.
MODEL_DIR = "D:/VnCoreNLP"

In [None]:
with open(data_path, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Two parallel lists built from the corpus:
#   content[i] - cleaned text of one article
#   index[i]   - its key, e.g. "01/2009/tt-bnn 1" (law_id + " " + article_id)
content = []
index = []

for document in data:
    law_id = document["law_id"]
    for article in document["articles"]:
        # Drop every whitespace-separated token that occurs inside
        # string.punctuation (a substring test against that string).
        kept_tokens = filter(
            lambda token: token not in string.punctuation,
            article["processed_in4"].split(),
        )
        content.append(" ".join(kept_tokens))
        index.append(" ".join([law_id, article["article_id"]]))

# Both lists are expected to have the same length.
print(len(content), len(index), sep="\n")


In [None]:
# Whitespace-tokenize each cleaned article; this is the Word2Vec training input.
tokenized_sentences = list(map(str.split, content))

In [None]:
# Train Word2Vec on the tokenized corpus.
# NOTE(review): vector_size=10000 is two orders of magnitude above gensim's
# default of 100 and fixes the dimensionality of every vector stored in Qdrant
# later on - confirm this is intended.
w2v_model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=10000,
    window=5,
    min_count=1,  # keep every token, including those seen only once
    sg=0,  # 0 = CBOW (used here); set sg=1 to train skip-gram instead
    workers=5,
    epochs=10,
)

# Make sure the output directory exists so save() does not fail when the
# vectorizer folder has not been created yet.
os.makedirs(VECTORIZER_DIR, exist_ok=True)
w2v_model.save(os.path.join(VECTORIZER_DIR, "w2v.model"))

In [None]:
# Reload the saved artifacts for use in the following cells.
w2v_model_path = os.path.join(VECTORIZER_DIR, "w2v.model")
tfidf_vectorizer_path = os.path.join(VECTORIZER_DIR, 'tfidf_vectorizer.pkl')

w2v_model = Word2Vec.load(w2v_model_path)
# NOTE: joblib.load unpickles the file - only load artifacts from a trusted source.
tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)

In [None]:
def get_weighted_sentence_vector(sentence, w2v_model, tfidf_vectorizer):
    """Return the TF-IDF-weighted average of a sentence's Word2Vec vectors.

    The sentence is split on whitespace. A token contributes only when it is
    present in ``w2v_model.wv`` and has a non-zero TF-IDF score for this
    sentence. Every occurrence of a token contributes, so repeated tokens are
    counted once per occurrence.

    Args:
        sentence: Whitespace-tokenized text.
        w2v_model: Object exposing ``wv`` (token -> vector lookup that also
            supports ``in``) and ``vector_size``.
        tfidf_vectorizer: Fitted vectorizer exposing ``transform`` and the
            ``vocabulary_`` term -> column mapping.

    Returns:
        ``sum(weight * vector) / sum(weight)`` over the contributing tokens,
        or a zero vector of length ``w2v_model.vector_size`` when no token
        contributes.
    """
    tfidf_row = tfidf_vectorizer.transform([sentence])
    # Column index -> TF-IDF weight, for the non-zero entries of this sentence.
    weight_by_column = {
        int(col): tfidf_row[0, col] for col in tfidf_row.nonzero()[1]
    }
    # Look terms up through the fitted term -> column mapping instead of
    # get_feature_names_out(), which rebuilds the whole vocabulary array on
    # every call and dominates the cost when embedding a large corpus.
    vocabulary = tfidf_vectorizer.vocabulary_
    keyed_vectors = w2v_model.wv

    word_vecs = []
    weights = []
    for word in sentence.split():
        # Unknown terms map to None, which is never a key of weight_by_column.
        weight = weight_by_column.get(vocabulary.get(word))
        if weight is None or word not in keyed_vectors:
            continue
        word_vecs.append(keyed_vectors[word] * weight)
        weights.append(weight)

    if not word_vecs:
        return np.zeros(w2v_model.vector_size)
    return np.sum(word_vecs, axis=0) / np.sum(weights)

In [None]:
# content: list of cleaned article texts, one string per article
# tokenized_sentences: list of token lists, one per entry in content
# index: "law_id article_id" key for each entry in content, used to build the Qdrant payload

In [None]:
# 1. Embed every article, then normalize the resulting vectors
sentence_vectors = np.array(
    [
        get_weighted_sentence_vector(article_text, w2v_model, tfidf_vectorizer)
        for article_text in content
    ]
)
normalized_vectors = normalize(sentence_vectors)
print(normalized_vectors.shape)

# 2. Connect to Qdrant
client = QdrantClient(host="localhost", port=6333)

# 3. (Re)create the collection; vectors are compared with cosine similarity
#    and the vector size is taken from the embedding matrix.
vector_dim = normalized_vectors.shape[1]
client.recreate_collection(
    collection_name="Word2Vec_Law_document_retrivial",
    vectors_config=VectorParams(size=vector_dim, distance=Distance.COSINE),
)

# 4. Build the data to insert.
#    Each index entry is "law_id article_id"; its first two space-separated
#    fields become the payload.
payloads = [
    {"law_id": fields[0], "article_id": fields[1]}
    for fields in (key.split(" ") for key in index)
]
#    One point per article; the point id is the article's position in content.
points = [
    {"id": point_id, "vector": vector, "payload": payload}
    for point_id, (vector, payload) in enumerate(zip(normalized_vectors, payloads))
]

print(len(points))
print(len(payloads))

In [None]:
# Upload the points to the Qdrant collection in batches.
# Each vector is converted to a plain list of Python floats just before its
# batch is sent: the client validates and serialises points for the HTTP API,
# and NumPy rows are not reliably accepted there. Converting per batch (rather
# than for all points up front) keeps only one batch of Python lists in memory.
UPLOAD_BATCH_SIZE = 100
for start in range(0, len(points), UPLOAD_BATCH_SIZE):
    batch = [
        {**point, "vector": np.asarray(point["vector"]).tolist()}
        for point in points[start:start + UPLOAD_BATCH_SIZE]
    ]
    client.upsert(collection_name="Word2Vec_Law_document_retrivial", points=batch)

# **USING**

In [None]:
## Initialize the word-segmentation model and build the stop-word removal pattern
vncorenlp_model = py_vncorenlp.VnCoreNLP(save_dir=MODEL_DIR)

# One stop word per line; surrounding whitespace is stripped.
stopwords_path = os.path.join(DATASET_DIR, "stopwords_processed.txt")
with open(stopwords_path, "r", encoding="utf-8") as f:
    stopwords_list = [line.strip() for line in f]

# A single alternation of all (regex-escaped) stop words, anchored on word
# boundaries so that only whole tokens match.
stopword_alternation = "|".join(re.escape(word) for word in stopwords_list)
pattern = r"\b(" + stopword_alternation + r")\b"

In [None]:
def processing_query(query, segmentation_model, stopword_pattern=None):
    """Segment a raw query and strip stop words and punctuation tokens.

    Args:
        query: Raw query text.
        segmentation_model: Object whose ``word_segment(query)`` returns an
            iterable of strings; they are joined with single spaces.
        stopword_pattern: Regex (string or compiled) whose matches are
            removed. Defaults to the module-level ``pattern``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stopword_pattern is None:
        # Fall back to the notebook-wide stop-word regex.
        stopword_pattern = pattern

    # Use the model that was passed in rather than a global instance.
    segments = segmentation_model.word_segment(query)
    text = " ".join(segments)
    text = re.sub(stopword_pattern, "", text)  # remove stop words
    text = re.sub(r"\s+", " ", text).strip()  # collapse leftover whitespace
    # Same punctuation filter as the corpus preprocessing: drop tokens that
    # occur inside string.punctuation.
    tokens = [token for token in text.split() if token not in string.punctuation]
    return " ".join(tokens)


In [None]:
# 5. Take the query and turn it into a normalized vector
query = "Đập phá biển báo “khu vực biên giới” bị phạt thế nào?"
clean_query = processing_query(query, vncorenlp_model)
raw_query_vec = get_weighted_sentence_vector(clean_query, w2v_model, tfidf_vectorizer)
# normalize() expects a 2-D array, so wrap the vector as a single row and unwrap it again.
query_matrix = normalize(raw_query_vec.reshape(1, -1))
query_vec = query_matrix[0]

# 6. Query Qdrant for the 10 closest articles
hits = client.search(
    collection_name="Word2Vec_Law_document_retrivial",
    query_vector=query_vec,
    limit=10,
)

# 7. Print the results
for hit in hits:
    payload = hit.payload
    print(f"Score: {hit.score:.4f} | law_id: {payload['law_id']} | article_id: {payload['article_id']} ")