In [None]:
import json
import numpy as np
from pyvi import ViTokenizer
from underthesea import sent_tokenize
from gensim.models import KeyedVectors
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from tqdm import tqdm


In [None]:
# Load the dataset: a JSONL file with one JSON object per line.
data_json = []
with open('/content/drive/MyDrive/data/data_TL_check.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # A blank line (commonly a trailing newline at end of file) is not valid
        # JSON and would make json.loads raise, so skip it instead of crashing.
        if not line:
            continue
        data_json.append(json.loads(line))

In [None]:
# Load the pretrained word vectors with gensim's word2vec-format loader
# (called with its defaults, i.e. no explicit `binary=` argument).
w2v = KeyedVectors.load_word2vec_format("/content/drive/MyDrive/data/vi.vec")
# Keys known to the model, taken from `index_to_key`.
# NOTE(review): in gensim 4.x this is presumably a plain list, so `word in vocab`
# is a linear scan; `w2v.key_to_index` gives a constant-time lookup — confirm
# against the installed gensim version.
vocab = w2v.index_to_key

In [None]:
summaries = []

# Loop invariants, hoisted out of the per-article loop.
max_summary_sentences = 5          # upper bound on sentences per summary
embedding_dim = w2v.vector_size    # use the model's real width, not a hard-coded 100
known_words = w2v.key_to_index     # dict: constant-time membership test per token

# Summarize each article: embed sentences, cluster them with KMeans, and keep
# the sentence nearest to each cluster centre.
for item in tqdm(data_json, desc="Summarizing articles with KMeans"):
    # `or ""` also covers an explicit JSON null, which would otherwise make
    # `.strip()` raise AttributeError.
    content = item.get('content') or ""
    if not content.strip():
        summaries.append({"content": content, "summary": ""})
        continue

    # The text is lowercased before sentence splitting, so the emitted summary
    # is lowercase as well; newlines are turned into sentence boundaries.
    sentences = sent_tokenize(content.lower().replace('\n', '. ').strip())
    sentences = [s.strip() for s in sentences if s.strip()]
    if not sentences:
        summaries.append({"content": content, "summary": ""})
        continue

    # Sentence representation: sum of the vectors of its in-vocabulary tokens.
    # A sentence with no known token stays an all-zero row.
    X = np.zeros((len(sentences), embedding_dim))
    for row_idx, s in enumerate(sentences):
        for w in ViTokenizer.tokenize(s).split():
            if w in known_words:
                X[row_idx] += w2v[w]

    # Number of clusters (= sentences in the summary): at most
    # `max_summary_sentences`, and never more than the number of distinct
    # sentence vectors, otherwise KMeans cannot form that many distinct clusters.
    n_distinct = len(np.unique(X, axis=0))
    n_clusters = min(max_summary_sentences, n_distinct)
    kmeans = KMeans(n_clusters=n_clusters, n_init=5, random_state=42).fit(X)

    # For each cluster centre, pick the index of the closest sentence vector.
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)

    # Restore document order; de-duplicate because two centres may resolve to
    # the same nearest sentence, which would repeat it in the summary.
    summary_sentence_idx = sorted(set(closest.tolist()))
    summary_sentences = [sentences[i] for i in summary_sentence_idx]
    summary = ' '.join(summary_sentences)
    summaries.append({"content": content, "summary": summary})

In [None]:

# Preview the first five articles: truncated content followed by its summary.
for article_no, entry in enumerate(summaries[:5], start=1):
    print(f"\n==== Article {article_no} ====")
    full_text = entry['content']
    # Only show an ellipsis when the content was actually cut at 500 characters.
    truncation_mark = "..." if len(full_text) > 500 else ""
    print("Content:\n", full_text[:500], truncation_mark)
    print("\nSummary:\n", entry['summary'])
    print("========================")