In [1]:
import numpy as np
import random
import os
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from collections import defaultdict
os.environ["OMP_NUM_THREADS"] = "1"

In [2]:
def process_text_segment(text):
    """Turn a fragment of text into a single underscore-joined token.

    Surrounding whitespace is stripped and every space character that remains
    is replaced by ``_``.

    Args:
        text: The string fragment to normalise.

    Returns:
        The normalised token, or ``None`` when the fragment is empty or
        consists only of whitespace.
    """
    trimmed = text.strip()
    if not trimmed:
        return None
    return trimmed.replace(" ", "_")

In [3]:
def parse_segmented_corpus(file_path):
    """Parse a phrase-tagged corpus file into one token list per line.

    Every non-empty line is scanned for spans of the form
    ``<phrase_Q=...>some phrase</phrase>``. The text inside a span becomes one
    token with its spaces replaced by underscores. Each stretch of text between
    spans (or the whole line when it has no span) is passed through
    ``process_text_segment`` and, when that yields a token, appended as a
    single token as well.

    A span that is malformed (its header has no closing ``>``, or the
    ``</phrase>`` terminator is missing) is treated as plain text running from
    its opening tag to the end of the line.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        list[list[str]]: The token lists of all lines that produced at least
        one token, in file order.
    """
    phrase_open = "<phrase_Q="
    phrase_close = "</phrase>"
    processed_sentences = []

    def add_plain_text(tokens, text):
        # process_text_segment returns None for whitespace-only text.
        token = process_text_segment(text)
        if token:
            tokens.append(token)

    with open(file_path, "r", encoding="utf-8") as f:
        # Stream the file line by line instead of loading it all via readlines().
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            tokens = []
            current_pos = 0
            line_length = len(line)

            while current_pos < line_length:
                next_phrase_start = line.find(phrase_open, current_pos)

                if next_phrase_start == -1:
                    # No further phrase tags: the rest of the line is plain text.
                    add_plain_text(tokens, line[current_pos:])
                    break

                if next_phrase_start > current_pos:
                    add_plain_text(tokens, line[current_pos:next_phrase_start])

                phrase_header_end = line.find(">", next_phrase_start)
                phrase_end = -1
                if phrase_header_end != -1:
                    phrase_end = line.find(phrase_close, phrase_header_end)

                if phrase_end == -1:
                    # Malformed span. Resume from the opening tag rather than
                    # from current_pos: the text before the tag was already
                    # emitted just above, so slicing from current_pos would
                    # add it to the sentence a second time.
                    add_plain_text(tokens, line[next_phrase_start:])
                    break

                phrase = line[phrase_header_end + 1:phrase_end]
                if phrase:
                    tokens.append(phrase.replace(" ", "_"))

                current_pos = phrase_end + len(phrase_close)

            if tokens:
                processed_sentences.append(tokens)

    return processed_sentences

In [4]:
def train_word2vec(sentences, vector_size=100, window=5, min_count=1, workers=4, seed=1):
    """Train a gensim ``Word2Vec`` model on tokenised sentences.

    All arguments are forwarded to the ``Word2Vec`` constructor as keyword
    arguments of the same name.

    Args:
        sentences: Iterable of token lists to train on.
        vector_size: Dimensionality of the learned vectors.
        window: Context window size.
        min_count: Minimum token frequency for a token to be kept.
        workers: Number of worker threads. Defaults to 4, the value that was
            previously hard-coded here.
        seed: Seed for gensim's random number generator. Defaults to 1, which
            is gensim's own default, so existing calls behave as before.
            NOTE(review): per the gensim documentation a fully reproducible
            run also needs ``workers=1`` (and a fixed ``PYTHONHASHSEED``).

    Returns:
        The trained ``Word2Vec`` model.
    """
    return Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=seed,
    )

In [5]:
# Location of the segmented corpus, relative to the notebook's working
# directory (presumably AutoPhrase's segmentation output for DBLP, going by
# the path -- confirm).
file_path = "./AutoPhrase/models/DBLP/segmentation.txt"

# Parse the file into one token list per line that yielded tokens.
sentences = parse_segmented_corpus(file_path)
print("Total number of sentences:", len(sentences))

# Preview the first three parsed sentences as a sanity check.
for i, sentence in enumerate(sentences[:3]):
    print(f"Sentence {i + 1}: {sentence}")

Total number of sentences: 5000
Sentence 1: ['OQL', '[', 'C++', ']:', 'Extending', 'C++', 'with_an_Object_Query_Capability.']
Sentence 2: ['Transaction_Management', 'in', 'Multidatabase_Systems', '.']
Sentence 3: ['Overview', 'of_the_ADDS_System.']


In [6]:
# Train embeddings on the parsed sentences with train_word2vec's defaults.
model = train_word2vec(sentences)
print(f"Total vocabulary size: {len(model.wv.index_to_key)}")

# Spot-check the vectors of the first two tokens of the first sentence.
# NOTE(review): these direct lookups assume every token made it into the
# vocabulary (train_word2vec defaults to min_count=1) -- confirm if that
# default is ever raised.
print("Example vectors:")
for sentence in sentences[:1]:
    for token in sentence[:2]:
        print(f"\nToken: {token}")
        print(f"Vector shape: {model.wv[token].shape}")
        print(f"First 5 dimensions: {model.wv[token][:5]}")

Total vocabulary size: 23525
Example vectors:

Token: OQL
Vector shape: (100,)
First 5 dimensions: [-0.00323956  0.00342402 -0.00773631  0.00460851  0.00436019]

Token: [
Vector shape: (100,)
First 5 dimensions: [ 0.00426744 -0.00132468  0.00022595 -0.00935194  0.00950344]


In [7]:
def perform_clustering_kmeans(model, num_clusters=5):
    """Group the model's vocabulary into clusters with k-means.

    The vector of every entry in ``model.wv.index_to_key`` is stacked into one
    array, whose shape is printed, and clustered by ``KMeans`` with
    ``random_state=42``.

    Args:
        model: Object exposing ``wv.index_to_key`` and ``wv[word]`` lookups.
        num_clusters: Number of clusters passed to ``KMeans`` as ``n_clusters``.

    Returns:
        defaultdict(list): Maps each cluster label returned by ``fit_predict``
        to the words assigned to it, in vocabulary order.
    """
    vocabulary = list(model.wv.index_to_key)
    embedding_matrix = np.array([model.wv[term] for term in vocabulary])
    print(embedding_matrix.shape)

    estimator = KMeans(n_clusters=num_clusters, random_state=42)
    labels = estimator.fit_predict(embedding_matrix)

    words_by_cluster = defaultdict(list)
    for term, label in zip(vocabulary, labels):
        words_by_cluster[label].append(term)
    return words_by_cluster

In [8]:
def perform_clustering_gmm(model, num_clusters=6):
    """Group the model's vocabulary into clusters with a Gaussian mixture.

    The vector of every entry in ``model.wv.index_to_key`` is stacked into one
    array and fitted by ``GaussianMixture`` with ``random_state=42``.

    Args:
        model: Object exposing ``wv.index_to_key`` and ``wv[word]`` lookups.
        num_clusters: Number of mixture components (``n_components``).

    Returns:
        defaultdict(list): Maps each component label returned by
        ``fit_predict`` to the words assigned to it, in vocabulary order.
    """
    vocabulary = list(model.wv.index_to_key)
    feature_matrix = np.array([model.wv[term] for term in vocabulary])

    mixture = GaussianMixture(n_components=num_clusters, random_state=42)
    assignments = mixture.fit_predict(feature_matrix)

    words_by_component = defaultdict(list)
    for term, component in zip(vocabulary, assignments):
        words_by_component[component].append(term)
    return words_by_component

In [9]:
def print_cluster_samples(cluster_dict, num_samples=20, seed=None):
    """Print a sample of words from every cluster.

    Clusters are listed in ascending key order and labelled ``key + 1``.
    Underscores in the words are shown as spaces, five words per row.

    Args:
        cluster_dict: Mapping from numeric cluster id to a list of words.
        num_samples: Maximum number of words shown per cluster. Clusters with
            at most this many words are printed in full and in stored order;
            larger clusters are sampled randomly without replacement.
        seed: Optional seed that makes the random samples reproducible. With
            the default ``None`` the module-level ``random`` state is used,
            exactly as before this parameter existed.

    Returns:
        None. Everything is written to standard output.
    """
    # A private generator keeps a seeded call from disturbing the global
    # ``random`` state that other cells may rely on.
    sampler = random if seed is None else random.Random(seed)

    print("Cluster Analysis Results:")
    print("=" * 50)

    for cluster_id in sorted(cluster_dict.keys()):
        words = cluster_dict[cluster_id]
        samples = words if len(words) <= num_samples else sampler.sample(words, num_samples)

        print(f"\nCluster {cluster_id + 1} (Total words: {len(words)}):")
        print("-" * 50)
        for i, word in enumerate(samples, 1):
            # Break the row after every fifth word, otherwise separate with a bar.
            print(f"{i}. {word.replace('_', ' ')}", end="\n" if i % 5 == 0 else " | ")
        print("\n")

In [10]:
# Cluster the whole vocabulary into 6 groups with k-means and show up to 20
# words per cluster.
# NOTE(review): clusters larger than num_samples are sampled with
# random.sample and no seed is set in the visible cells, so the printed words
# will differ between runs.
num_clusters = 6
cluster_dict = perform_clustering_kmeans(model, num_clusters)
print_cluster_samples(cluster_dict, num_samples=20)

(23525, 100)
Cluster Analysis Results:

Cluster 1 (Total words: 6388):
--------------------------------------------------
1. Inverse | 2. FM8501: A Verified | 3. triangle mesh | 4. Realisierung von | 5. distributed database
6. decided to start | 7. POCO | 8. type inheritance | 9. Reduction and | 10. Solar
11. subdivision technique. The | 12. are mostly concerned with | 13. Planen | 14. , or transmit classified | 15. shortest paths
16. Automatic Text | 17. under severe changes of | 18. systems in | 19. of DebitCredit and the | 20. lightweight



Cluster 2 (Total words: 4210):
--------------------------------------------------
1. the seats'''' means the seats of Nadia'' | 2. Constraint Services: | 3. on the authors' | 4. change | 5. push
6. MRF | 7. Inconsistent | 8. divided into | 9. curve in 3D space. | 10. career
11. On the Convergence of Analysis and | 12. . The 3D | 13. medial axis | 14. Systeme | 15. distance transform
16. . The specification | 17. C | 18. programmer | 19. Topology

In [11]:
# Repeat the clustering with a Gaussian mixture, using the same number of
# groups, and show up to 20 words per cluster.
# This rebinds num_clusters and cluster_dict, so the k-means result from the
# earlier cell is no longer reachable under that name after this cell runs.
num_clusters = 6
cluster_dict = perform_clustering_gmm(model, num_clusters)
print_cluster_samples(cluster_dict, num_samples=20)

Cluster Analysis Results:

Cluster 1 (Total words: 6369):
--------------------------------------------------
1. didn't need them. What | 2. Outlier | 3. B+-Tree | 4. of Reasoning with | 5. and is meant to be read from beginning to end. It explains how to specify the class of properties known as
6. also refers | 7. temporally | 8. left unsampled. Both the initial set of | 9. of Proofsystems for | 10. to a limited number of people is now instantly retrievable anywhere in the world by anyone with a computer and an
11. images. This | 12. Elimination | 13. Seabottom Surveys. | 14. body | 15. sixth
16. set by certain simple operations. Interconnections between simplicity of computations and | 17. and to compensate for the | 18. differ significantly | 19. Phase Unwrapping | 20. fields and



Cluster 2 (Total words: 3332):
--------------------------------------------------
1. Natural images | 2. approach produces a consistent | 3. Structural Mechanics | 4. on cost and benefit. The segment clea