In [1]:
import os

# Set the thread cap BEFORE numpy / scikit-learn are imported: the OpenMP and
# BLAS runtimes they load generally read OMP_NUM_THREADS when they initialise,
# so assigning it after those imports may have no effect.
os.environ["OMP_NUM_THREADS"] = "1"

import random
from collections import defaultdict

import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

In [2]:
def process_text_segment(text):
    """Normalise a raw text fragment into a single token.

    Leading and trailing whitespace is removed and every remaining space
    character is replaced by an underscore (other whitespace, such as tabs,
    is left untouched).

    Args:
        text: The fragment to normalise.

    Returns:
        The normalised token, or None when the fragment is empty or
        consists only of whitespace.
    """
    stripped = text.strip()
    if not stripped:
        return None
    return stripped.replace(" ", "_")

In [3]:
def parse_segmented_corpus(file_path):
    """Parse a phrase-tagged segmentation file into tokenised sentences.

    Every non-blank line of the file becomes one sentence (a list of tokens).
    Within a line:

    * text wrapped as ``<phrase_Q=...>some phrase</phrase>`` becomes one token;
    * each run of text between / around those tags also becomes ONE token
      (it is not split into individual words);
    * all tokens are normalised by ``process_text_segment`` (trimmed, spaces
      replaced by underscores), and empty results are dropped.

    If a ``<phrase_Q=`` tag is malformed (no closing ``>`` or no
    ``</phrase>``), the text before the tag is kept as a normal token and the
    rest of the line, starting at the tag, is kept as one further token.

    Args:
        file_path: Path of the UTF-8 encoded segmentation file.

    Returns:
        A list of sentences, each a non-empty list of token strings. Lines
        that yield no tokens are omitted.
    """
    open_marker = "<phrase_Q="
    close_marker = "</phrase>"

    processed_sentences = []

    with open(file_path, "r", encoding="utf-8") as f:
        # Iterate lazily instead of materialising every line with readlines().
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            tokens = []
            current_pos = 0
            line_length = len(line)

            while current_pos < line_length:
                tag_start = line.find(open_marker, current_pos)

                if tag_start == -1:
                    # No more phrase tags: the remainder is plain text.
                    trailing_token = process_text_segment(line[current_pos:])
                    if trailing_token:
                        tokens.append(trailing_token)
                    break

                if tag_start > current_pos:
                    # Plain text sitting in front of the next phrase tag.
                    leading_token = process_text_segment(line[current_pos:tag_start])
                    if leading_token:
                        tokens.append(leading_token)

                header_end = line.find(">", tag_start)
                phrase_end = -1 if header_end == -1 else line.find(close_marker, header_end)

                if phrase_end == -1:
                    # Malformed tag. Slice from the tag itself, not from
                    # current_pos: the text before the tag was already emitted
                    # above and must not be duplicated.
                    malformed_token = process_text_segment(line[tag_start:])
                    if malformed_token:
                        tokens.append(malformed_token)
                    break

                # Normalise the phrase the same way as plain text so padded
                # phrases do not become "_foo_" and blank phrases are dropped.
                phrase_token = process_text_segment(line[header_end + 1:phrase_end])
                if phrase_token:
                    tokens.append(phrase_token)

                current_pos = phrase_end + len(close_marker)

            if tokens:
                processed_sentences.append(tokens)

    return processed_sentences

In [4]:
def train_word2vec(sentences, vector_size=100, window=5, min_count=1, workers=4, seed=1):
    """Train a gensim Word2Vec model on tokenised sentences.

    Args:
        sentences: Iterable of token lists used as the training corpus.
        vector_size: Dimensionality of the learned vectors.
        window: Context window size passed to Word2Vec.
        min_count: Minimum token frequency passed to Word2Vec.
        workers: Number of training threads. The default of 4 matches the
            value that used to be hard-coded here. Per the gensim
            documentation, a fully reproducible run requires ``workers=1``
            (and a fixed PYTHONHASHSEED), because multi-threaded training
            introduces ordering jitter.
        seed: Seed for the model's random number generator. The default of 1
            is gensim's documented default, so omitting it behaves as before.

    Returns:
        The trained Word2Vec model.
    """
    model = Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=seed,
    )
    return model

In [5]:
# Location of the phrase-tagged segmentation file to load.
file_path = "./AutoPhrase/models/DBLP/segmentation.txt"

# Parse the file into a list of token lists (one per non-blank line).
sentences = parse_segmented_corpus(file_path)
print("Total number of sentences:", len(sentences))

# Show the first three parsed sentences as a quick sanity check.
for i, sentence in enumerate(sentences[:3]):
    print(f"Sentence {i + 1}: {sentence}")

Total number of sentences: 5000
Sentence 1: ['OQL', '[', 'C++', ']:', 'Extending', 'C++', 'with_an_Object_Query_Capability.']
Sentence 2: ['Transaction_Management', 'in', 'Multidatabase_Systems', '.']
Sentence 3: ['Overview', 'of_the_ADDS_System.']


In [6]:
# Train the embedding model on the parsed sentences with the function's
# default hyperparameters, then report how many distinct tokens it learned.
model = train_word2vec(sentences)
print(f"Total vocabulary size: {len(model.wv.index_to_key)}")

# Inspect the learned vectors for the first two tokens of the first sentence.
print("Example vectors:")
for sentence in sentences[:1]:
    for token in sentence[:2]:
        print(f"\nToken: {token}")
        print(f"Vector shape: {model.wv[token].shape}")
        print(f"First 5 dimensions: {model.wv[token][:5]}")

Total vocabulary size: 23525
Example vectors:

Token: OQL
Vector shape: (100,)
First 5 dimensions: [-0.00332852  0.00350538 -0.00760019  0.0047511   0.00425787]

Token: [
Vector shape: (100,)
First 5 dimensions: [ 4.4157873e-03 -1.9596594e-03  9.1175600e-05 -8.6908219e-03
  8.6680157e-03]


In [7]:
def perform_clustering_kmeans(model, num_clusters=5):
    """Cluster the model's vocabulary with KMeans.

    The vector of every vocabulary entry (in ``model.wv.index_to_key`` order)
    is stacked into one matrix, whose shape is printed, and KMeans is fitted
    on it with ``random_state=42``.

    Args:
        model: Trained embedding model exposing ``wv.index_to_key`` and
            ``wv[word]``.
        num_clusters: Number of KMeans clusters.

    Returns:
        A ``defaultdict(list)`` mapping each cluster label to the list of
        vocabulary entries assigned to it.
    """
    vocabulary = list(model.wv.index_to_key)
    X = np.array([model.wv[term] for term in vocabulary])
    print(X.shape)

    labels = KMeans(n_clusters=num_clusters, random_state=42).fit_predict(X)

    cluster_dict = defaultdict(list)
    for term, label in zip(vocabulary, labels):
        cluster_dict[label].append(term)

    return cluster_dict

In [8]:
def perform_clustering_gmm(model, num_clusters=6):
    """Cluster the model's vocabulary with a Gaussian mixture model.

    The vector of every vocabulary entry (in ``model.wv.index_to_key`` order)
    is stacked into one matrix and a GaussianMixture with
    ``random_state=42`` is fitted on it.

    Args:
        model: Trained embedding model exposing ``wv.index_to_key`` and
            ``wv[word]``.
        num_clusters: Number of mixture components.

    Returns:
        A ``defaultdict(list)`` mapping each component label to the list of
        vocabulary entries assigned to it.
    """
    vocabulary = [entry for entry in model.wv.index_to_key]
    embeddings = [model.wv[entry] for entry in vocabulary]
    X = np.array(embeddings)

    mixture = GaussianMixture(n_components=num_clusters, random_state=42)
    assignments = mixture.fit_predict(X)

    cluster_dict = defaultdict(list)
    for entry, component in zip(vocabulary, assignments):
        cluster_dict[component].append(entry)

    return cluster_dict

In [9]:
def print_cluster_samples(cluster_dict, num_samples=20, seed=None):
    """Print a sample of the words in every cluster.

    Clusters are printed in ascending key order and labelled with
    ``key + 1``. Underscores in the words are shown as spaces, five entries
    per output line separated by " | ".

    Args:
        cluster_dict: Mapping of cluster key to the list of words in it.
        num_samples: Maximum number of words printed per cluster. Clusters
            with at most this many words are printed in full, in their
            stored order; larger clusters are randomly sampled.
        seed: Optional seed for the sampling. With the default ``None`` the
            module-level ``random`` generator is used, exactly as before.
            With a value, a private generator is used so the printed samples
            are reproducible and the global ``random`` state is not touched.
    """
    rng = random if seed is None else random.Random(seed)

    print("Cluster Analysis Results:")
    print("=" * 50)

    for cluster_id in sorted(cluster_dict.keys()):
        words = cluster_dict[cluster_id]
        samples = words if len(words) <= num_samples else rng.sample(words, num_samples)

        print(f"\nCluster {cluster_id + 1} (Total words: {len(words)}):")
        print("-" * 50)
        for i, word in enumerate(samples, 1):
            # Break the line after every fifth entry.
            print(f"{i}. {word.replace('_', ' ')}", end="\n" if i % 5 == 0 else " | ")
        print("\n")

#### Cluster the vocabulary with KMeans and print 20 randomly sampled phrases from each cluster.

In [10]:
num_clusters = 6
cluster_dict = perform_clustering_kmeans(model, num_clusters)

# print_cluster_samples draws its samples from the global `random` generator;
# seed it so that, for a given clustering, the same phrases are printed on
# every run.
random.seed(42)
print_cluster_samples(cluster_dict, num_samples=20)

(23525, 100)
Cluster Analysis Results:

Cluster 1 (Total words: 6498):
--------------------------------------------------
1. . The method is based on manual | 2. they save space as well as time and also facilitate operations such as search. Examples are given of the use of these | 3. leafs | 4. Scans as a | 5. operators and
6. protocols have quietly vanished,, and the | 7. Conventional | 8. and the projector are moved at the same time. | 9. few years have seen a | 10. Novel
11. structure of the | 12. among designers and planners, no one may have all of the | 13. actively studied | 14. constraints for which we use a | 15. Attention System for
16. (but only to instructors). The file with the figures and the | 17. the Understandability of | 18. RTSP | 19. queries gives not only an | 20. Separierung und



Cluster 2 (Total words: 3814):
--------------------------------------------------
1. to fit the bore | 2. approximation algorithm | 3. partitions | 4. collision avoidance | 5. System Dev

#### Cluster the vocabulary with GaussianMixture and print 20 randomly sampled phrases from each cluster.

In [11]:
num_clusters = 6
cluster_dict = perform_clustering_gmm(model, num_clusters)

# print_cluster_samples draws its samples from the global `random` generator;
# seed it so that, for a given clustering, the same phrases are printed on
# every run.
random.seed(42)
print_cluster_samples(cluster_dict, num_samples=20)

Cluster Analysis Results:

Cluster 1 (Total words: 6483):
--------------------------------------------------
1. retrieving information | 2. Contours and Surfaces. | 3. data acquisition | 4. process is to perform a | 5. approach is an appealing way to depict people in a
6. für Lernbehinderte und Hochbegabte. | 7. partition | 8. of 2D images and | 9. Protocols, | 10. Object Identity
11. Book | 12. . Also, it can be composed of combinations of point and | 13. performance study | 14. user input | 15. mehrschichtige
16. presents a system for | 17. -use 3D | 18. , Beschreibungs- und Ausführungsmodell | 19. Directory | 20. chosen for our



Cluster 2 (Total words: 3576):
--------------------------------------------------
1. Relational Database | 2. also grateful for the detailed comments provided by the other | 3. Formalization | 4. Computer Graphics | 5. The Baseline
6. Topography | 7. degree | 8. procedural | 9. Cosine | 10. Communications.
11. and look at several properties of | 12. intens

We observe that Clusters 3 and 5 contain nearly identical phrases under both clustering methods. Many phrases in these two clusters consist of only a single word or are associated with prepositions or punctuation. The other four clusters differ somewhat between KMeans and GMM; however, each KMeans cluster can still be matched to one specific GMM cluster.