In [1]:
import numpy as np
import random
import os
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from collections import defaultdict
os.environ["OMP_NUM_THREADS"] = "1"

In [2]:
def process_text_segment(text):
    """Collapse a raw text span into one underscore-joined token.

    Surrounding whitespace is trimmed and every remaining space character
    is replaced by an underscore.

    Args:
        text: The raw text span.

    Returns:
        The normalized token, or None when the span holds nothing but
        whitespace (or is empty).
    """
    trimmed = text.strip()
    if not trimmed:
        return None
    return trimmed.replace(" ", "_")

In [3]:
def parse_segmented_corpus(file_path):
    """Read a phrase-segmented corpus file and turn each line into tokens.

    Every non-blank line is scanned left to right for spans of the form
    ``<phrase_Q=...>phrase text</phrase>``. The text of each tagged phrase
    becomes one token, and each stretch of text between (or around) tagged
    phrases becomes one token as well. All tokens are normalized with
    ``process_text_segment`` (trimmed, spaces replaced by underscores).

    If a phrase tag is malformed (its header is never closed with ``>`` or
    there is no ``</phrase>`` after it), everything from the start of that
    tag to the end of the line is kept as a single trailing token.

    Args:
        file_path: Path to the UTF-8 encoded segmentation file.

    Returns:
        A list with one token list per line that produced at least one token.
    """
    phrase_open = "<phrase_Q="
    phrase_close = "</phrase>"

    processed_sentences = []

    with open(file_path, "r", encoding="utf-8") as f:
        # Iterate the file object directly so the whole corpus is never
        # held in memory as a list of raw lines.
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            tokens = []
            current_pos = 0
            line_length = len(line)

            while current_pos < line_length:
                next_phrase_start = line.find(phrase_open, current_pos)

                if next_phrase_start == -1:
                    # No more tagged phrases: the rest of the line is one token.
                    remaining_token = process_text_segment(line[current_pos:])
                    if remaining_token:
                        tokens.append(remaining_token)
                    break

                if next_phrase_start > current_pos:
                    pre_phrase_token = process_text_segment(
                        line[current_pos:next_phrase_start]
                    )
                    if pre_phrase_token:
                        tokens.append(pre_phrase_token)

                phrase_header_end = line.find(">", next_phrase_start)
                if phrase_header_end == -1:
                    phrase_end = -1
                else:
                    phrase_end = line.find(phrase_close, phrase_header_end)

                if phrase_end == -1:
                    # Malformed tag. Slice from the tag itself, not from
                    # current_pos: the text before the tag was already
                    # appended above and must not be emitted a second time.
                    remaining_token = process_text_segment(line[next_phrase_start:])
                    if remaining_token:
                        tokens.append(remaining_token)
                    break

                # Normalize the phrase like any other span so padded or
                # whitespace-only phrases do not yield "_foo_" / "_" tokens.
                phrase_token = process_text_segment(
                    line[phrase_header_end + 1:phrase_end]
                )
                if phrase_token:
                    tokens.append(phrase_token)

                current_pos = phrase_end + len(phrase_close)

            if tokens:
                processed_sentences.append(tokens)

    return processed_sentences

In [4]:
def train_word2vec(sentences, vector_size=100, window=5, min_count=1,
                   workers=4, seed=1):
    """Train a gensim Word2Vec model on tokenized sentences.

    Args:
        sentences: Iterable of token lists, passed to ``Word2Vec`` as-is.
        vector_size: Dimensionality of the learned vectors.
        window: Context window size passed to ``Word2Vec``.
        min_count: Minimum token frequency passed to ``Word2Vec``.
        workers: Number of training threads (previously hard-coded to 4).
        seed: Seed forwarded to ``Word2Vec``. The default of 1 is gensim's
            own default, so existing calls behave as before.

    Returns:
        The trained ``Word2Vec`` model.

    Note:
        gensim documents that a fully reproducible run additionally needs
        ``workers=1`` (and a fixed ``PYTHONHASHSEED``); with several worker
        threads the vectors can differ between runs even with a fixed seed.
    """
    model = Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=seed,
    )
    return model

In [5]:
# Parse the segmented DBLP corpus and preview the first few token lists.
file_path = "./AutoPhrase/models/DBLP/segmentation.txt"
sentences = parse_segmented_corpus(file_path)

sentence_total = len(sentences)
print("Total number of sentences:", sentence_total)

# zip with range(3) stops after three sentences (or earlier on a short corpus).
for i, sentence in zip(range(3), sentences):
    print(f"Sentence {i + 1}: {sentence}")

Total number of sentences: 5000
Sentence 1: ['OQL', '[', 'C++', ']:', 'Extending', 'C++', 'with_an_Object_Query_Capability.']
Sentence 2: ['Transaction_Management', 'in', 'Multidatabase_Systems', '.']
Sentence 3: ['Overview', 'of_the_ADDS_System.']


In [6]:
# Train the embedding model and inspect the vectors of a couple of tokens.
model = train_word2vec(sentences)
vocabulary = model.wv.index_to_key
print(f"Total vocabulary size: {len(model.wv.index_to_key)}")

print("Example vectors:")
# First two tokens of the first sentence (nothing is shown for an empty corpus).
example_tokens = [
    token
    for first_sentence in sentences[:1]
    for token in first_sentence[:2]
]
for token in example_tokens:
    print(f"\nToken: {token}")
    print(f"Vector shape: {model.wv[token].shape}")
    print(f"First 5 dimensions: {model.wv[token][:5]}")

Total vocabulary size: 23525
Example vectors:

Token: OQL
Vector shape: (100,)
First 5 dimensions: [-0.00329331  0.00341695 -0.00770479  0.00470063  0.00434671]

Token: [
Vector shape: (100,)
First 5 dimensions: [ 0.0034336  -0.0016726   0.00129306 -0.00726168  0.00917324]


In [7]:
def perform_clustering_kmeans(model, num_clusters=5):
    """Cluster every vocabulary word of ``model`` with K-Means.

    The vector of each word in ``model.wv.index_to_key`` is stacked into a
    matrix (its shape is printed), K-Means with ``random_state=42`` assigns
    each row a label, and the words are grouped by label.

    Args:
        model: Object exposing ``wv.index_to_key`` and ``wv[word]`` lookups.
        num_clusters: Number of K-Means clusters.

    Returns:
        A ``defaultdict(list)`` mapping cluster label -> words, with words
        kept in vocabulary order.
    """
    vocabulary = list(model.wv.index_to_key)
    X = np.array([model.wv[entry] for entry in vocabulary])
    print(X.shape)

    labels = KMeans(n_clusters=num_clusters, random_state=42).fit_predict(X)

    cluster_dict = defaultdict(list)
    for label, entry in zip(labels, vocabulary):
        cluster_dict[label].append(entry)
    return cluster_dict

In [8]:
def perform_clustering_gmm(model, num_clusters=6):
    """Cluster every vocabulary word of ``model`` with a Gaussian mixture.

    The vector of each word in ``model.wv.index_to_key`` is stacked into a
    matrix, a ``GaussianMixture`` with ``random_state=42`` assigns each row
    a component label, and the words are grouped by label.

    Args:
        model: Object exposing ``wv.index_to_key`` and ``wv[word]`` lookups.
        num_clusters: Number of mixture components.

    Returns:
        A ``defaultdict(list)`` mapping component label -> words, with words
        kept in vocabulary order.
    """
    vocabulary = list(model.wv.index_to_key)
    X = np.array([model.wv[entry] for entry in vocabulary])

    mixture = GaussianMixture(n_components=num_clusters, random_state=42)
    labels = mixture.fit_predict(X)

    cluster_dict = defaultdict(list)
    for label, entry in zip(labels, vocabulary):
        cluster_dict[label].append(entry)
    return cluster_dict

In [9]:
def print_cluster_samples(cluster_dict, num_samples=20, seed=None):
    """Print a sample of words from every cluster, five per row.

    Clusters are listed in ascending id order and shown as ``id + 1``.
    Underscores in words are printed as spaces. Items within a row are
    separated by `` | ``; there is no separator after the last item of a row.

    Args:
        cluster_dict: Mapping of cluster id -> list of word tokens.
        num_samples: Maximum number of words shown per cluster. Larger
            clusters are randomly sub-sampled without replacement.
        seed: Optional seed for the sub-sampling. With ``None`` (default)
            the module-level ``random`` generator is used, so any global
            ``random.seed(...)`` set by the caller still applies.
    """
    words_per_row = 5
    rng = random if seed is None else random.Random(seed)

    print("Cluster Analysis Results:")
    print("=" * 50)

    for cluster_id in sorted(cluster_dict.keys()):
        words = cluster_dict[cluster_id]
        samples = words if len(words) <= num_samples else rng.sample(words, num_samples)

        print(f"\nCluster {cluster_id + 1} (Total words: {len(words)}):")
        print("-" * 50)

        entries = [
            f"{i}. {word.replace('_', ' ')}" for i, word in enumerate(samples, 1)
        ]
        # Slice in steps of words_per_row so every row (including the first)
        # holds exactly five items, and join so no separator dangles at the end.
        for row_start in range(0, len(entries), words_per_row):
            print(" | ".join(entries[row_start:row_start + words_per_row]))
        print()

In [10]:
# K-Means over the full vocabulary, then print up to 20 sample words per cluster.
num_clusters = 6
cluster_dict = perform_clustering_kmeans(model, num_clusters=num_clusters)
print_cluster_samples(cluster_dict, 20)

(23525, 100)
Cluster Analysis Results:

Cluster 1 (Total words: 6573):
--------------------------------------------------
1. point bases, these images are | 2. and Development of | 3. development is known as | 4. Networks of Workstations
5. image of the object recorded by a | 6. radially symmetric | 7. for all recorded images. These | 8. as the problem of | 9. By
10. trust | 11. which are directly used, or are the basis, for | 12. where he studied as a | 13. Scene Analysis | 14. Systems of Reductions
15. Scalable | 16. Resolution Theorem Proving | 17. Jasmine | 18. fashion | 19. approaches the desired
20. that applies for any isovalue. As | 


Cluster 2 (Total words: 6459):
--------------------------------------------------
1. The TSQL2 | 2. Legislation | 3. unambiguously and dynamically. The | 4. Critical Realism
5. , Sequencing and Transformations. | 6. . Lippman dispells the misinformation and | 7. , the system achieves a | 8. Ethical | 9. specific parameters
10. Ensuring | 11. prop

In [11]:
# Gaussian-mixture clustering of the same vocabulary, with the same sample size.
# Note: this rebinds cluster_dict, replacing the K-Means result from the cell above.
num_clusters = 6
cluster_dict = perform_clustering_gmm(model, num_clusters=num_clusters)
print_cluster_samples(cluster_dict, 20)

Cluster Analysis Results:

Cluster 1 (Total words: 6671):
--------------------------------------------------
1. is measured and | 2. Rigid Motion | 3. tilings | 4. , compared with
5. Colon | 6. Users Access to | 7. problem should be solved between the appearance and | 8. Objects with | 9. , two
10. x-y | 11. a conclusion and an | 12. the ýidealý generatrix derived from the | 13. collected over approximately 8 | 14. and intensity
15. closes with a list of some remaining | 16. -Shaped | 17. Situationsmodellierung in der Bildfolgeauswertung | 18. combination image", which relies on those edges of the | 19. Interactivity
20. " in the | 


Cluster 2 (Total words: 6823):
--------------------------------------------------
1. relationale Datenbanken | 2. images measured with | 3. Method: Use of Motion in | 4. Preparedness
5. Spatiotemporal | 6. even after the shape had suffered various modifications. Two | 7. Supporting Tools | 8. Invariant | 9. preserves the boundaries while inner regions are