In [1]:
import pandas as pd
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv, find_dotenv
import numpy as np
import time
from tqdm import tqdm

In [2]:
dataset_path = os.path.join(Path.cwd().parent, 'data', 'stackexchange_dataset.csv')
data = pd.read_csv(dataset_path,index_col='question_id')

In [3]:
data = data[~data.index.duplicated()]

In [6]:
data.head(4)

Unnamed: 0_level_0,title,has_accepted_answer,accepted_answer_score,time_to_accepted_answer_hours,question_score,question_text,num_tags,tags,accepted_answer_id,accepted_answer_length_chars,accepted_answer_length_tokens
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
79802517,Looking for a better way using &quot;.Include&...,False,,,2,I am looking for a better way to use the .Incl...,2,"['c#', 'entity-framework']",,,
79802934,NTP is moving my clock further from the correc...,False,,,0,Windows 11 Pro 10.0.26200 Build 26200. Dell XP...,1,['ntp'],,,
79802909,Execution of pandas&#39; info in python,False,,,0,I am new to pandas library in python. When I l...,2,"['python', 'pandas']",,,
79802932,How to debug MongoDB recurring error 314 Objec...,False,,,0,On a fresh Mongo DB 8.0 mono instance (no clus...,4,"['database', 'mongodb', 'nosql', 'system-admin...",,,


In [4]:
data_for_embedding = data[['title','question_text']]

In [5]:
data_for_embedding

Unnamed: 0_level_0,title,question_text
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1
79802517,Looking for a better way using &quot;.Include&...,I am looking for a better way to use the .Incl...
79802934,NTP is moving my clock further from the correc...,Windows 11 Pro 10.0.26200 Build 26200. Dell XP...
79802909,Execution of pandas&#39; info in python,I am new to pandas library in python. When I l...
79802932,How to debug MongoDB recurring error 314 Objec...,On a fresh Mongo DB 8.0 mono instance (no clus...
79802919,How to integrate QML UI into a custom Vulkan r...,I'm developing a custom Vulkan renderer and wa...
...,...,...
79662126,Is there a built-in identity function in JavaS...,Is there a function in JavaScript that accepts...
79662912,How can I improve the accuracy of my Transform...,I'm training a Transformer-based text classifi...
79663172,How to get a clickable PDF in HTML,To have a picture (eg. in jpg format) on a web...
79659655,Unable to start debugging. Unexpected GDB outp...,"On a pristine new Windows 11, installed Visual..."


---

In [58]:
import ollama
BATCH_SIZE = 100
model = 'qwen3-embedding'

In [59]:
texts_to_embed = data_for_embedding['title'].tolist() + data_for_embedding['question_text'].tolist()
print(f"Total texts to embed: {len(texts_to_embed)}")

Total texts to embed: 199984


In [60]:
all_embeddings = []

In [62]:
for i in tqdm(range(0, len(texts_to_embed), BATCH_SIZE)):
    batch_texts = texts_to_embed[i:i + BATCH_SIZE]
    result = ollama.embed(
            model=model,
            input=batch_texts
        )
    batch_embeddings = [np.array(e) for e in result['embeddings']]
    all_embeddings.extend(batch_embeddings)

100%|██████████| 2000/2000 [6:06:48<00:00, 11.00s/it]  


In [63]:
num_rows = len(data_for_embedding)

In [64]:
title_embeddings = all_embeddings[0:num_rows]
question_text_embeddings = all_embeddings[num_rows:]

In [65]:
df_embeddings = pd.DataFrame(
    {
        'title_embedding': title_embeddings,
        'question_text_embedding': question_text_embeddings
    },
    index=data_for_embedding.index
)

In [66]:
df_embeddings.head(4)

Unnamed: 0_level_0,title_embedding,question_text_embedding
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1
79802517,"[0.008553513, -0.009437113, 0.009673767, -0.02...","[0.0013518566, -0.015674047, -0.004076924, -0...."
79802934,"[0.019479005, 0.007850029, -0.020600174, -0.02...","[0.009473968, -0.014583107, -0.019139914, 0.00..."
79802909,"[-0.01977205, 0.0042978777, -0.03637585, -0.01...","[-0.0019663926, -0.024625326, -0.027527379, -0..."
79802932,"[0.020906445, -0.010505062, 0.0033248097, -0.0...","[0.0403566, -0.016648613, 0.0031766212, -0.001..."


In [68]:
df_embeddings.to_pickle(os.path.join(Path.cwd().parent, 'data', 'stackexchange_embeddings.pkl'))

In [77]:
df_embeddings.head(4)

Unnamed: 0_level_0,title_embedding,question_text_embedding
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1
79802517,"[0.008553513, -0.009437113, 0.009673767, -0.02...","[0.0013518566, -0.015674047, -0.004076924, -0...."
79802934,"[0.019479005, 0.007850029, -0.020600174, -0.02...","[0.009473968, -0.014583107, -0.019139914, 0.00..."
79802909,"[-0.01977205, 0.0042978777, -0.03637585, -0.01...","[-0.0019663926, -0.024625326, -0.027527379, -0..."
79802932,"[0.020906445, -0.010505062, 0.0033248097, -0.0...","[0.0403566, -0.016648613, 0.0031766212, -0.001..."


In [116]:
df_embeddings.iloc[0]['title_embedding'].dtype

dtype('float64')

In [87]:
df_embeddings.to_parquet(os.path.join(Path.cwd().parent, 'data', 'stackexchange_embeddings.parquet'), engine='fastparquet', object_encoding='json')

ValueError: Error converting column "title_embedding" to bytes using encoding JSON. Original error: Object of type ndarray is not JSON serializable

In [None]:
df_embeddings.to_csv(os.path.join(Path.cwd().parent, 'data', 'stackexchange_embeddings.csv'))

In [88]:
title_embeddings_array = np.stack(df_embeddings['title_embedding'].values)
question_embeddings_array = np.stack(df_embeddings['question_text_embedding'].values)

In [89]:
title_embeddings_array.shape, question_embeddings_array.shape

((99992, 4096), (99992, 4096))

In [90]:
output_dir = os.path.join(Path.cwd().parent, 'data')
title_path = os.path.join(output_dir, 'title_embeddings.npy')
question_path = os.path.join(output_dir, 'question_embeddings.npy')
np.save(title_path, title_embeddings_array)
np.save(question_path, question_embeddings_array)

In [103]:
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score
from sklearn.decomposition import PCA

In [104]:
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(title_embeddings_array)

In [None]:
K = 2000
kmeans = KMeans(n_clusters=K, random_state=42, n_init='auto')
cluster_labels = kmeans.fit_predict(X_pca)

In [106]:
ch_score = calinski_harabasz_score(X_pca, cluster_labels)

print(f"Calinski-Harabasz Index (K={K}): {ch_score:.2f}")

Calinski-Harabasz Index (K=2000): 86.23


In [107]:
K = 10
kmeans = KMeans(n_clusters=K, random_state=42, n_init='auto')
cluster_labels = kmeans.fit_predict(X_pca)
ch_score = calinski_harabasz_score(X_pca, cluster_labels)

print(f"Calinski-Harabasz Index (K={K}): {ch_score:.2f}")

Calinski-Harabasz Index (K=10): 2949.47


In [112]:
K = 2137
kmeans = KMeans(n_clusters=K, random_state=42, n_init='auto')
cluster_labels = kmeans.fit_predict(X_pca)
ch_score = calinski_harabasz_score(X_pca, cluster_labels)

print(f"Calinski-Harabasz Index (K={K}): {ch_score:.2f}")

Calinski-Harabasz Index (K=2137): 82.12


In [113]:
centered_data = title_embeddings_array - np.mean(title_embeddings_array, axis=0)

In [114]:
total_variance = np.trace(np.cov(centered_data, rowvar=False))

In [115]:
total_variance

np.float64(0.6058743982434113)