In [None]:
# Must be run in a local environment that has a GPU.

import numpy as np
import pandas as pd
import umap.umap_ as umap
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.cluster import KMeans

In [None]:
# Read the JSON Lines file (one JSON record per line) into a DataFrame.
input_name = "stackoverflow-mysql.jsonl"
input_df = pd.read_json(path_or_buf=input_name, lines=True)

In [None]:
# Show the available column names, then the Python type of the first value
# in each column that is later merged into the Text column.
print(input_df.columns)
for column in ("Title", "Body", "Tags", "Answers"):
    print(f"{column}: {type(input_df[column][0])}")

In [None]:
# Merge Title, Body, Tags and Answers into a single Text column.
# Each entry of Answers is iterated and the "Body" field of every answer is
# joined with single spaces into one string per row.
answers_text = input_df['Answers'].apply(
    lambda answers: " ".join(answer["Body"] for answer in answers)
)
input_df['Text'] = (
    "Title: " + input_df['Title']
    + " Body: " + input_df['Body']
    + " Tags: " + input_df['Tags']
    + " Answers: " + answers_text
)

In [None]:
# Load the pretrained sentence-embedding model (identified by its Hugging Face id).
from sentence_transformers import SentenceTransformer

model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_name_or_path=model_name)

In [None]:
# Encode the Text column into embedding vectors.
# encode() is documented to take a string or a list of strings. A pandas Series
# only works while its index happens to be the default 0..n-1, because encode()
# looks sentences up by position after sorting them by length. Passing a plain
# list removes that hidden dependency on the DataFrame index.
Embeddings = model.encode(input_df['Text'].tolist())
print(Embeddings.shape)

In [None]:
# Reduce the embedding vectors to 2 dimensions with UMAP.
# UMAP is stochastic: without a fixed random_state every run yields a different
# layout, so anything derived from the saved coordinates (cluster counts, plots)
# could not be reproduced. Fixing the seed trades some speed for repeatability.
umap_model = umap.UMAP(n_components=2, random_state=42)
reduced_Embeddings = umap_model.fit_transform(Embeddings)

In [None]:
# Persist the dimension-reduced embeddings to disk so later cells can reload them.
np.save(file="data.npy", arr=reduced_Embeddings)

In [None]:
# Draw a 4x4 grid of scatter plots; each panel shows an independent random
# sample of rows from the saved array, plotted as column 0 against column 1.
point_nums = 2048
data = np.load("data.npy")
# Sampling without replacement cannot draw more rows than exist, so clamp the
# sample size; otherwise np.random.choice raises ValueError on small datasets.
sample_size = min(point_nums, data.shape[0])
fig, axs = plt.subplots(4, 4, figsize=(15, 12))
fig.suptitle(f'Plot Scatter Graphs (Random {sample_size} Points)', fontsize=16)

# axs.flat walks the grid row by row, so the running number matches i * 4 + j + 1.
for plot_no, ax in enumerate(axs.flat, start=1):
    indices = np.random.choice(data.shape[0], sample_size, replace=False)
    ax.scatter(data[indices, 0], data[indices, 1], s=5)
    ax.set_title(f'Plot {plot_no}')

# Leave a thin strip at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.99])
plt.show()

In [None]:
# Compute the data for the elbow plot: fit KMeans for every candidate k and
# record the resulting SSE (the model's inertia_).
SSE = []
data = np.load("data.npy")
k_range = range(2, 60)

for k in tqdm(k_range, desc="KMeans"):
    # k-means++ initialisation is randomised; a fixed random_state makes the
    # SSE curve (and therefore the elbow read off it) repeatable across runs.
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeans.fit(data)
    SSE.append(kmeans.inertia_)

In [None]:
# Elbow plot: SSE against the number of clusters, with the chosen k marked.
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 8))
elbow_ax.plot(k_range, SSE, marker="o")
elbow_ax.set_xlabel("Number of clusters (k)")
elbow_ax.set_ylabel("SSE")
elbow_ax.set_title("Elbow Method")
# Dashed vertical marker at k = 16. ymax is a fraction of the axes height,
# so the line only spans the bottom 13% of the plot.
elbow_ax.axvline(x=16, ymax=0.13, color="orange", linestyle="--", label="Optimal k = 16")
elbow_ax.legend()
elbow_fig.tight_layout()
plt.show()