## Data loading

In [None]:
# Load a Parquet file
import pandas as pd

# Path to the Parquet file
# NOTE(review): "/User/path" looks like a placeholder — point it at the real
# Parquet file before running.
path = "/User/path"

# Read the Parquet file into `df`; the cells below read columns from this
# frame and add new ones to it.
df = pd.read_parquet(path)

## Data preprocessing

In [None]:
# Required libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import torch

# ------------------------------
# 1. Load the model (prefer GPU if available)
# ------------------------------
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    device=device,
)

# ------------------------------
# 2. Load the Species column from df
# ------------------------------
# `df` must already be loaded (see the data-loading cell).
# Missing values become "" and every entry is cast to str, so the encoder
# receives a plain list of strings.
texts = (
    df["Species"]
    .fillna("")
    .astype(str)
    .tolist()
)

# ------------------------------
# 3. Generate embeddings
# ------------------------------
encode_options = dict(
    batch_size=128,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,   # Optional: normalize embeddings
)
embeddings = model.encode(texts, **encode_options)

# ------------------------------
# 4. Add embeddings to df
# ------------------------------
# One Python list per row, stored in a single column
df["Species_emb"] = embeddings.tolist()

# Sanity check: first few embeddings, then the frame's shape
for preview in (df["Species_emb"].head(), df.shape):
    print(preview)

In [None]:
import numpy as np
import pandas as pd

group_cols = [
    "LLM_Scientific_Abstract_Morphology",
    "LLM_Scientific_Specific_Morphology",
    "LLM_Scientific_Conceptual_Morphology",
    "LLM_Scientific_People_Male",
    "LLM_Scientific_People_Female",
    "LLM_Scientific_Geography",
    "LLM_Scientific_Other",
]

# The two People columns are folded into one label further down
people_cols = [
    "LLM_Scientific_People_Male",
    "LLM_Scientific_People_Female",
]

# Normalise the indicator columns (missing -> 0, integer dtype). Repeating
# this on already-clean columns leaves them unchanged.
df[group_cols] = df[group_cols].fillna(0).astype(int)

# Number of labels set per row, and the rows carrying more than one label
row_sum = df[group_cols].sum(axis=1)
multi_mask = row_sum > 1

# Start from the leftmost column holding the row maximum, then apply the
# same overrides in the same order as before:
#   1. rows with no label at all      -> Other
#   2. multi-label rows               -> NaN (handled separately later)
#   3. single-label People_Male/Female -> People
base_group = df[group_cols].idxmax(axis=1)
base_group = base_group.mask(row_sum == 0, "LLM_Scientific_Other")
base_group = base_group.mask(multi_mask, np.nan)
base_group = base_group.mask(base_group.isin(people_cols), "LLM_Scientific_People")
df["Group"] = base_group

print("Base Group counts (before 9-category mapping):")
print(df["Group"].value_counts(dropna=False))

# =====================================================
# 1. Map single-label rows to categories 1–6
# =====================================================

# Only single-label rows (i.e., not multi-label) are mapped here
single_mask = ~multi_mask

# Base Group name -> numbered Group9 label
single_map = {
    "LLM_Scientific_Specific_Morphology":      "1.LLM_Scientific_Specific_Morphology",
    "LLM_Scientific_People":                   "2.LLM_Scientific_People",
    "LLM_Scientific_Abstract_Morphology":      "3.LLM_Scientific_Abstract_Morphology",
    "LLM_Scientific_Geography":                "4.LLM_Scientific_Geography",
    "LLM_Scientific_Other":                    "5.LLM_Scientific_Other",
    "LLM_Scientific_Conceptual_Morphology":    "6.LLM_Scientific_Conceptual_Morphology",
}

# Build the final label column (9 categories) in one pass instead of
# pre-filling a float NaN column and writing strings into it with .loc:
# pandas >= 2.1 flags that pattern as an incompatible-dtype assignment.
#   - .where(single_mask) blanks out every row that is not single-label
#   - .map(single_map) turns base Group names into labels 1–6
#   - .fillna(...) sends anything still unmapped to 5.Other; multi-label rows
#     also receive this placeholder until the 7/8/9 step overwrites them
df["Group9"] = (
    df["Group"]
    .where(single_mask)
    .map(single_map)
    .fillna("5.LLM_Scientific_Other")
)

# =====================================================
# 2. Assign multi-label rows into categories 7/8/9
# =====================================================

# Extract only multi-label rows (0/1 columns)
# Only the indicator columns are kept; .copy() makes df_multi independent of
# df, and it keeps df's index labels for the selected rows.
df_multi = df.loc[multi_mask, group_cols].copy()

# Label combinations that get a category of their own. Keys are the active
# column names, sorted alphabetically and joined with " + ".
_MULTI_PATTERN_LABELS = {
    "LLM_Scientific_Abstract_Morphology + LLM_Scientific_Specific_Morphology":
        "7.LLM_Scientific_Abstract_Morphology + LLM_Scientific_Specific_Morphology",
    "LLM_Scientific_Abstract_Morphology + LLM_Scientific_Conceptual_Morphology":
        "8.LLM_Scientific_Abstract_Morphology + LLM_Scientific_Conceptual_Morphology",
}


def multi_pattern_to_group9(row):
    """Return the Group9 label (category 7, 8 or 9) for one row.

    Parameters
    ----------
    row : mapping-like
        Indexable by every name in the module-level ``group_cols``; a column
        counts as active when its value equals 1.

    Returns
    -------
    str
        The category 7 or 8 label when the active columns are exactly one of
        the two known pairs, otherwise ``"9.Other_Multi"``.
    """
    active = sorted(name for name in group_cols if row[name] == 1)
    return _MULTI_PATTERN_LABELS.get(" + ".join(active), "9.Other_Multi")

# Assign 7/8/9 labels to each multi-label row
# axis=1 calls multi_pattern_to_group9 once per row of df_multi.
group9_multi = df_multi.apply(multi_pattern_to_group9, axis=1)

# Write back to the original df
# multi_mask is the mask df_multi was sliced with, so exactly those rows have
# their previous Group9 value replaced.
df.loc[multi_mask, "Group9"] = group9_multi

# =====================================================
# 3. Verify results
# =====================================================
# Row count per final category
print("\nGrouped into 9 categories (Group9):")
print(df["Group9"].value_counts())

## Visualization with PCA

In [None]:
import os
import math
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
from sklearn.decomposition import PCA
import pandas as pd

# ======================================
# 0. Parameters & output path
# ======================================
# Output directory for the PNG files written further down.
# NOTE(review): this rebinds `path`, which the data-loading cell uses for the
# input Parquet file; re-running that cell afterwards redirects the figure
# output unless this assignment is run again.
path = "/workspace/d9999993/生物学の研究/語源解析/結果"
# exist_ok=True: an already existing directory is not an error
os.makedirs(path, exist_ok=True)

# ======================================
# 1. Sampling & PCA (3D)
#    (Can be skipped when df_sample has already been restored from a
#     parquet file; load df_sample directly in that case.)
# ======================================

# Group9 labels as strings, and the distinct labels among them
labels_full = df["Group9"].astype(str).values
unique_groups = pd.Series(labels_full).unique()

print("Groups:", unique_groups)

# Keep at most 5000 points per Group9; smaller groups are kept whole
max_per_group = 5000
rng = np.random.default_rng(42)

# Row positions of each group, visited in unique_groups order so the seeded
# generator is always consumed in the same sequence.
positions_by_group = (np.where(labels_full == g)[0] for g in unique_groups)
sample_idx = np.concatenate([
    rng.choice(positions, max_per_group, replace=False)
    if len(positions) > max_per_group
    else positions
    for positions in positions_by_group
])
print("Total sampled points:", len(sample_idx))

# Sampled rows, selected by position and copied so df stays untouched
df_sample = df.iloc[sample_idx].copy()

# Stack the per-row embeddings into one 2-D array
X = np.vstack(df_sample["Species_emb"].values)
print("Embedding sample shape:", X.shape)

# Project onto the first three principal components
pca_3d = PCA(n_components=3, random_state=42)
X_pca3 = pca_3d.fit_transform(X)

# Store one component per column
for component_no, column in enumerate(("PCA3_x", "PCA3_y", "PCA3_z")):
    df_sample[column] = X_pca3[:, component_no]

# ======================================
# 2. Multi-view (static) 3D visualization settings
# ======================================

# Distinct Group9 labels present in the sample. A group's position in this
# array is the integer later passed to cmap(...) for both the scatter plots
# and the legend, so the two stay color-consistent.
groups = df_sample["Group9"].unique()
# "tab20" colormap requested with one entry per group
# NOTE(review): presumably enough distinct colors for the 9 categories —
# confirm if more groups are ever added.
cmap = plt.get_cmap("tab20", len(groups))

# Camera angle candidates:
#   elevation 0..90 in steps of 15, azimuth 0..315 in steps of 45
views_elev = list(range(0, 91, 15))
views_azim = list(range(0, 360, 45))

# Every (elevation, azimuth) pairing, elevation-major
views = [
    (elev_angle, azim_angle)
    for elev_angle in views_elev
    for azim_angle in views_azim
]

print("Number of views:", len(views))   # 7 elevations x 8 azimuths = 56
print("First 10 views:", views[:10])

# ======================================
# 3. Save one image per view
# ======================================
# The point cloud is the same for every view, so the figure, the 3D axes and
# the per-group scatter artists are created once. Inside the loop only the
# camera angle and the title change before each save. (Rebuilding the figure
# and re-filtering df_sample per group for every view repeated identical work.)
fig = plt.figure(figsize=(6, 6))
try:
    ax = fig.add_subplot(111, projection="3d")

    # One scatter call per group so that each group gets its own color cmap(j)
    for j, g in enumerate(groups):
        sub = df_sample[df_sample["Group9"] == g]
        ax.scatter(
            sub["PCA3_x"],
            sub["PCA3_y"],
            sub["PCA3_z"],
            s=3,
            alpha=0.6,
            color=cmap(j),
        )

    # Hide the tick marks on all three axes
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_zticks([])

    for i, (elev, azim) in enumerate(views, start=1):
        # Rotate the camera and relabel; the plotted data is left untouched
        ax.view_init(elev=elev, azim=azim)
        ax.set_title(f"PCA3D e={elev}, a={azim}", fontsize=10)

        filename = f"PCA3D_view_{i:02d}_e{elev}_a{azim}.png"
        save_path_view = os.path.join(path, filename)
        # Save through the figure object instead of pyplot's "current figure"
        fig.savefig(save_path_view, dpi=300, bbox_inches="tight")

        print(f"Saved view {i:02d}: {save_path_view}")
finally:
    # Release the figure even when a save fails part-way through
    plt.close(fig)

# ======================================
# 4. Save legend as a separate image
# ======================================

fig_legend = plt.figure(figsize=(10, 4))
legend_ax = fig_legend.add_subplot(111)

# One empty scatter per group: it draws nothing but yields a large marker
# handle in that group's color for the legend.
handles = [
    legend_ax.scatter([], [], color=cmap(j), s=200)
    for j in range(len(groups))
]
labels = list(groups)

legend_ax.legend(
    handles,
    labels,
    loc="center",
    fontsize=12,
    ncol=3,   # 3 columns (adjust as needed)
)
# Only the legend box should be visible
legend_ax.axis("off")

legend_path = os.path.join(path, "PCA3D_Group9_legend.png")
fig_legend.savefig(legend_path, dpi=300, bbox_inches="tight")
plt.close(fig_legend)

print(f"Saved legend figure to: {legend_path}")