In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# === Load data ===
# Cluster labels per subject (columns used here: subj_num, kproto_cluster).
df_clusters = pd.read_csv("kproto_assignments.csv")
# Per-subject sharing measures (columns used here: subj_num, mean_share_*_<block>).
df_sharing = pd.read_csv("preprocessed_data.csv")

# === Merge Block 0 sharing with cluster labels ===
block0_features = ["mean_share_friend_0", "mean_share_stranger_0"]
# Inner join on subj_num, then drop subjects with a missing Block 0 feature or
# label. Blocks 1-3 are filtered the same way before prediction, and the
# classifier cannot be fit on missing values.
df_train = df_clusters.merge(
    df_sharing[["subj_num"] + block0_features],
    on="subj_num",
).dropna(subset=block0_features + ["kproto_cluster"])

# === Training data ===
# X_train stays a DataFrame because the plotting cell indexes it by column name.
X_train = df_train[block0_features]
y_train = df_train["kproto_cluster"]

# === Scale features ===
# Fit on a plain array rather than the DataFrame: the later blocks use
# differently named columns and are transformed as arrays, so fitting without
# feature names keeps fit/transform consistent and avoids scikit-learn's
# "X does not have valid feature names" warning.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.to_numpy())

# === Train KNN model ===
# Each new point is assigned the majority label of its 3 nearest Block 0
# subjects in the standardized feature space.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)

# === Predict cluster for Blocks 1–3 ===
# X_blocks maps block number -> DataFrame with subj_num and the predicted
# cluster for that block (column pred_cluster_block<i>).
X_blocks = {}
for i in [1, 2, 3]:
    pred_name = f"pred_cluster_block{i}"
    feature_cols = [f"mean_share_friend_{i}", f"mean_share_stranger_{i}"]

    # Keep only the rows that have no missing value in subj_num or in either
    # sharing column of this block.
    df_block = df_sharing[["subj_num", *feature_cols]].dropna()

    # Apply the scaler fitted on Block 0, then classify with the trained KNN.
    block_scaled = scaler.transform(df_block[feature_cols].values)
    block_pred = knn.predict(block_scaled)

    # Attach the predictions next to subj_num; the filtered row index is kept.
    X_blocks[i] = df_block[["subj_num"]].assign(**{pred_name: block_pred})


# === Merge predictions into one DataFrame ===
# Start from the cluster table and left-join each block's predictions on
# subj_num, so every subject in df_clusters is kept; subjects without a
# prediction for a block get a missing value in that block's column.
df_combined = (
    df_clusters.copy()
    .merge(X_blocks[1], on="subj_num", how="left")
    .merge(X_blocks[2], on="subj_num", how="left")
    .merge(X_blocks[3], on="subj_num", how="left")
)

# === Save results ===
# Write without the row index so the CSV holds only the data columns.
df_combined.to_csv("knn_cluster_tracking.csv", index=False)
print("✅ Saved as 'knn_cluster_tracking.csv'")


In [None]:
import matplotlib.pyplot as plt

def plot_cluster_scatter(data, x_col, y_col, labels, title, legend_title,
                         vmin=None, vmax=None):
    """Draw one friend-vs-stranger scatter plot coloured by cluster label.

    Parameters
    ----------
    data : DataFrame holding the two columns to plot.
    x_col, y_col : names of the columns used for the x and y axes; they are
        also used as the axis labels.
    labels : numeric cluster label per row of ``data`` (used for colouring).
    title : figure title.
    legend_title : title of the colour legend.
    vmin, vmax : optional limits of the colour scale. Passing the same limits
        to every call gives each cluster id the same colour in every figure.

    Returns
    -------
    The PathCollection returned by ``Axes.scatter``.
    """
    fig, ax = plt.subplots(figsize=(7, 5))
    points = ax.scatter(
        data[x_col],
        data[y_col],
        c=labels,
        cmap="tab10",
        vmin=vmin,
        vmax=vmax,
        edgecolor="k",
        alpha=0.8,
    )
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_title(title)
    ax.legend(*points.legend_elements(), title=legend_title)
    fig.tight_layout()
    plt.show()
    return points


# Shared colour limits taken from the training labels. KNN can only predict
# labels it was trained on, so this range covers every block. Without it each
# figure normalises colours to its own min/max, and a cluster id can change
# colour between figures when a block is missing one of the clusters.
cluster_vmin, cluster_vmax = y_train.min(), y_train.max()

# Visualize cluster assignments in Block 0 (training data)
scatter = plot_cluster_scatter(
    X_train,
    "mean_share_friend_0",
    "mean_share_stranger_0",
    y_train,
    title="Cluster assignments (Block 0)",
    legend_title="Cluster",
    vmin=cluster_vmin,
    vmax=cluster_vmax,
)

# Visualize predictions for Blocks 1-3
block_frames = {}
for i in [1, 2, 3]:
    x_col = f"mean_share_friend_{i}"
    y_col = f"mean_share_stranger_{i}"
    # Join the predictions back to df_sharing to get the plotting columns.
    block_frames[i] = X_blocks[i].merge(
        df_sharing[["subj_num", x_col, y_col]],
        on="subj_num",
    )
    scatter = plot_cluster_scatter(
        block_frames[i],
        x_col,
        y_col,
        block_frames[i][f"pred_cluster_block{i}"],
        title=f"Predicted clusters (Block {i})",
        legend_title="Predicted Cluster",
        vmin=cluster_vmin,
        vmax=cluster_vmax,
    )

# Keep the per-block frame names available in case later cells use them.
df_block1, df_block2, df_block3 = block_frames[1], block_frames[2], block_frames[3]


In [None]:
import seaborn as sns

# Heatmaps: showing transitions (e.g., from Cluster 0 → 1)
# Rows are the Block 0 cluster, columns the KNN-predicted cluster in block i.
# Use the full set of cluster labels for both axes of every heatmap:
# pd.crosstab only emits labels that actually occur, so a cluster that is
# never predicted in a block would otherwise drop out and the matrices would
# differ in shape from block to block.
all_clusters = sorted(df_clusters["kproto_cluster"].dropna().unique())

for i in [1, 2, 3]:
    true_col = "kproto_cluster"
    pred_col = f"pred_cluster_block{i}"
    # Inner join: only subjects that have both a Block 0 label and a
    # prediction for this block.
    merged = df_clusters.merge(X_blocks[i], on="subj_num", how="inner")
    confusion = pd.crosstab(merged[true_col], merged[pred_col]).reindex(
        index=all_clusters, columns=all_clusters, fill_value=0
    )
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(confusion, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_xlabel(f"Predicted Cluster (Block {i})")
    ax.set_ylabel("True Cluster (Block 0)")
    ax.set_title(f"Transition Heatmap: Block 0 → Block {i}")
    fig.tight_layout()
    plt.show()