In [None]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a
# trusted source. The path is a hardcoded absolute location; consider moving it
# to a configurable constant.
df = pandas.read_pickle('/workplace')

# Category indicator columns: a sample belongs to a category when that column holds 1.
label_columns = ['Morphology', 'Ecology & Behavior', 'Geography', 'People', 'Modern & Past Culture', 'Other']
category_flags = df[label_columns]

# idxmax returns the FIRST column holding the row maximum, so a row whose
# indicators are all 0 would silently be labeled 'Morphology'. Keep the idxmax
# result only for rows that have at least one positive indicator; rows without
# any category get NaN instead of a misleading label.
# NOTE(review): assumes the indicator columns are numeric/boolean - confirm.
has_category = category_flags.gt(0).any(axis=1)
df['label'] = category_flags.idxmax(axis=1).where(has_category)

# Determine the appropriate number of clusters

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np

# Retrieve the embeddings (assumes df was created by the loading cell above)
embedding_array = df['embedding'].tolist()

# Convert the list to an array once. Passing the raw list would make every
# fit_predict / silhouette_score call below repeat the same list-to-array
# conversion on each loop iteration.
# NOTE(review): presumably every embedding is a vector of the same length - confirm.
embedding_matrix = np.asarray(embedding_array)

# Candidate cluster counts to evaluate (k = 6 .. 11)
cluster_range = range(6, 12)
silhouette_scores = []

# Fit K-Means for each candidate k and record its silhouette score
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(embedding_matrix)
    score = silhouette_score(embedding_matrix, labels)
    silhouette_scores.append(score)

# The best k is the candidate with the highest silhouette score
# (np.argmax returns the first position on ties, i.e. the smallest such k)
best_k = cluster_range[int(np.argmax(silhouette_scores))]
print(f"Optimal number of clusters (max Silhouette Score): {best_k}")

# Plot score against k. The figure/axes get cell-specific names so they do not
# shadow a `fig` variable used by other cells.
sil_fig, sil_ax = plt.subplots(figsize=(8, 5))
sil_ax.plot(cluster_range, silhouette_scores, marker='o')
sil_ax.set_xlabel("Number of Clusters (k)")
sil_ax.set_ylabel("Silhouette Score")
sil_ax.set_title("Optimal Number of Clusters by Silhouette Score")
sil_ax.grid(True)
sil_fig.tight_layout()
plt.show()

# Apply clustering (n=7)

In [None]:
from sklearn.cluster import KMeans

# Pull the embedding vectors out of the DataFrame as a plain list
# (scikit-learn converts it to an array internally)
embedding_array = df['embedding'].tolist()

# Number of clusters: fixed at 7 for now, can be changed
n_clusters = 7

# Fit K-Means with a fixed seed; the fitted model's labels_ attribute holds the
# cluster index assigned to each sample
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(embedding_array)
cluster_labels = kmeans.labels_

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances

# Retrieve cluster centroids from the fitted K-Means model
centroids = kmeans.cluster_centers_

# Add cluster labels to the DataFrame
df['cluster'] = cluster_labels
embedding_matrix = np.array(embedding_array)

# Number of most-central samples to keep per cluster (tune here instead of
# editing the loop body)
n_central_per_cluster = 50

# Positional row indices of the samples closest to each centroid
central_indices = []

for cluster_id in range(n_clusters):
    # Positions of the rows assigned to the current cluster
    indices_in_cluster = np.flatnonzero(cluster_labels == cluster_id)
    cluster_embeddings = embedding_matrix[indices_in_cluster]

    # Distance of every cluster member to its centroid (pairwise_distances uses
    # the Euclidean metric by default), then keep the closest members. Slicing
    # means a cluster smaller than the limit contributes all of its members.
    distances = pairwise_distances(cluster_embeddings, [centroids[cluster_id]])
    closest_first = np.argsort(distances.ravel())
    top_k = indices_in_cluster[closest_first[:n_central_per_cluster]]

    central_indices.extend(top_k)

# Build the DataFrame of central samples (positional lookup, index reset to 0..N-1).
# Note: nothing is written to disk here; the result only lives in df_central.
df_central = df.iloc[central_indices].reset_index(drop=True)

# Applying ICA (3D)

In [None]:
from sklearn.decomposition import FastICA
import pandas as pd
import plotly.express as px

# Reduce the embeddings to three independent components with FastICA (fixed seed)
ica = FastICA(n_components=3, random_state=42)
ica_result = ica.fit_transform(embedding_array)

# One row per sample: the three ICA coordinates plus the K-Means cluster id.
# The cluster id is stored as a string so Plotly colors it as a discrete
# category rather than on a continuous scale.
ica_df = pd.DataFrame(
    ica_result,
    columns=['ICA1', 'ICA2', 'ICA3'],
).assign(cluster=cluster_labels.astype(str))

# Interactive 3D scatter of the ICA components, one color per cluster
fig = px.scatter_3d(
    ica_df,
    x='ICA1',
    y='ICA2',
    z='ICA3',
    color='cluster',
    title='ICA 3D Projection of Embeddings (Colored by Cluster)',
    opacity=0.7,
)
# Shrink the markers so dense regions stay readable
fig.update_traces(marker={'size': 3})
fig.show()

# Applying t-SNE

In [None]:
from sklearn.manifold import TSNE
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Project the three ICA components down to two dimensions with t-SNE (fixed seed)
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
tsne_result = tsne.fit_transform(ica_df[['ICA1', 'ICA2', 'ICA3']])

# Pair the 2D coordinates with the (string) cluster label of each sample
tsne_df = pd.DataFrame(tsne_result, columns=['TSNE1', 'TSNE2']).assign(
    cluster=ica_df['cluster']
)

# Scatter plot colored by cluster. The figure/axes use cell-specific names so
# the Plotly `fig` from the ICA cell is not overwritten.
tsne_fig, tsne_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=tsne_df,
    x='TSNE1',
    y='TSNE2',
    hue='cluster',
    palette='Set2',
    s=30,
    alpha=0.8,
    ax=tsne_ax,
)
tsne_ax.set_title("t-SNE Projection after ICA (Colored by Cluster)")
# Anchor the legend outside the plotting area, to the right of the axes
tsne_ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
tsne_fig.tight_layout()
plt.show()