In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

# Select the t-SNE backend: prefer cuML on the GPU, otherwise use openTSNE on the CPU.
# Only the imports sit inside `try`, so the success path lives in `else`.
try:
    import cupy as cp
    from cuml.manifold import TSNE as cuTSNE
except ImportError:
    # cupy or cuML could not be imported: bind the CPU implementation instead.
    from openTSNE import TSNE as cpuTSNE
    gpu_available = False
    print("⚠️ cuML not available, falling back to CPU openTSNE")
else:
    gpu_available = True
    print("✅ Using GPU cuML TSNE")

# --- Configuration ---
# Directory that holds the input .npz files; the plots are written next to them.
tsne_dir = '/c2/jinakim/Drug_Discovery_j/analysis/tsne_last/base_cX_mX_/'
num_samples_per_class = 512  # upper bound on the points drawn per label
seed = 42  # used for both the per-class subsampling and t-SNE

# List all .npz files (sorted, so the processing order is deterministic)
files = sorted(f for f in os.listdir(tsne_dir) if f.endswith('.npz'))

# Process each file: load -> subsample per label -> t-SNE -> scatter plot on disk
for f in files:
    file_path = os.path.join(tsne_dir, f)
    # splitext strips only the trailing extension (str.replace would strip every '.npz').
    stem = os.path.splitext(f)[0]
    save_name = f"{stem}_tsne_sampled{num_samples_per_class}.png"
    save_path = os.path.join(tsne_dir, save_name)

    # --- Skip if already exists ---
    if os.path.exists(save_path):
        print(f"⏩ {save_name} already exists. Skipping.")
        continue

    print(f"\n🚀 Processing {file_path}")

    # Load the .npz file; the context manager closes the underlying file handle,
    # which the bare np.load() call left open for every processed file.
    with np.load(file_path) as data:
        embeddings_all = data['embeddings']
        labels_all = data['labels']

    print(f"✅ Loaded embeddings: {embeddings_all.shape}, labels: {labels_all.shape}")

    # Row indices computed from the labels are used to index the embeddings,
    # so both arrays must describe the same number of samples.
    if len(embeddings_all) != len(labels_all):
        raise ValueError(
            f"{file_path}: {len(embeddings_all)} embeddings but {len(labels_all)} labels"
        )

    # --- Randomly sample up to `num_samples_per_class` points per label ---
    # A fresh, seeded generator per file makes the drawn subset reproducible and
    # independent of which files were skipped above.
    rng = np.random.default_rng(seed)
    sampled_embeddings = []
    sampled_labels = []

    for label_value in np.unique(labels_all):
        indices = np.where(labels_all == label_value)[0]
        if len(indices) < num_samples_per_class:
            print(f"⚠️ Warning: label {label_value} has only {len(indices)} samples, using all available.")
        n_take = min(len(indices), num_samples_per_class)
        chosen_indices = rng.choice(indices, size=n_take, replace=False)

        sampled_embeddings.append(embeddings_all[chosen_indices])
        sampled_labels.append(labels_all[chosen_indices])

    # np.concatenate raises on an empty list, so skip files without any label.
    if not sampled_embeddings:
        print(f"⚠️ Warning: {f} contains no samples. Skipping.")
        continue

    # Concatenate sampled data
    embeddings = np.concatenate(sampled_embeddings, axis=0)
    labels = np.concatenate(sampled_labels, axis=0)

    print(f"✅ Sampled embeddings: {embeddings.shape}, labels: {labels.shape}")

    # --- Run t-SNE ---
    if gpu_available:
        embeddings_gpu = cp.asarray(embeddings)
        tsne = cuTSNE(n_components=2, random_state=seed)
        embeddings_2d_gpu = tsne.fit_transform(embeddings_gpu)
        embeddings_2d = cp.asnumpy(embeddings_2d_gpu)
    else:
        tsne = cpuTSNE(n_components=2, n_jobs=8, random_state=seed)
        embeddings_2d = tsne.fit(embeddings)

    # --- Plot ---
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        scatter = ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=labels, cmap='tab10', s=5)
        ax.set_title(f"t-SNE of {stem} (Sampled {num_samples_per_class}/class)")
        fig.colorbar(scatter, ax=ax, ticks=np.unique(labels), label='Label')
        ax.grid(True)
        fig.tight_layout()

        # Save figure
        fig.savefig(save_path)
        print(f"✅ Saved plot to {save_path}")
    finally:
        # Always release the figure, even when plotting or saving fails,
        # so a long batch does not accumulate open figures.
        plt.close(fig)

print("\n🏁 All done!")



⚠️ cuML not available, falling back to CPU openTSNE

🚀 Processing /c2/jinakim/Drug_Discovery_j/analysis/tsne_last/base_cX_mX_/dsets_dpp4_count.npz
✅ Loaded embeddings: (19264, 64), labels: (19264,)
✅ Sampled embeddings: (1536, 64), labels: (1536,)
✅ Saved plot to /c2/jinakim/Drug_Discovery_j/analysis/tsne_last/base_cX_mX_/dsets_dpp4_count_tsne_sampled512.png

🚀 Processing /c2/jinakim/Drug_Discovery_j/analysis/tsne_last/base_cX_mX_/dsets_hivprot_bit.npz
✅ Loaded embeddings: (19264, 64), labels: (19264,)
✅ Sampled embeddings: (1536, 64), labels: (1536,)
✅ Saved plot to /c2/jinakim/Drug_Discovery_j/analysis/tsne_last/base_cX_mX_/dsets_hivprot_bit_tsne_sampled512.png

🚀 Processing /c2/jinakim/Drug_Discovery_j/analysis/tsne_last/base_cX_mX_/dsets_hivprot_count.npz
✅ Loaded embeddings: (19264, 64), labels: (19264,)
✅ Sampled embeddings: (1536, 64), labels: (1536,)
✅ Saved plot to /c2/jinakim/Drug_Discovery_j/analysis/tsne_last/base_cX_mX_/dsets_hivprot_count_tsne_sampled512.png

🚀 Processing

In [4]:
import os
import numpy as np
import matplotlib.pyplot as plt

# Select the t-SNE backend: prefer cuML on the GPU, otherwise use openTSNE on the CPU.
# Only the imports sit inside `try`, so the success path lives in `else`.
try:
    import cupy as cp
    from cuml.manifold import TSNE as cuTSNE
except ImportError:
    # cupy or cuML could not be imported: bind the CPU implementation instead.
    from openTSNE import TSNE as cpuTSNE
    gpu_available = False
    print("⚠️ cuML not available, falling back to CPU openTSNE")
else:
    gpu_available = True
    print("✅ Using GPU cuML TSNE")

# --- Configuration ---
# Directory that holds the input .npz files; the plots are written next to them.
tsne_dir = '/c2/jinakim/Drug_Discovery_j/analysis/tsne/base_cX_mO_/'
num_samples_per_class = 512  # upper bound on the points drawn per label
seed = 42  # used for both the per-class subsampling and t-SNE
# Labels left out of the plot; empty by default so every label is kept.
# Set to e.g. (0,) to drop label 0 (replaces the former commented-out check).
labels_to_skip = ()

# List all .npz files (sorted, so the processing order is deterministic)
files = sorted(f for f in os.listdir(tsne_dir) if f.endswith('.npz'))

# Process each file: load -> subsample per label -> t-SNE -> scatter plot on disk.
# Existing plots are overwritten on every run.
for f in files:
    file_path = os.path.join(tsne_dir, f)
    # splitext strips only the trailing extension (str.replace would strip every '.npz').
    stem = os.path.splitext(f)[0]
    print(f"\n🚀 Processing {file_path}")

    # Load the .npz file; the context manager closes the underlying file handle,
    # which the bare np.load() call left open for every processed file.
    with np.load(file_path) as data:
        embeddings_all = data['embeddings']
        labels_all = data['labels']

    print(f"✅ Loaded embeddings: {embeddings_all.shape}, labels: {labels_all.shape}")

    # Row indices computed from the labels are used to index the embeddings,
    # so both arrays must describe the same number of samples.
    if len(embeddings_all) != len(labels_all):
        raise ValueError(
            f"{file_path}: {len(embeddings_all)} embeddings but {len(labels_all)} labels"
        )

    # --- Randomly sample up to `num_samples_per_class` points per label ---
    # A fresh, seeded generator per file makes the drawn subset reproducible
    # regardless of how many files were processed before this one.
    rng = np.random.default_rng(seed)
    sampled_embeddings = []
    sampled_labels = []

    for label_value in np.unique(labels_all):
        if label_value in labels_to_skip:
            continue
        indices = np.where(labels_all == label_value)[0]
        if len(indices) < num_samples_per_class:
            print(f"⚠️ Warning: label {label_value} has only {len(indices)} samples, using all available.")
        n_take = min(len(indices), num_samples_per_class)
        chosen_indices = rng.choice(indices, size=n_take, replace=False)

        sampled_embeddings.append(embeddings_all[chosen_indices])
        sampled_labels.append(labels_all[chosen_indices])

    # np.concatenate raises on an empty list, so skip files without any usable label.
    if not sampled_embeddings:
        print(f"⚠️ Warning: {f} contains no samples. Skipping.")
        continue

    # Concatenate sampled data
    embeddings = np.concatenate(sampled_embeddings, axis=0)
    labels = np.concatenate(sampled_labels, axis=0)

    print(f"✅ Sampled embeddings: {embeddings.shape}, labels: {labels.shape}")

    # --- Run t-SNE ---
    if gpu_available:
        embeddings_gpu = cp.asarray(embeddings)
        tsne = cuTSNE(n_components=2, random_state=seed)
        embeddings_2d_gpu = tsne.fit_transform(embeddings_gpu)
        embeddings_2d = cp.asnumpy(embeddings_2d_gpu)
    else:
        tsne = cpuTSNE(n_components=2, n_jobs=8, random_state=seed)
        embeddings_2d = tsne.fit(embeddings)

    # --- Plot ---
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        scatter = ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=labels, cmap='tab10', s=5)
        ax.set_title(f"t-SNE of {stem} (Sampled {num_samples_per_class}/class)")
        fig.colorbar(scatter, ax=ax, ticks=np.unique(labels), label='Label')
        ax.grid(True)
        fig.tight_layout()

        # Save figure
        save_name = f"{stem}_tsne_sampled{num_samples_per_class}.png"
        save_path = os.path.join(tsne_dir, save_name)
        fig.savefig(save_path)
        print(f"✅ Saved plot to {save_path}")
    finally:
        # Always release the figure, even when plotting or saving fails,
        # so a long batch does not accumulate open figures.
        plt.close(fig)


⚠️ cuML not available, falling back to CPU openTSNE

🚀 Processing /c2/jinakim/Drug_Discovery_j/analysis/tsne/base_cX_mO_/dsets_dpp4_bit.npz
✅ Loaded embeddings: (25472, 512), labels: (25472,)
✅ Sampled embeddings: (1600, 512), labels: (1600,)
✅ Saved plot to /c2/jinakim/Drug_Discovery_j/analysis/tsne/base_cX_mO_/dsets_dpp4_bit_tsne_sampled512_2.png

🚀 Processing /c2/jinakim/Drug_Discovery_j/analysis/tsne/base_cX_mO_/dsets_dpp4_count.npz
✅ Loaded embeddings: (25472, 512), labels: (25472,)
✅ Sampled embeddings: (1600, 512), labels: (1600,)
✅ Saved plot to /c2/jinakim/Drug_Discovery_j/analysis/tsne/base_cX_mO_/dsets_dpp4_count_tsne_sampled512_2.png

🚀 Processing /c2/jinakim/Drug_Discovery_j/analysis/tsne/base_cX_mO_/dsets_hivprot_bit.npz
✅ Loaded embeddings: (22528, 512), labels: (22528,)
✅ Sampled embeddings: (1600, 512), labels: (1600,)
✅ Saved plot to /c2/jinakim/Drug_Discovery_j/analysis/tsne/base_cX_mO_/dsets_hivprot_bit_tsne_sampled512_2.png

🚀 Processing /c2/jinakim/Drug_Discovery_

KeyboardInterrupt: 