# Project data into 2D with t-SNE

In [5]:
# Install the plotting and t-SNE dependencies into the running kernel's environment.
# NOTE(review): later cells also import pandas and numpy, which are not listed
# here — confirm they are available (or pin and add them) for a fresh environment.
%pip install plotly scikit-learn

## Scatter Plot

In [6]:
from sklearn.manifold import TSNE
import plotly.express as px

# Load the iris sample dataset that ships with Plotly Express.
df = px.data.iris()

# Label-based slice: keeps every column up to and including 'petal_width'.
features = df.loc[:, :'petal_width']

# Project the selected features down to two dimensions; the seed is fixed so the
# layout is reproducible between runs.
tsne = TSNE(n_components=2, random_state=0)
projections = tsne.fit_transform(features)

# `projections` is a plain array, so its columns are addressed as 0 and 1.
# Without explicit labels the axes would be titled "0" and "1"; the `labels`
# mapping (keyed by the stringified column index) gives them readable names.
fig = px.scatter(
    projections, x=0, y=1,
    color=df.species,
    title='t-SNE projection of the iris dataset',
    labels={'color': 'species', '0': 't-SNE 1', '1': 't-SNE 2'},
)
fig.show()

## Load embeddings

In [36]:
import json
import pandas as pd

# Read the embedding records from disk. The encoding is given explicitly because
# the content contains non-ASCII characters (emoji) and the platform default
# encoding is not UTF-8 everywhere.
with open('embeddings.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Truncate file_id to its last two '/'-separated parts so labels stay short.
# The vectorized .str accessor propagates missing values instead of raising.
df['file_id'] = df['file_id'].str.split('/').str[-2:].str.join('/')
df

Unnamed: 0,id,file_id,content,start_pos,end_pos,embedding
0,awesome-llm-apps/README.md_0-521,awesome-llm-apps/README.md,"<p align=""center"">\n <a href=""http://www.theu...",0,521,"[0.003139447, -0.021057991, -0.043931056, -0.0..."
1,awesome-llm-apps/README.md_400-960,awesome-llm-apps/README.md,<hr/>\n\n# 🌟 Awesome LLM Apps\n\nA curated col...,400,960,"[0.0010421905, -0.045103997, -0.030151501, -0...."
2,awesome-llm-apps/README.md_800-1242,awesome-llm-apps/README.md,## 🤔 Why Awesome LLM Apps?\n\n- 💡 Discover pra...,800,1242,"[0.022417033, -0.018805446, -0.004326907, -0.0..."
3,awesome-llm-apps/README.md_1200-1684,awesome-llm-apps/README.md,## 🚨 Open Source AI Agent Hackathon\n\nWe're l...,1200,1684,"[0.034729738, -0.021361602, 0.0054266984, -0.0..."
4,awesome-llm-apps/README.md_1600-1887,awesome-llm-apps/README.md,### Participate Now: [Global AI Agent Hackatho...,1600,1887,"[0.016929517, 0.012580578, 0.025718478, -0.022..."
...,...,...,...,...,...,...
408,awesome-llm-apps/advanced_ai_agents/multi_agen...,ai_teaching_agent_team/README.md,#### 📚 Research Librarian Agent\n- Compiles re...,800,1244,"[-0.005510044, -0.0033797089, -0.036461193, 0...."
409,awesome-llm-apps/advanced_ai_agents/multi_agen...,ai_teaching_agent_team/README.md,## How to Run\n\n1. Clone the repository\n ``...,1200,1649,"[0.033764236, -0.017298423, -0.014950359, -0.0..."
410,awesome-llm-apps/advanced_ai_agents/multi_agen...,ai_teaching_agent_team/README.md,2. Get your Composio API Key\n- Create an acco...,1600,1969,"[-0.012881822, 0.019202609, -0.05184871, -0.03..."
411,awesome-llm-apps/advanced_ai_agents/multi_agen...,ai_teaching_agent_team/README.md,- composio add googledocs (IN THE TERMINAL)\n ...,2000,2358,"[0.002119459, 0.0063326885, -0.04101504, -0.02..."


## t-SNE projection of the embeddings

In [37]:
import numpy as np

# t-SNE hyperparameters used for the embedding projection.
random_state  = 42
learning_rate = 200
n_iter        = 300
perplexity    = 30

# Stack the per-row embedding lists into a single array.
embeddings_matrix = np.array(df['embedding'].to_list())

# t-SNE needs perplexity < n_samples, so clamp it when there are too few rows.
if perplexity >= embeddings_matrix.shape[0]:
    print(f"Perplexity ({perplexity}) is too high for the number of samples ({embeddings_matrix.shape[0]}). Setting perplexity to {max(1, embeddings_matrix.shape[0] - 1)}.")
    perplexity = max(1, embeddings_matrix.shape[0] - 1)

# Collect the estimator settings in one place before constructing it.
tsne_options = dict(
    n_components=2,
    perplexity=perplexity,
    max_iter=n_iter,
    learning_rate=learning_rate,
    random_state=random_state,
    init='pca',  # PCA initialization for stability
    n_jobs=-1,   # use all available CPU cores
)
tsne = TSNE(**tsne_options)

projections = tsne.fit_transform(embeddings_matrix)
projections[:5]

array([[-7.8425083, 14.924624 ],
       [-7.489703 , 14.251944 ],
       [-7.0891323, 13.717144 ],
       [-6.7778482, 13.311465 ],
       [-6.4205666, 12.866445 ]], dtype=float32)

In [38]:
# Scatter the two t-SNE coordinates, coloring each point by its file_id.
# `projections` is a plain array, so without explicit labels the axes would be
# titled "0" and "1"; the `labels` mapping (keyed by the stringified column
# index) gives them readable names, and the title lets the figure stand alone.
fig = px.scatter(
    projections, x=0, y=1,
    color=df.file_id,
    title='t-SNE projection of the embeddings',
    labels={'color': 'file_id', '0': 't-SNE 1', '1': 't-SNE 2'},
)
fig.show()