## 1. Creating Embeddings

In [None]:
!pip install -U sentence-transformers

In [None]:
import csv
import seaborn as sns
from scipy import spatial
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer

# Read attendees and their responses from a CSV file; replace attendees.csv
# with your own link or file name. Every data row must hold exactly two
# columns: the attendee's name followed by their free-text response.
attendees_map = {}   # response paragraph -> attendee name (last one wins on duplicates)
attendee_rows = []   # every (name, paragraph) pair, in file order
# An explicit encoding keeps non-ASCII text from depending on the platform default.
with open('attendees.csv', newline='', encoding='utf-8') as csvfile:
    attendees = csv.reader(csvfile, delimiter=',', quotechar='"')
    next(attendees)  # Skip the header row
    for row in attendees:
        if not row:
            continue  # csv.reader yields an empty list for blank lines
        name, paragraph = row  # still raises ValueError on malformed rows
        attendees_map[paragraph] = name
        attendee_rows.append((name, paragraph))

# Generate sentence embeddings, one per attendee row. Encoding the rows rather
# than the de-duplicated dictionary keys ensures that attendees who gave
# identical responses each keep an embedding.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
paragraphs = [paragraph for _, paragraph in attendee_rows]
embeddings = model.encode(paragraphs)

# Create a dictionary to store embeddings for each person.
# NOTE: keyed by name, so attendees sharing a name still overwrite each other.
person_embeddings = {
    name: embedding
    for (name, _), embedding in zip(attendee_rows, embeddings)
}

## 2. Creating Visualization

In [None]:
# `umap` is not imported by any other visible cell; it is provided by the
# umap-learn package.
import umap

# Reducing dimensionality of embedding data: standardise the embeddings with
# StandardScaler first, then project them with UMAP (default settings). The
# plotting cell reads the first two columns of the result as x/y coordinates.
reducer = umap.UMAP()
scaler = StandardScaler()
scaled_data = scaler.fit_transform(list(person_embeddings.values()))
reduced_data = reducer.fit_transform(scaled_data)

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans

# Step 1: choose how many clusters to form.
# A heuristic such as the elbow method can help pick a suitable k.
k = 5  # example value, adjust based on your data

# Step 2: run k-means clustering on the reduced coordinates.
# random_state is fixed to 0; labels_ holds one cluster index per input row.
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(reduced_data)
clusters = kmeans.labels_

In [None]:
# Creating lists of coordinates with accompanying labels
x = [row[0] for row in reduced_data]
y = [row[1] for row in reduced_data]
label = list(person_embeddings.keys())

# Plotting and annotating data points
custom_colors = ["#ff6d24", "#649aea", "#edb015", "#88dcbe", "#ff1457"]
# Map the cluster labels to the colors. The index wraps around the palette so
# that choosing more clusters than colors reuses colors instead of raising
# IndexError.
cluster_colors = [
    custom_colors[cluster % len(custom_colors)] for cluster in clusters
]

plt.scatter(
    x,
    y,
    s=20,
    c=cluster_colors,
)
# Write each attendee's name next to their point.
for name, x_coord, y_coord in zip(label, x, y):
    plt.annotate(name, (x_coord, y_coord), fontsize=2)

# Clean-up and Export
plt.axis("off")
plt.savefig("visualization.png", dpi=800)

## 3. [BONUS] Providing top matches

In [None]:
from collections import defaultdict

In [None]:
# For every attendee, collect [cosine_distance, other_name] pairs against all
# other attendees, then rank them so the closest match comes first.
top_matches = {}
all_personal_pairs = defaultdict(list)
for person, person_embedding in person_embeddings.items():
    for person1, other_embedding in person_embeddings.items():
        if person1 == person:
            continue  # a person is not a match for themselves
        all_personal_pairs[person].append(
            [spatial.distance.cosine(other_embedding, person_embedding), person1]
        )

for person in person_embeddings:
    # Sort on the distance (index 0), not the name (index 1): a smaller cosine
    # distance means a more similar response.
    top_matches[person] = sorted(all_personal_pairs[person], key=lambda pair: pair[0])

print(top_matches)
