In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import umap
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_openai.embeddings import OpenAIEmbeddings
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

# Colour cycle installed on the default plotly template below.
# The trailing comments give the CSS colour name of each hex value.
COLORS = [
    "#2f4f4f",  # darkslategray
    "#a0522d",  # sienna
    "#006400",  # darkgreen
    "#000080",  # navy
    "#ff0000",  # red
    "#00ced1",  # darkturquoise
    "#ffa500",  # orange
    "#ffff00",  # yellow
    "#00ff00",  # lime
    "#00fa9a",  # mediumspringgreen
    "#0000ff",  # blue
    "#ff00ff",  # magenta
    "#1e90ff",  # dodgerblue
    "#eee8aa",  # palegoldenrod
    "#000000",  # black
]

# Overwrite the colorway of whichever template is currently the plotly default,
# so figures created afterwards cycle through COLORS.
active_template = pio.templates[pio.templates.default]
active_template.layout.colorway = COLORS

In [2]:
# Publication metadata; later cells read its "id", "title" and "abstract" columns.
df = pd.read_excel(io="../data/metadata.xlsx")

# Core-publication table; later cells read its "Core Publications", "Group"
# and "Survey" columns.
core_pubs = pd.read_excel(io="../data/core_publications.xlsx")

In [3]:
# Map every core publication id to its group once, instead of filtering
# core_pubs again for each metadata row. drop_duplicates keeps the first
# occurrence, which matches a "first matching row wins" lookup.
core_rows = core_pubs.dropna(subset=["Core Publications"]).drop_duplicates(
    subset="Core Publications"
)
group_by_core_id = dict(zip(core_rows["Core Publications"], core_rows["Group"]))

# One Document per publication: title and abstract form the embedded text,
# the publication id and its group are kept as metadata. Publications that
# are not listed as core publications get the group "Survey".
documents = [
    Document(
        page_content=f'Title: {row["title"]}\nAbstract: {row["abstract"]}',
        metadata={
            "id": row["id"],
            "group": group_by_core_id.get(row["id"], "Survey"),
        },
    )
    for _, row in df.iterrows()
]
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Folder the FAISS store is saved to and loaded from.
VECTOR_STORE_PATH = "../data/vector_store"


def create_embeddings():
    """Embed all `documents` and persist the resulting FAISS store.

    Builds a FAISS vector store from the module-level `documents` using
    `embedding_model` and saves it under VECTOR_STORE_PATH. Returns None;
    the store is read back with `FAISS.load_local`.
    """
    vector_store = FAISS.from_documents(documents, embedding_model)
    vector_store.save_local(VECTOR_STORE_PATH)


# Build the store on the first run so the notebook survives Restart & Run All
# on a checkout without a saved store. An existing store is reused as-is:
# delete the folder to force re-embedding after the metadata changes.
if not Path(VECTOR_STORE_PATH).is_dir():
    create_embeddings()

# NOTE: allow_dangerous_deserialization=True means the saved store is
# unpickled on load. Only load a store produced by this notebook or obtained
# from a trusted source.
vector_store = FAISS.load_local(
    VECTOR_STORE_PATH, embedding_model, allow_dangerous_deserialization=True
)

In [4]:
# Labels and ids are paired with the stored vectors purely by position, so the
# store must hold exactly one vector per document. Checking here fails fast,
# before the expensive KMeans / t-SNE fits, if the saved store is stale.
n_vectors = vector_store.index.ntotal
assert len(documents) == n_vectors, (
    f"vector store holds {n_vectors} vectors but {len(documents)} documents "
    "were built; re-create the vector store"
)

# Read every stored vector back out of the FAISS index.
all_embeddings = np.array(
    [vector_store.index.reconstruct(i) for i in range(n_vectors)]
)
labels = [doc.metadata["group"] for doc in documents]
pub_ids = [doc.metadata["id"] for doc in documents]

n_clusters = 15  # 15 topics
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(all_embeddings)
cluster_labels = kmeans.labels_

# 2-D t-SNE projection of the embeddings, seeded for reproducibility.
tsne = TSNE(n_components=2, random_state=0)
tsne_embeddings = tsne.fit_transform(all_embeddings)

df = pd.DataFrame(tsne_embeddings, columns=["x", "y"])
df["label"] = labels
df["cluster"] = cluster_labels
df["pub_id"] = pub_ids

# Add true label to survey.
# The survey-id -> group mapping is built once (first matching row wins)
# rather than filtering core_pubs for every row. A "Survey" publication that
# has no row in core_pubs["Survey"] gets "unknown" instead of raising
# IndexError on an empty match.
survey_rows = core_pubs.dropna(subset=["Survey"]).drop_duplicates(subset="Survey")
group_by_survey_id = dict(zip(survey_rows["Survey"], survey_rows["Group"]))
df["survey_group"] = [
    f"{label} - {group_by_survey_id.get(pub_id, 'unknown')}"
    if label == "Survey"
    else ""
    for label, pub_id in zip(labels, pub_ids)
]

In [5]:
# Interactive scatter of the 2-D projection currently held in `df`:
# one marker per row, coloured by its "label" column.
hover_columns = ["label", "cluster", "pub_id", "survey_group"]
fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="label",
    hover_data=hover_columns,
)
# Slightly transparent markers so overlapping points stay distinguishable.
fig.update_traces(marker_opacity=0.9)
fig.show()

In [27]:
# 2-D UMAP projection of the same embeddings, using cosine distance.
# random_state is fixed (as for KMeans and t-SNE in this notebook) so the
# layout is reproducible across runs; umap-learn runs single-threaded when a
# seed is given.
umap_embeddings = umap.UMAP(
    n_neighbors=20,
    min_dist=0.2,
    metric="cosine",
    random_state=0,
).fit_transform(all_embeddings)

In [28]:

# Frame for the UMAP projection: coordinates plus group label, KMeans cluster
# and publication id for each point.
df = pd.DataFrame(umap_embeddings, columns=["x", "y"])
df["label"] = labels
df["cluster"] = cluster_labels
df["pub_id"] = pub_ids

# Add true label to survey.
# The survey-id -> group mapping is built once (first matching row wins)
# rather than filtering core_pubs for every row. A "Survey" publication that
# has no row in core_pubs["Survey"] gets "unknown" instead of raising
# IndexError on an empty match.
survey_rows = core_pubs.dropna(subset=["Survey"]).drop_duplicates(subset="Survey")
group_by_survey_id = dict(zip(survey_rows["Survey"], survey_rows["Group"]))
df["survey_group"] = [
    f"{label} - {group_by_survey_id.get(pub_id, 'unknown')}"
    if label == "Survey"
    else ""
    for label, pub_id in zip(labels, pub_ids)
]

# Interactive scatter of the UMAP projection, coloured by group label.
fig = px.scatter(
    df,
    x="x",
    y="y",
    color="label",
    hover_data=["label", "cluster", "pub_id", "survey_group"],
)
# Slightly transparent markers so overlapping points stay distinguishable.
fig.update_traces(marker_opacity=0.9)
fig.show()