In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import umap
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai.embeddings import OpenAIEmbeddings
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

# Palette of 15 distinct colours, installed below as the default trace
# colour cycle for every plotly figure created in this notebook.
COLORS = [
    "#2f4f4f", "#a0522d", "#006400", "#000080", "#ff0000",
    "#00ced1", "#ffa500", "#ffff00", "#00ff00", "#00fa9a",
    "#0000ff", "#ff00ff", "#1e90ff", "#eee8aa", "#000000",
]

# Patch the template plotly currently treats as its default, so figures pick
# up the palette without passing a colour sequence to each px call.
_default_template = pio.templates[pio.templates.default]
_default_template.layout.colorway = COLORS

In [2]:
# Publication metadata; the next cell reads its "id", "title" and "abstract" columns.
# NOTE: the name `df` is rebound to plotting frames further down the notebook,
# so re-run this cell before re-running the document-building cell.
df = pd.read_excel("./data/metadata.xlsx")
# Core-publication table; later cells read its "Pub_id", "Topic" and "Survey" columns.
core_pubs = pd.read_excel("./data/core_publications.xlsx")

In [3]:
# Turn every metadata row into a Document whose text is "title + abstract" and
# whose metadata carries the publication id and its topic. A publication that
# is not listed in core_pubs["Pub_id"] is tagged with the topic "Survey".
documents, pub_ids = [], []
for _, record in df.iterrows():
    topic_matches = core_pubs.loc[core_pubs["Pub_id"] == record["id"], "Topic"]
    # First matching topic wins when a publication is listed more than once.
    topic = "Survey" if topic_matches.empty else topic_matches.values[0]
    documents.append(
        Document(
            page_content=f'Title: {record["title"]}\nAbstract: {record["abstract"]}',
            metadata={"id": record["id"], "topic": topic},
        )
    )
    pub_ids.append(record["id"])

def create_embeddings(documents, ids):
    """Embed ``documents`` and store them in the persisted Chroma collection.

    Args:
        documents: Documents to embed and add to the collection.
        ids: Ids passed to the vector store for the documents, in the same order.

    Returns:
        The Chroma vector store the documents were added to.
    """
    store = Chroma(
        collection_name="core_publications",
        embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
        persist_directory="./data/vs/core_publications",
    )
    store.add_documents(documents, ids=ids)
    return store


# Indexing step, left disabled; uncomment to (re)build the persisted store:
# create_embeddings(documents, pub_ids)

In [4]:
# Re-open the persisted collection. It is opened without an embedding function;
# this cell only reads back what is already stored.
collection = Chroma("core_publications", persist_directory="./data/vs/core_publications")

# Fetch ids, vectors and metadata in a single query. The previous version
# issued two extra `get` calls per publication (2N + 1 round trips in total).
# The lists returned by one `get` are aligned by position.
records = collection.get(include=["embeddings", "metadatas"])
pub_ids = records["ids"]
labels = [metadata["topic"] for metadata in records["metadatas"]]
# One row per publication; stays 2-D even if the collection holds one document.
all_embeddings = np.asarray(records["embeddings"])

# Unsupervised clustering of the embedding vectors, shown in hover data only.
n_clusters = 15  # 15 topics
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(all_embeddings)
cluster_labels = kmeans.labels_

# 2-D t-SNE projection of the same vectors for plotting (seeded for reproducibility).
tsne = TSNE(n_components=2, random_state=0)
tsne_embeddings = tsne.fit_transform(all_embeddings)

# NOTE: this rebinds `df`, which previously held the publication metadata.
df = pd.DataFrame(
    {
        "x": tsne_embeddings[:, 0],
        "y": tsne_embeddings[:, 1],
        "label": labels,
        "cluster": cluster_labels,
        "pub_id": pub_ids,
    }
)

# Add true label to survey: for rows labelled "Survey", look the publication up
# in core_pubs["Survey"] and append that row's topic; other rows get "".
# Raises IndexError if a survey id has no match in core_pubs["Survey"].
df["survey_group"] = df.apply(
    lambda x: f"{x['label']} - {core_pubs[core_pubs['Survey'] == x['pub_id']]['Topic'].values[0]}"
    if x["label"] == "Survey"
    else "",
    axis=1,
)

In [9]:
# t-SNE scatter plot: one trace per topic, plus a separate trace for surveys.
is_survey = df["label"] == "Survey"

fig = px.scatter(
    df[~is_survey],
    x="x",
    y="y",
    color="label",
    hover_data=["label", "cluster", "pub_id", "survey_group"],
    labels={"label": "Topic"},
)
# Applied before the survey trace is added, so only topic traces are affected.
fig.update_traces(marker_opacity=0.9)

# Title, axis labels and font sizes.
# NOTE(review): "Modern Computer" may have been meant as "Computer Modern";
# confirm which font is installed, since an unknown family falls back silently.
fig.update_layout(
    title="Clustering of Core Publications",
    font_family="Modern Computer",
    title_x=0.5,
    xaxis_title="TSNE 1",
    yaxis_title="TSNE 2",
    title_font_size=20,
    xaxis_title_font_size=15,
    yaxis_title_font_size=15,
    legend_title="",  # remove legend title
    legend_font_size=15,
)

# Survey publications get their own dark-red trace. It is named and given a
# legend entry explicitly: px leaves an uncoloured trace unnamed and hidden
# from the legend. It is appended AFTER the topic traces, so addressing it as
# `fig.data[1]` would rename the second topic instead of the survey trace.
survey_trace = px.scatter(
    df[is_survey],
    x="x",
    y="y",
    color_discrete_sequence=["#8B0000"],
    hover_data=["pub_id", "survey_group"],
).data[0]
survey_trace.name = "Survey"
survey_trace.showlegend = True
fig.add_trace(survey_trace)

fig.update_layout(legend=dict(orientation="h", y=-0.2))
fig.show()
# Also export the figure as a PDF file.
pio.write_image(fig, "core_publications_tsne.pdf")

In [10]:
# 2-D UMAP projection of the embedding vectors using cosine distance.
# `random_state` is fixed to match the seeded KMeans and t-SNE steps, so the
# layout is identical across re-runs (UMAP runs single-threaded when seeded).
# `umap` is imported with the other dependencies in the first cell.
umap_embeddings = umap.UMAP(
    n_neighbors=20,
    min_dist=0.2,
    metric="cosine",
    random_state=0,
).fit_transform(all_embeddings)

In [11]:

# UMAP coordinates joined with topic label, KMeans cluster and publication id.
# NOTE: this rebinds `df` again (it held the t-SNE plotting frame before).
df = pd.DataFrame(umap_embeddings, columns=["x", "y"])
df["label"] = labels
df["cluster"] = cluster_labels
df["pub_id"] = pub_ids
# Add true label to survey: for rows labelled "Survey", look the publication up
# in core_pubs["Survey"] and append that row's topic; other rows get "".
df["survey_group"] = df.apply(
    lambda x: f"{x['label']} - {core_pubs[core_pubs['Survey'] == x['pub_id']]['Topic'].values[0]}"
    if x["label"] == "Survey"
    else "",
    axis=1,
)

fig = px.scatter(
    df,
    x="x",
    y="y",
    color="label",
    # The default colorway has 15 entries; with 15 topics plus "Survey" the
    # cycle would wrap and two categories would share a colour. Pin "Survey"
    # to the dark red used for it in the t-SNE figure.
    color_discrete_map={"Survey": "#8B0000"},
    hover_data=["label", "cluster", "pub_id", "survey_group"],
    labels={"label": "Topic"},
)
fig.update_traces(marker_opacity=0.9)
# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title="UMAP Projection of Core Publications",
    title_x=0.5,
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2",
)
fig.show()