In [8]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document
from langchain_chroma import Chroma
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import plotly.express as px
import plotly.io as pio
import pandas as pd
import numpy as np

# Categorical palette (15 hex colours) applied to the Plotly figures below.
COLORS = [
    "#2f4f4f", "#a0522d", "#006400", "#000080", "#ff0000",
    "#00ced1", "#ffa500", "#ffff00", "#00ff00", "#00fa9a",
    "#0000ff", "#ff00ff", "#1e90ff", "#eee8aa", "#000000",
]

# Install the palette on the currently active default template so figures
# created afterwards use it without any per-figure configuration.
default_template = pio.templates[pio.templates.default]
default_template.layout.colorway = COLORS

In [9]:
# Publication metadata; later cells read its "id", "title" and "abstract" columns.
df = pd.read_excel("./data/metadata.xlsx")
# Core-publication table; later cells read (among others) its "Pub_id",
# "Topic" and "Survey" columns.
core_pubs = pd.read_excel("./data/core_publications.xlsx")

In [14]:
# Build the publication-id -> topic lookup once instead of filtering
# `core_pubs` for every metadata row. Keeping the first occurrence of a
# duplicated id mirrors taking the first matching row.
topic_by_pub_id = (
    core_pubs.dropna(subset=["Pub_id"])
    .drop_duplicates(subset="Pub_id", keep="first")
    .set_index("Pub_id")["Topic"]
    .to_dict()
)

documents = []
pub_ids = []
# Title and abstract form the text that gets embedded; the publication id and
# its topic are stored as metadata.
for _, row in df.iterrows():
    pub_id = row["id"]
    # Publications that are not listed as core publications are labelled "Survey".
    group = topic_by_pub_id.get(pub_id, "Survey")
    content = f'Title: {row["title"]}\nAbstract: {row["abstract"]}'
    doc = Document(page_content=content, metadata={"id": pub_id, "topic": group})
    documents.append(doc)
    pub_ids.append(pub_id)

def create_embeddings(documents, ids):
    """Embed ``documents`` and add them to the persisted Chroma collection.

    Args:
        documents: Documents to embed and store.
        ids: Identifiers handed to ``add_documents`` alongside the documents.

    Returns:
        The ``Chroma`` vector store the documents were added to.
    """
    store = Chroma(
        collection_name="core_publications",
        embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
        persist_directory="./data/vs/core_publications",
    )
    store.add_documents(documents, ids=ids)
    return store


# Uncomment to (re)build the persisted vector store:
# create_embeddings(documents, pub_ids)

<langchain_chroma.vectorstores.Chroma at 0x2b4255d7490>

In [18]:
# Re-open the persisted collection; this cell only reads what is already stored.
collection = Chroma("core_publications", persist_directory="./data/vs/core_publications")

# Fetch ids, embeddings and metadata in ONE query. The previous version issued
# two `get` calls per publication, i.e. 2N round trips to the store.
stored = collection.get(include=["embeddings", "metadatas"])
pub_ids = stored["ids"]
labels = [metadata["topic"] for metadata in stored["metadatas"]]
# One row per stored id. `np.asarray` keeps the array 2-D even for a single
# row, which `np.squeeze` would have collapsed.
all_embeddings = np.asarray(stored["embeddings"])

n_clusters = 15  # 15 topics
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(all_embeddings)
cluster_labels = kmeans.labels_

# 2-D projection used for plotting only; clustering runs on the full embeddings.
tsne = TSNE(n_components=2, random_state=0)
tsne_embeddings = tsne.fit_transform(all_embeddings)

# NOTE: this rebinds `df`, which previously held the metadata table.
df = pd.DataFrame(
    {
        "x": tsne_embeddings[:, 0],
        "y": tsne_embeddings[:, 1],
        "label": labels,
        "cluster": cluster_labels,
        "pub_id": pub_ids,
    }
)

# Add true label to survey: look up the topic of each survey publication.
# The lookup is built once (first match wins, as before) instead of filtering
# `core_pubs` for every row.
survey_topic = (
    core_pubs.dropna(subset=["Survey"])
    .drop_duplicates(subset="Survey", keep="first")
    .set_index("Survey")["Topic"]
    .to_dict()
)
# A survey id with no match in `core_pubs` gets "unknown" instead of raising
# an IndexError.
df["survey_group"] = [
    f"{label} - {survey_topic.get(pub_id, 'unknown')}" if label == "Survey" else ""
    for label, pub_id in zip(df["label"], df["pub_id"])
]

In [19]:
# Scatter plot of the t-SNE projection, coloured by topic label. The title and
# axis labels let the figure stand on its own and distinguish it from the
# UMAP figure further down.
fig = px.scatter(
    df,
    x="x",
    y="y",
    color="label",
    hover_data=["label", "cluster", "pub_id", "survey_group"],
    title="t-SNE projection of publication embeddings",
    labels={"x": "t-SNE 1", "y": "t-SNE 2"},
)
fig.update_traces(marker_opacity=0.9)
fig.show()

In [None]:
import umap

# Project the embeddings to 2-D with UMAP using cosine distance.
# `random_state` is fixed, as for KMeans and TSNE above, so the layout is
# reproducible across runs (umap-learn runs single-threaded when it is set).
umap_embeddings = umap.UMAP(
    n_neighbors=20,
    min_dist=0.2,
    metric="cosine",
    random_state=0,
).fit_transform(all_embeddings)

In [None]:

# Same frame layout as the t-SNE cell, but with the UMAP coordinates.
df = pd.DataFrame(umap_embeddings, columns=["x", "y"])
df["label"] = labels
df["cluster"] = cluster_labels
df["pub_id"] = pub_ids

# Add true label to survey. This now reads the "Topic" column, the same column
# the labels and the t-SNE cell use; the previous "Group" lookup was
# inconsistent with them (NOTE(review): confirm "Group" was not intended).
# The lookup is built once (first match wins) instead of filtering
# `core_pubs` for every row.
survey_topic = (
    core_pubs.dropna(subset=["Survey"])
    .drop_duplicates(subset="Survey", keep="first")
    .set_index("Survey")["Topic"]
    .to_dict()
)
# A survey id with no match in `core_pubs` gets "unknown" instead of raising
# an IndexError.
df["survey_group"] = [
    f"{label} - {survey_topic.get(pub_id, 'unknown')}" if label == "Survey" else ""
    for label, pub_id in zip(df["label"], df["pub_id"])
]

# Title and axis labels distinguish this figure from the t-SNE one.
fig = px.scatter(
    df,
    x="x",
    y="y",
    color="label",
    hover_data=["label", "cluster", "pub_id", "survey_group"],
    title="UMAP projection of publication embeddings",
    labels={"x": "UMAP 1", "y": "UMAP 2"},
)
fig.update_traces(marker_opacity=0.9)
fig.show()