In [160]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document
from langchain_chroma import Chroma
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import pandas as pd
import numpy as np

# Every Plotly figure created after this point uses the "seaborn" template.
pio.templates.default = "seaborn"

# Layout keyword arguments shared by the figures; unpacked into
# `fig.update_layout(**PLOT_CONFIGS)`.
PLOT_CONFIGS = {
    "title_x": 0.5,
    "title_font_size": 35,
    "title_font_family": "Modern Computer",
    "font_family": "Modern Computer",
    "xaxis_title": "",
    "yaxis_title": "",
    "showlegend": True,
    "legend_title": "",
    "xaxis_tickfont_size": 15,
    "yaxis_tickfont_size": 15,
    "legend_font_size": 15,
    "xaxis_title_font_size": 20,
    "yaxis_title_font_size": 20,
}

# Topic labels tagged as "Systematic Literature Review"; their traces are
# drawn with an "x" marker in the scatter plot.
SLR_TOPICS = [
    "Software Process Line",
    "Data Stream Processing Latency",
    "Business Process Meta Models",
    "Multicore Performance Prediction",
    "Cloud Migration",
    "Software Fault Prediction Metrics",
    "Software Defect Prediction",
]


In [161]:
# Topic assignments: the notebook reads the "Pub_id", "Survey" and "Topic"
# columns from this sheet.
core_pubs = pd.read_excel("./data/core_publications.xlsx")

# Publication metadata: the notebook reads the "id", "title" and "abstract"
# columns from this sheet.
df = pd.read_excel("./data/metadata.xlsx")

In [162]:
# Map each core publication id to its topic once, keeping the first topic
# listed for an id, so the loop below does a dict lookup instead of
# re-filtering `core_pubs` for every row of `df`.
topic_by_pub_id = (
    core_pubs.dropna(subset=["Pub_id"])
    .drop_duplicates(subset="Pub_id", keep="first")
    .set_index("Pub_id")["Topic"]
    .to_dict()
)

documents = []
pub_ids = []
# Title and abstract form the page content; id and topic go into the metadata.
for _, row in df.iterrows():
    pub_id = row["id"]
    # Publications that are not listed under "Pub_id" get the label "Survey".
    group = topic_by_pub_id.get(pub_id, "Survey")
    content = f'Title: {row["title"]}\nAbstract: {row["abstract"]}'
    doc = Document(page_content=content, metadata={"id": pub_id, "topic": group})
    documents.append(doc)
    pub_ids.append(pub_id)

def create_embeddings(documents, ids):
    """Embed ``documents`` and add them to the persisted Chroma collection.

    The vectors are produced with the OpenAI ``text-embedding-3-small`` model
    and written to the ``core_publications`` collection stored under
    ``./data/vs/core_publications``.

    Args:
        documents: Documents to embed and store.
        ids: Ids passed to ``add_documents`` alongside ``documents``.

    Returns:
        The ``Chroma`` vector store the documents were added to.
    """
    store = Chroma(
        collection_name="core_publications",
        embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
        persist_directory="./data/vs/core_publications",
    )
    store.add_documents(documents, ids=ids)
    return store
# create_embeddings(documents, pub_ids)

In [163]:
# Open the persisted collection. No embedding function is passed because this
# cell only reads vectors that are already stored.
collection = Chroma("core_publications", persist_directory="./data/vs/core_publications")

# Fetch ids, vectors and metadata in a single call so the three sequences are
# aligned by position, instead of issuing two lookups per publication.
stored = collection.get(include=["embeddings", "metadatas"])
pub_ids = stored["ids"]
labels = [metadata["topic"] for metadata in stored["metadatas"]]
# One row per stored publication. Unlike np.squeeze on per-id results, this
# keeps the row axis even when the collection holds a single document.
all_embeddings = np.asarray(stored["embeddings"])

# One cluster per distinct topic in core_pubs; the seed keeps the fit repeatable.
n_clusters = len(core_pubs["Topic"].unique())
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(all_embeddings)
cluster_labels = kmeans.labels_

In [168]:
import umap

# Project the embeddings to 2-D for plotting. UMAP is stochastic, so the seed
# is fixed (same value as the KMeans fit) to get the same layout on every run.
umap_embeddings = umap.UMAP(min_dist=0.3, random_state=0).fit_transform(
    all_embeddings
)

# NOTE: this rebinds `df`, which earlier held the metadata sheet. Re-running
# the document-building cell after this one needs the metadata reloaded first.
df = pd.DataFrame(umap_embeddings, columns=["x", "y"])
df["label"] = labels
df["cluster"] = cluster_labels
df["pub_id"] = pub_ids
# For survey rows, append the topic listed for that survey in core_pubs
# (matched on the "Survey" column); every other row gets an empty string.
df["survey_group"] = df.apply(
    lambda x: f"{x['label']} - {core_pubs[core_pubs['Survey'] == x['pub_id']]['Topic'].values[0]}"
    if x["label"] == "Survey"
    else "",
    axis=1,
)
# Default every row to "Bibliometric Analysis", then override the rows whose
# label is one of the SLR topics.
df["Type"] = "Bibliometric Analysis"
df.loc[df["label"].isin(SLR_TOPICS), "Type"] = "Systematic Literature Review"

In [169]:
# Split the frame once: topic-labelled points are coloured per topic, while
# survey points are drawn as a separate black trace.
is_survey = df["label"] == "Survey"
survey_points = df[is_survey]

fig = px.scatter(
    df[~is_survey],
    x="x",
    y="y",
    color="label",
    hover_data=["label", "cluster", "pub_id", "survey_group"],
    labels={"label": "Topic"},
)

# Survey trace; hovering shows the "survey_group" text.
survey_trace = go.Scatter(
    x=survey_points["x"],
    y=survey_points["y"],
    mode="markers",
    marker=dict(color="black"),
    name="Survey",
    hoverinfo="text",
    text=survey_points["survey_group"],
)
fig.add_trace(survey_trace)

fig.update_traces(marker_opacity=0.9)
fig.update_layout(
    title="Clustering of Core Publications",
    **PLOT_CONFIGS,
    legend_orientation="h",
    legend_y=-0.2,
)


def _mark_slr_trace(trace):
    """Switch the marker symbol to "x" on traces named after an SLR topic."""
    if trace.name in SLR_TOPICS:
        trace.update(marker=dict(symbol="x"))


# One pass over the traces instead of one pass per SLR topic.
fig.for_each_trace(_mark_slr_trace)
fig.show()
# pio.write_image(fig, "umap_clustering.pdf")