 #### The goal of this notebook is to create embeddings for the core publications and visualize their semantic meaning in the embedding space

In [28]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document
from langchain_chroma import Chroma
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from textwrap import shorten
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import pandas as pd
import numpy as np
import umap

# Make "seaborn" the default Plotly template for figures created in this notebook.
pio.templates.default = "seaborn"
# Twenty hex colours used as the discrete colour sequence for the topic traces.
COLORS = [
    "#1f77b4",
    "#ff7f0e",
    "#2ca02c",
    "#d62728",
    "#9467bd",
    "#8c564b",
    "#e377c2",
    "#7f7f7f",
    "#bcbd22",
    "#17becf",
    "#ff33cc",
    "#00cc99",
    "#ffcc00",
    "#3399ff",
    "#9933cc",
    "#66ff66",
    "#ff0066",
    "#669999",
    "#996633",
    "#00cccc",
]

# Layout keyword arguments that are unpacked into `fig.update_layout(...)`.
PLOT_CONFIGS = {
    # Title and fonts
    "title_x": 0.5,
    "title_font_size": 30,
    "title_font_family": "Modern Computer",
    "font_family": "Modern Computer",
    # Axis titles are blanked; the legend is shown without a heading
    "xaxis_title": "",
    "yaxis_title": "",
    "showlegend": True,
    "legend_title": "",
    # Font sizes
    "xaxis_tickfont_size": 5,
    "yaxis_tickfont_size": 5,
    "legend_font_size": 15,
    "xaxis_title_font_size": 10,
    "yaxis_title_font_size": 10,
    # Figure size in pixels
    "width": 750,
    "height": 400,
    # Optional horizontal legend below the plot (disabled):
    # legend_orientation="h", legend_yanchor="bottom", legend_y=-0.5, legend_xanchor="center", legend_x=0.5
}

# Topic labels that are tagged as "Systematic Literature Review" and drawn with
# an "x" marker further down in the notebook.
SLR_TOPICS = [
    "Software Process Line",
    "Data Stream Processing Latency",
    "Business Process Meta Models",
    "Multicore Performance Prediction",
    "Cloud Migration",
    "Software Fault Prediction Metrics",
    "Software Defect Prediction",
]


In [29]:
# Input spreadsheets, relative to the notebook's working directory.
METADATA_PATH = "./data/metadata.xlsx"
CORE_PUBLICATIONS_PATH = "./data/core_publications.xlsx"

# `df`: publication metadata; later cells read its "id", "title" and "abstract" columns.
df = pd.read_excel(METADATA_PATH)
# `core_pubs`: core-publication sheet; later cells read "Pub_id", "Topic" and "Survey".
core_pubs = pd.read_excel(CORE_PUBLICATIONS_PATH)

In [30]:
# Build one LangChain Document per row of `df`. Each document carries the
# publication id and its topic from `core_pubs`; ids that are not listed in
# `core_pubs["Pub_id"]` are tagged "Survey".

# Pub_id -> Topic lookup, built once instead of filtering `core_pubs` for every
# row. drop_duplicates keeps the first row per Pub_id, so the first match wins;
# rows with a missing Pub_id can never match and are dropped.
topic_by_pub_id = (
    core_pubs.dropna(subset=["Pub_id"])
    .drop_duplicates(subset="Pub_id")
    .set_index("Pub_id")["Topic"]
    .to_dict()
)

documents = []
pub_ids = []
for _, row in df.iterrows():
    pub_id = row["id"]
    topic = topic_by_pub_id.get(pub_id, "Survey")
    # NOTE(review): a missing title/abstract is formatted as the literal "nan"
    # here - confirm whether such rows should be embedded at all.
    content = f'Title: {row["title"]}\nAbstract: {row["abstract"]}'
    documents.append(
        Document(page_content=content, metadata={"id": pub_id, "topic": topic})
    )
    pub_ids.append(pub_id)

def create_embeddings(documents, ids):
    """Embed `documents` and add them to the persisted "core_publications" Chroma collection.

    Args:
        documents: documents handed to `Chroma.add_documents`.
        ids: ids forwarded to `Chroma.add_documents` for those documents.

    Returns:
        The `Chroma` vector store the documents were added to.
    """
    store = Chroma(
        collection_name="core_publications",
        embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
        persist_directory="./data/vs/core_publications",
    )
    store.add_documents(documents, ids=ids)
    return store


# Disabled call: uncomment to (re)populate the vector store.
# create_embeddings(documents, pub_ids)

In [31]:
# Load the stored vectors and topic labels, cluster the vectors with k-means,
# and project them to 2-D with UMAP for plotting.
collection = Chroma("core_publications", persist_directory="./data/vs/core_publications")

# A single `get` returns ids, vectors and metadata as index-aligned lists,
# replacing two store queries per publication.
stored = collection.get(include=["embeddings", "metadatas"])
pub_ids = stored["ids"]
labels = [metadata["topic"] for metadata in stored["metadatas"]]
# Already (n_publications, embedding_dim); no squeeze needed, so the sample
# axis survives even when only one document is stored.
all_embeddings = np.asarray(stored["embeddings"])

# One cluster per distinct value in the "Topic" column (a missing value counts
# as its own value, matching `len(Series.unique())`).
n_clusters = core_pubs["Topic"].nunique(dropna=False)
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(all_embeddings)
cluster_labels = kmeans.labels_

# A fixed random_state makes UMAP emit an n_jobs notice (see the cell output).
umap_embeddings = umap.UMAP(metric="cosine", random_state=0).fit_transform(
    all_embeddings
)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [32]:
# Assemble the plotting frame: UMAP coordinates plus topic label, k-means
# cluster and publication id, joined with the publication metadata in `df`.
plot_df = pd.DataFrame(umap_embeddings, columns=["x", "y"])
plot_df["label"] = labels
plot_df["cluster"] = cluster_labels
plot_df["pub_id"] = pub_ids
plot_df = plot_df.merge(df, left_on="pub_id", right_on="id")

# Survey id -> Topic lookup, built once instead of filtering `core_pubs` for
# every row. drop_duplicates keeps the first row per survey id (first match wins).
topic_by_survey_id = (
    core_pubs.dropna(subset=["Survey"])
    .drop_duplicates(subset="Survey")
    .set_index("Survey")["Topic"]
    .to_dict()
)

# Hover text for survey points: "Survey - <topic>"; empty for all other rows.
# A survey id with no entry in `core_pubs["Survey"]` falls back to the bare
# "Survey" label instead of raising IndexError from `.values[0]`.
is_survey = plot_df["label"] == "Survey"
survey_topics = plot_df.loc[is_survey, "pub_id"].map(topic_by_survey_id)
plot_df["survey_group"] = ""
plot_df.loc[is_survey, "survey_group"] = (
    "Survey - " + survey_topics.astype(str)
).where(survey_topics.notna(), "Survey")

# Rows whose label is one of SLR_TOPICS are systematic literature reviews;
# every other row keeps the default type.
plot_df["Type"] = "Bibliometric Analysis"
plot_df.loc[plot_df["label"].isin(SLR_TOPICS), "Type"] = "Systematic Literature Review"

In [33]:

# Format abstracts for the hover box: wrap at 150 characters, turn the line
# breaks into HTML <br> tags, then cap the text at 600 characters.
# The unformatted text is kept in "abstract_raw" so that re-running this cell
# formats the original abstract instead of re-wrapping its own output.
if "abstract_raw" not in plot_df.columns:
    plot_df["abstract_raw"] = plot_df["abstract"]
plot_df["abstract"] = (
    plot_df["abstract_raw"]
    .astype(str)
    .str.wrap(150)
    .str.replace("\n", "<br>", regex=False)
    .map(lambda text: shorten(text, width=600))
)

# Split once: topic publications are coloured by label, surveys are drawn in black.
is_survey = plot_df["label"] == "Survey"
topic_points = plot_df[~is_survey]
survey_points = plot_df[is_survey]

fig = px.scatter(
    topic_points,
    x="x",
    y="y",
    color="label",
    hover_data=["title", "abstract"],
    labels={"label": "Topic"},
    color_discrete_sequence=COLORS,
)

# Surveys get their own trace; the hover box shows only the "survey_group" text.
fig.add_trace(
    go.Scatter(
        x=survey_points["x"],
        y=survey_points["y"],
        mode="markers",
        marker=dict(color="black"),
        name="Survey",
        hoverinfo="text",
        text=survey_points["survey_group"],
    )
)
fig.update_traces(marker_size=5)
fig.update_layout(
    title="Clustering of Core Publications",
    **PLOT_CONFIGS,
    legend_orientation="h",
    legend_y=0,
    xaxis_showticklabels=False,
    yaxis_showticklabels=False,
)

# Draw SLR topics with an "x" marker: one pass over the traces, touching only
# the traces whose name is an SLR topic.
slr_topics = set(SLR_TOPICS)
fig.for_each_trace(
    lambda trace: trace.update(marker=dict(symbol="x"))
    if trace.name in slr_topics
    else None
)
fig.show()
# Static export (disabled):
# pio.write_image(fig, "LitQEval-report/pics/umap_clustering.pdf", width=1250)