#### The goal of this notebook is to determine how much information are we losing by using UMAP to visualize the data.

In [1]:
from sklearn.metrics import accuracy_score
from litQeval.eval_utils import *
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import plotly.io as pio
import numpy as np
import umap
import json
pio.templates.default = "seaborn"
COLORS = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
          '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
PLOT_CONFIGS = dict(
    title_x=0.5, title_font_size=25, title_font_family="Modern Computer", font_family="Modern Computer",
    xaxis_title="", yaxis_title="", showlegend=True, legend_title="",
    xaxis_tickfont_size=15, yaxis_tickfont_size=15, legend_font_size=20, legend_itemsizing="constant",
    legend_orientation="h", legend_yanchor="bottom", legend_y=-0.3, legend_xanchor="center", legend_x=0.5
)

In [2]:
topics = [i["baseline"] for i in json.load(open('data/queries.json'))]

In [3]:
umap_data = []
embeddings_data = []
for idx, topic in tqdm(enumerate(topics), total=len(topics)):
    data = get_data(topic, None)
    core_pubs = data["core_pubs"]
    core_mean_embedding = data["core_mean_embedding"]
    baseline_pubs = data["baseline_pubs"]
    predicted_pubs = data["predicted_pubs"]
    predicted_vs = data["predicted_vs"]
    baseline_vs = data["baseline_vs"]
    core_vs = data["core_vs"]
    core_embeddings = data["core_embeddings"]
    core_threshold = data["core_threshold"]
    df = pd.concat([baseline_pubs, predicted_pubs])
    df["Source"] = "Predicted"
    df.loc[df.duplicated(subset=["id"]), "Source"] = "Baseline"
    df.drop_duplicates(subset=["id"], inplace=True, keep="last")

    # ids in baseline but not in predicted
    missing_ids = set(baseline_pubs["id"]) - set(predicted_pubs["id"])
    pub_ids = list(set(df["id"].values.tolist()) - missing_ids)
    embeddings_dict = {}
    for i in range(0, len(pub_ids), 25000):
        items = predicted_vs.get(pub_ids[i:i+25000], include=["embeddings"])
        for j in range(len(items["ids"])):
            embeddings_dict[items["ids"][j]] = items["embeddings"][j]

    # get the missing embeddings from baseline_vs
    for i in missing_ids:
        items = baseline_vs.get(i, include=["embeddings"])
        for j in range(len(items["ids"])):
            embeddings_dict[items["ids"][j]] = items["embeddings"][j]

    # sort the df in the same order as embeddings.keys()
    custom_sorting = {k: v for v, k in enumerate(embeddings_dict.keys())}
    df.sort_values(by="id", key=lambda x: x.map(custom_sorting), inplace=True)
    embeddings = np.array([embeddings_dict[i] for i in df["id"]])
    umap_embeddings = umap.UMAP(metric="cosine").fit_transform(np.vstack([core_embeddings, embeddings]))

    umap_data.append([umap_embeddings, topic])
    embeddings_data.append([embeddings, topic])

100%|██████████| 22/22 [04:13<00:00, 11.52s/it]


In [4]:
umap_df = pd.concat(
    [pd.DataFrame(umap, columns=["x", "y"]).assign(topic=topic) for umap, topic in umap_data],
    ignore_index=True
)
embeddings_df = pd.concat(
    [pd.DataFrame(embeddings).assign(topic=topic) for embeddings, topic in embeddings_data],
    ignore_index=True
)

In [5]:
import gc
# clear all prior data
del umap_data, embeddings_data, df, embeddings_dict, core_embeddings, core_pubs, core_mean_embedding, baseline_pubs, predicted_pubs, predicted_vs, baseline_vs, core_vs
gc.collect()

22551

In [8]:
X = embeddings_df.drop(columns="topic")
y = embeddings_df["topic"].astype("category").cat.codes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

clf = XGBClassifier(random_state=42, device="cuda", tree_method="hist", enable_categorical=True)
clf.fit(X_train, y_train, verbose=True)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"The accuracy of the embeddings is {accuracy:.2f}")

The accuracy of the embeddings is 0.83


In [9]:
X = umap_df.drop(columns="topic")
y = umap_df["topic"].astype("category").cat.codes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

clf = XGBClassifier(random_state=42, device="cuda", tree_method="hist", enable_categorical=True)
clf.fit(X_train, y_train, verbose=True)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"The accuracy of the UMAP embeddings is {accuracy:.2f}")

The accuracy of the UMAP embeddings is 0.47
