In [1]:
from utils import save_plot, read_data
from textwrap import shorten
import plotly.express as px
from typing import Literal
import pandas as pd
import polars as pl
import numpy as np
import umap

In [2]:
def models_comparision_results(recall: Literal["10k", "1k"], save: bool = False):
    """Show the model-comparison recall table as a heatmap.

    The ``models_comparision_recall{recall}`` dataset is loaded with
    ``read_data``, indexed by its ``Topic`` column and drawn with
    ``px.imshow``.

    Args:
        recall: Suffix ("10k" or "1k") inserted into the dataset name, the
            figure title and the saved figure name.
        save: If true, the figure and its layout are passed to ``save_plot``
            before the figure is shown.
    """
    recall_table = read_data(f"models_comparision_recall{recall}")
    recall_table = recall_table.set_index("Topic")
    heatmap = px.imshow(recall_table, color_continuous_scale="RdBu", text_auto="auto")

    # One mapping drives both update_layout and save_plot; key order is kept.
    layout = {
        "title_font_size": 20,
        "xaxis_tickfont_size": 15,
        "yaxis_tickfont_size": 15,
        "xaxis_title_text": "Models",
        "yaxis_title_text": "Topic",
        "legend_font_size": 15,
        "yaxis_title_font_size": 20,
        "xaxis_title_font_size": 20,
        "height": 800,
        "width": 1000,
        "title": f"Models Comparative Experiment Recall@{recall} Result",
        "margin": {"l": 250},
    }
    heatmap.update_layout(**layout)

    if save:
        save_plot(heatmap, f"embedding-models-experiment-recall{recall}", layout)
    heatmap.show()
# models_comparision_results("1k")

In [3]:
def evaluation_results(
    save: bool = False,
    synthetic_core: bool = False,
    limit_k: bool = False,
    all_topics: bool = False,
    baseline: bool = False,
):
    """Display the PaperSeek vs. SLR-query results table and optionally export it.

    A results dataset is chosen from the flags, topic names are shortened to
    30 characters, the two result columns are scaled by 100, an "Average" row
    is appended, and the table is displayed with the larger of
    ``PaperSeek`` / ``SLR Query`` highlighted per row.

    Args:
        save: If true, write the final table (after column removal) to
            ``test.tex`` via ``DataFrame.to_latex``.
        synthetic_core: Use the ``synthetic_core_results_*`` dataset
            (ignored when ``baseline`` is true).
        limit_k: Use the ``*_by_k`` dataset instead of ``*_10k`` and keep the
            ``K`` column in the exported table.
        all_topics: Keep rows whose ``slr_query_results`` is 0 or that contain
            missing values; missing values become 0, zero ``SLR Query``
            entries become -1, and ``SLR Query`` is removed before export.
        baseline: Use the ``baseline_results_*`` dataset; takes precedence
            over ``synthetic_core``.
    """
    # `baseline` wins over `synthetic_core`; with neither flag the
    # post-input-optimization results are used.
    if baseline:
        experiment = "baseline"
    elif synthetic_core:
        experiment = "synthetic_core"
    else:
        experiment = "post_input_optimization"

    dataset_names = {
        ("baseline", True): "baseline_results_by_k",
        ("baseline", False): "baseline_results_10k",
        ("synthetic_core", True): "synthetic_core_results_by_k",
        ("synthetic_core", False): "synthetic_core_results_10k",
        ("post_input_optimization", True): "post_input_optimization_results_by_k",
        ("post_input_optimization", False): "post_input_optimization_results_10k",
    }
    df = read_data(dataset_names[experiment, bool(limit_k)])

    if not all_topics:
        has_slr_results = df["slr_query_results"] != 0
        df = df[has_slr_results].dropna()

    def _abbreviate(title):
        return shorten(title, width=30, placeholder="...")

    df["Topic"] = df["Topic"].apply(_abbreviate)

    # Already-shortened names of the topics that are sorted after the others.
    synth_topics = [
        "Pharmacokinetics and...",
        "The rodent object-in-...",
        "Specialized psychotherapies...",
        "Coronary heart disease,...",
        "Long-term Outcomes of...",
        "Patients Retransitioning...",
        "The methodological rigour...",
    ]
    # 1 for synthetic, 0 for real
    df["is_synthetic"] = df["Topic"].isin(synth_topics).astype(int)

    ordered = df.sort_values(["is_synthetic", "Sample Size"], ascending=[True, True])
    table = ordered.rename(
        columns={
            "paperseek_results": "PaperSeek",
            "Sample Size": "K",
            "slr_query_results": "SLR Query",
        }
    ).drop(columns=["is_synthetic"])

    score_columns = ["PaperSeek", "SLR Query"]
    table[score_columns] = table[score_columns].mul(100)

    # Append the column means as a final row labelled "Average".
    average_row = table.mean(numeric_only=True).to_frame().T
    table = pd.concat([table, average_row], ignore_index=True)
    table.loc[table.index[-1], "Topic"] = "Average"

    if all_topics:
        table = table.fillna(0)
        # Zero SLR entries are shown as -1.
        table["SLR Query"] = table["SLR Query"].replace(0, -1)

    integer_columns = ["PaperSeek", "SLR Query", "K"]
    table[integer_columns] = table[integer_columns].round().astype(int)

    styled = table.reset_index(drop=True).style.highlight_max(
        axis=1, subset=["PaperSeek", "SLR Query"], color="green"
    )
    display(styled)

    # Columns shown above but left out of the exported table.
    hidden_columns = [
        column
        for column, hide in (("SLR Query", all_topics), ("K", not limit_k))
        if hide
    ]
    if hidden_columns:
        table = table.drop(columns=hidden_columns)

    if save:
        table.to_latex("test.tex", index=False)

In [4]:
# Synthetic-core results from the 10k dataset (limit_k=False), with every topic kept.
evaluation_results(
    synthetic_core=True,
    all_topics=True,
    limit_k=False,
)

Unnamed: 0,Topic,PaperSeek,SLR Query,K
0,Software Process Line,100,42,167
1,Cerebral Small Vessel...,100,44,982
2,Business Process Meta Models,70,90,1598
3,Data Stream Processing Latency,68,34,1907
4,Bayesian PTSD-Trajectory...,37,-1,6395
5,Cloud Migration,100,32,7909
6,Bayesian Versus Frequentist...,96,-1,10000
7,Comparative Efficacy and...,75,-1,10000
8,Comparing Experimental...,53,9,10000
9,Cross-sectional relation of...,88,-1,10000


In [5]:
def computional_hours(save: bool = False):
    """Plot the hours of compute spent per task as a bar chart.

    The task table is defined inline; bars are coloured by resource type and
    ordered by descending total.

    Args:
        save: If true, pass the figure and its layout to ``save_plot`` under
            the name ``computional_hours``.
    """
    # TODO: Add Qdrant Migration ("Active Learning Simulation" is already listed below).
    title = "Computational Hours"
    df = pd.DataFrame(
        {
            "Task": [
                "OpenAlex Encoding",
                "Embedding Models Comparative Analysis",
                "Query Structure Ablation",
                "CPs Effect Ablation",
                "SLRs Evaluation",
                "Reranking Ablation",
                "Active Learning Simulation",
            ],
            "Time": [221, 40, 1.5, 12.8, 2.6, 25, 1],
            "Type": ["GPU", "GPU", "IO", "IO", "IO", "GPU", "IO"],
        }
    )

    fig = px.bar(
        df,
        x="Task",
        y="Time",
        color="Type",
        title=title,
        text="Time",
    )
    # NOTE(review): ".2s" is a d3 format with two significant digits, so 221
    # is presumably labelled "220" -- confirm that rounding is acceptable.
    fig.update_traces(texttemplate="%{text:.2s}", textposition="outside")
    layout = dict(
        title_font_size=20,
        xaxis_tickfont_size=15,
        yaxis_tickfont_size=15,
        xaxis_title_text="Task",
        yaxis_title_text="Time (Hours)",
        legend_title_text="Resource",
        legend_font_size=15,
        yaxis_title_font_size=20,
        xaxis_title_font_size=20,
        height=800,
        width=1000,
        title=title,
        margin=dict(b=250),
        xaxis=dict(
            categoryorder="total descending",
        ),
    )
    fig.update_layout(**layout)
    if save:
        # The saved name is left unchanged on purpose: other files may
        # reference the figure by this name.
        save_plot(fig, "computional_hours", layout)
    fig.show()
    
# computional_hours()

In [6]:
def cps_performance_distribution(save: bool = False):
    """Draw a box plot of ``recall`` per ``topic`` with every point overlaid.

    Data comes from the ``cp_performance_distribution`` dataset; ``core_id``
    is added to the hover information.

    Args:
        save: If true, pass the figure and its layout to ``save_plot`` before
            showing it.
    """
    recall_by_cp = read_data("cp_performance_distribution")
    box_plot = px.box(
        recall_by_cp,
        x="topic",
        y="recall",
        points="all",
        hover_data="core_id",
    )

    # One mapping drives both update_layout and save_plot; key order is kept.
    layout = {
        "title_font_size": 20,
        "xaxis_tickfont_size": 15,
        "yaxis_tickfont_size": 15,
        "xaxis_title_text": "Topic",
        "yaxis_title_text": "Recall",
        "legend_font_size": 15,
        "yaxis_title_font_size": 20,
        "xaxis_title_font_size": 20,
        "height": 600,
        "width": 1250,
        "xaxis_tickangle": -45,
        "title": "Recall Distribution by CP",
        "margin": {"b": 250, "l": 150},
    }
    box_plot.update_layout(**layout)

    # Small markers with a thin dark outline for the overlaid points.
    point_style = {"size": 5, "line": {"width": 0.2, "color": "DarkSlateGrey"}}
    box_plot.update_traces(marker=point_style)

    if save:
        save_plot(box_plot, "cps_performance_distribution", layout)
    box_plot.show()

# cps_performance_distribution(True)

In [7]:
def recall_over_topn(save: bool = False):
    """Plot ``Recall`` against ``Top N`` with one line per ``CP Type``.

    Data comes from the ``recall_change_over_topn`` dataset; x-axis ticks are
    placed every 1000 from 0 to 10000.

    Args:
        save: If true, pass the figure and its layout to ``save_plot`` before
            showing it.
    """
    recall_curve = read_data("recall_change_over_topn")
    line_plot = px.line(recall_curve, x="Top N", y="Recall", color="CP Type")

    tick_positions = list(range(0, 10001, 1000))
    # One mapping drives both update_layout and save_plot; key order is kept.
    layout = {
        "title_font_size": 20,
        "xaxis_tickfont_size": 15,
        "yaxis_tickfont_size": 15,
        "xaxis_title_text": "K",
        "yaxis_title_text": "Recall",
        "legend_title_text": "CP Type",
        "legend_font_size": 15,
        "yaxis_title_font_size": 20,
        "xaxis_title_font_size": 20,
        "xaxis": {"tickvals": tick_positions},
        "title": "Recall@K for Different CP Types",
    }
    line_plot.update_layout(**layout)

    if save:
        save_plot(line_plot, "recall_over_topn", layout)
    line_plot.show()
# recall_over_topn()

In [8]:
def active_learning(save: bool = False):
    """Plot the active-learning recall curves and display mean recall at fixed K.

    The ``active_learning`` dataset is loaded with ``read_data``; method
    columns are renamed to LaTeX labels and ``Percent Reviewed`` becomes
    ``K`` after scaling by 10000. Every column after the first is plotted
    against ``K`` (assumes ``K`` is the first column -- TODO confirm). The
    mean of those columns is then displayed for K = 1000, 2000, ..., 10000.

    Args:
        save: If true, pass the figure and its layout to ``save_plot``.
    """
    df = read_data("active_learning").rename(
        columns={
            "Multilayer Perceptron": r"$\text{Neural Network}$",
            "Percent Reviewed": "K",
            "Baseline": r"$\text{PaperSeek}_o$",
            "Avg. Vector": r"$\text{Average Vector}$",
            "Asreview": r"$\text{ASReview}$",
            "SVM": r"$\text{SVM}$",
        }
    )
    df["K"] = df["K"] * 10000
    fig = px.line(
        df.loc[::100],  # Plot only every 100th row to smooth the curves
        x="K",
        y=df.columns[1:],
        line_shape="spline",
    )
    layout = dict(
        title_font_size=30,
        xaxis_tickfont_size=20,
        yaxis_tickfont_size=20,
        yaxis_title_text="Recall",
        legend_title_text="Method",
        legend_font_size=25,
        yaxis_title_font_size=25,
        xaxis_title_font_size=25,
        xaxis=dict(
            tickvals=[i for i in range(0, 10001, 1000)],
        ),
        height=600,
        width=1250,
        title="Active Learning Simulation",
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=.07,
            xanchor="center",
        ),
    )
    fig.update_layout(**layout)
    if save:
        # NOTE(review): the figure is saved as "reranking_recall_at_k" even
        # though it shows the active-learning simulation -- confirm the name.
        save_plot(fig, "reranking_recall_at_k", layout)
    fig.show()

    # Mean recall per method at every 1000-step cut-off up to 10000.
    k_values = df["K"].to_numpy()
    for k in range(1000, 10001, 1000):
        print(f"Recall@{k}:")
        # K is the product of a float and 10000, so an exact `==` can miss rows
        # that differ only by rounding noise (e.g. 3000.0000000000005).
        rows_at_k = df[np.isclose(k_values, k, rtol=0, atol=1e-6)]
        display(rows_at_k.iloc[:, 1:].mean(numeric_only=True).to_frame().T.round(3))
# active_learning(False)

In [None]:
from sklearn.cluster import DBSCAN, KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

def lda_to_single_topic(topic:str, keywords: list[str]):
    """Ask an OpenAI chat model for one short topic label covering ``keywords``.

    Args:
        topic: Overall topic, sent to the model as context.
        keywords: Keywords the generated label should encompass; they are
            joined with ", " in the user message.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no text content.
    """
    # Local import: within this file only this helper uses the openai package.
    from openai import OpenAI

    # No arguments are passed, so credentials come from the client's own
    # configuration (presumably the OPENAI_API_KEY environment variable).
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {
                "role": "system",
                "content": """
                Given a topic and a list of keywords, generate a single topic that encompasses the keywords.
                
                Your response should contain the topic and the topic only.
                The topic is at max 6 words long.
                Do not include any additional text or explanations.
                """,
            },
            {
                "role": "user",
                "content": f"Topic: {topic}\nKeywords: {', '.join(keywords)}\nGenerate a single topic that encompasses these keywords.",
            },
        ],
        max_tokens=50,
        temperature=0.7,
    )
    # `content` may be None, in which case `.strip()` would raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()

def umap_figure(save: bool = False, topic: str = "Cloud Migration", random_state: int = 42):
    """Project one topic's document embeddings with UMAP and plot labelled clusters.

    Steps, in order:
      1. Collect the ``id`` values for ``topic`` from
         ``ablation/final/cp_random_hyde_1_run-1.parquet``.
      2. Load ``id``, ``embedding`` and the concatenated title + abstract for
         those ids from ``ablation/active_learning/embeddings.parquet``.
      3. Reduce the embeddings with UMAP and cluster the projected points
         with DBSCAN.
      4. For each cluster, take the 10 highest-weighted terms of a
         one-component LDA fitted on TF-IDF features and ask
         ``lda_to_single_topic`` for a label.
      5. Show a plain scatter plot and one coloured by cluster label.

    Each call issues one ``lda_to_single_topic`` request per cluster.

    Args:
        save: If true, pass both figures to ``save_plot`` (names
            ``umap_projection`` and ``umap_projection_with_topics``).
        topic: Topic used to select documents, as context for the cluster
            labels, and in both figure titles.
        random_state: Seed passed to UMAP and LDA so that the projection, and
            therefore the DBSCAN clusters, are the same on every run.
    """
    # Read and prepare data
    ids = (
        pl.scan_parquet("ablation/final/cp_random_hyde_1_run-1.parquet")
        .filter(pl.col("topic") == topic)
        .select("id")
        .collect()
        .to_numpy()
        .flatten()
        .tolist()
    )
    df = (
        pl.scan_parquet("ablation/active_learning/embeddings.parquet")
        .filter(pl.col("id").is_in(ids)).with_columns(
            docs = pl.concat_str(
                pl.col("title"),
                pl.col("abstract"),
                separator=" ",
            )
        )
        .select("id", "embedding", "docs").collect()
    )
    # Stack the per-row embeddings into one 2-D array for UMAP.
    data = np.vstack(df.select("embedding").to_numpy().flatten())
    # Seeded so repeated runs give the same projection (LDA below is seeded too).
    fit = umap.UMAP(random_state=random_state)
    u = fit.fit_transform(data)
    df = df.with_columns(x=u[:,0], y=u[:,1])
    clustering = DBSCAN(eps=0.5, min_samples=5).fit(df.select("x", "y").to_numpy())
    # Labels are stored as strings so they can later be replaced by topic names.
    df = df.with_columns(cluster=pl.Series(clustering.labels_, dtype=pl.Utf8))


    # Process Figures
    cluster_topic_mapping = {}
    # NOTE(review): every distinct label gets a topic, presumably including
    # DBSCAN's noise label -- confirm that this is intended.
    for cluster in df.select("cluster").unique().to_series().to_list():
        cluster_docs = df.filter(pl.col("cluster") == cluster).select("docs").to_series().to_list()
        vectorizer = TfidfVectorizer(stop_words="english")
        tfidf_matrix = vectorizer.fit_transform(cluster_docs)
        lda = LDA(n_components=1, random_state=random_state)
        lda.fit(tfidf_matrix)
        feature_names = vectorizer.get_feature_names_out()
        # The 10 terms with the largest weight in the single LDA component.
        topic_keywords = np.array(feature_names)[np.argsort(lda.components_[0])[-10:]]
        keywords = topic_keywords.tolist()
        single_topic = lda_to_single_topic(topic, keywords)
        cluster_topic_mapping[cluster] = single_topic

    fig = px.scatter(
        df.to_pandas(),
        x="x",
        y="y",
        title=f"UMAP Projection of Embeddings ({topic})",
    )

    # Swap the cluster ids for their generated topic labels before colouring.
    df = df.with_columns(cluster=pl.col("cluster").replace(cluster_topic_mapping))
    fig_with_topics = px.scatter(
        df.to_pandas(),
        x="x",
        y="y",
        color="cluster",
        title=f"Topic Clusters in UMAP Space ({topic})"
        )

    layout = dict(
        title_font_size=20,
        xaxis_tickfont_size=15,
        yaxis_tickfont_size=15,
        xaxis_title_text="",
        yaxis_title_text="",
        yaxis_title_font_size=20,
        xaxis_title_font_size=20,
        height=800,
        width=1000,
        xaxis_showticklabels=False,
        yaxis_showticklabels=False,
        legend_orientation="h",
        legend_yanchor="bottom",
        legend_y=-0.2,
        legend_xanchor="center",
        legend_x=0.5,
        legend_title="Topic Clusters in UMAP Space",
        legend_title_text="Topics",
        legend_font_size=11,
        legend_itemsizing="constant",
    )
    marker=dict(
        size=3,
        opacity=0.8,
    )
    fig.update_layout(**layout)
    fig.update_traces(marker=marker)

    fig_with_topics.update_layout(**layout)
    fig_with_topics.update_traces(marker=marker)
    fig.show()
    fig_with_topics.show()

    if save:
        save_plot(fig, "umap_projection", layout)
        save_plot(fig_with_topics, "umap_projection_with_topics", layout)


# Render the UMAP figures without saving them.
umap_figure(save=False)