In [None]:
import os
from uuid import uuid4
from glob import glob
from pathlib import Path
from collections import defaultdict
from typing import Generator, Literal, Optional, Sequence

from asreview import ASReviewData, ASReviewProject, open_state  # noqa: E402
from asreview.models.classifiers import NaiveBayesClassifier  # noqa: E402
from asreview.models.feature_extraction import Tfidf  # noqa: E402
from asreviewcontrib.insights.metrics import recall  # noqa: E402
from asreview.models.balance import DoubleBalance  # noqa: E402
from asreview.models.query import MaxQuery  # noqa: E402
from asreview.review import ReviewSimulate  # noqa: E402

import torch
import numpy as np
import polars as pl
from torch import nn
from tqdm.auto import tqdm
from sklearnex import patch_sklearn, set_config
# gpu
set_config(target_offload="gpu")
patch_sklearn()
from sklearn.svm import SVC  # noqa: E402
from sentence_transformers import util # noqa: E402
from lightgbm.sklearn import LGBMClassifier # noqa: E402
from catboost import CatBoostClassifier, Pool  # noqa: E402
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  # noqa: E402

from notebooks.ablations.utils import TOPICS_UNDER_5K, TOPICS_UNDER_10K # noqa: E402
from utils import DataReader # noqa: E402



In [None]:
class MLPClassifier(nn.Module):
    """Small feed-forward binary classifier with a scikit-learn-like API.

    The network consists of two ``Linear -> ReLU -> Dropout`` stages followed
    by a linear output layer; ``forward`` passes the result through a sigmoid.
    The module owns its BCE loss and Adam optimizer and places itself on CUDA
    when ``torch.cuda.is_available()`` is true, otherwise on the CPU.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_dim : int
        Width of both hidden layers.
    output_dim : int
        Number of output units.
    dropout_rate : float
        Dropout probability used after each hidden activation.
    weight_decay_rate : float
        ``weight_decay`` passed to the Adam optimizer.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int = 64,
        output_dim: int = 1,
        dropout_rate: float = 0.5,
        weight_decay_rate: float = 1e-3,
    ):
        super().__init__()
        # Layers are instantiated in network order so parameter
        # initialisation consumes the RNG in a fixed sequence.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.model = nn.Sequential(*layers)
        self.loss_fn = nn.BCELoss()
        self.sigmoid = nn.Sigmoid()
        self.optimizer = torch.optim.Adam(
            self.parameters(), weight_decay=weight_decay_rate
        )
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.to(self.device)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the sigmoid of the network output for ``x``."""
        logits = self.model(x)
        return self.sigmoid(logits)

    def fit(self, x: np.ndarray, y: np.ndarray, epochs=150):
        """Train on the full batch ``(x, y)`` for ``epochs`` optimizer steps.

        Returns ``self`` so calls can be chained.
        """
        self.train()
        features = torch.tensor(x, dtype=torch.float32).to(self.device)
        # Targets become a column so they line up with the network output.
        targets = torch.tensor(y, dtype=torch.float32).to(self.device).view(-1, 1)
        for _ in range(epochs):
            self.optimizer.zero_grad()
            loss = self.loss_fn(self.forward(features), targets)
            loss.backward()
            self.optimizer.step()
        return self

    def predict_proba(self, x: np.ndarray) -> np.ndarray:
        """Return the flattened sigmoid outputs for ``x`` as a NumPy array."""
        self.eval()
        features = torch.tensor(x, dtype=torch.float32).to(self.device)
        with torch.no_grad():
            probabilities = self.forward(features)
        return probabilities.cpu().numpy().flatten()


In [None]:
# Evaluation sheet; get_topic_data and simulate filter it by topic.
eval_df = pl.read_excel("data/eval_cps.xlsx")
# Retrieval output with (at least) the columns "topic", "id" and "score".
data = pl.read_parquet("ablation/hyde/best/All_HYDE0.parquet")

# topic -> {"id": [...], "score": [...]} for the rows retrieved for that topic.
topic_items = {}
for topic in data.select("topic").unique().to_numpy().flatten():
    # Filter once per topic and read both columns from the same subset.
    topic_rows = data.filter(pl.col("topic") == topic)
    topic_items[topic] = {
        column: topic_rows.select(column).to_numpy().flatten().tolist()
        for column in ("id", "score")
    }

In [None]:
def get_data(df: pl.DataFrame, new=False):
    """Return id/embedding/title/abstract rows for the ids present in ``df``.

    Parameters
    ----------
    df : pl.DataFrame
        Frame with an "id" column listing the publications to fetch.
    new : bool
        When False (default) a previously written parquet cache is returned if
        it exists, and a freshly built frame is written to that cache.
        When True the cache is neither read nor written.

    Returns
    -------
    pl.DataFrame
        Columns "id", "embedding", "title" and "abstract".

    Notes
    -----
    NOTE(review): the cache is identified by its path only, so an existing
    cache file is returned as-is even when ``df`` lists different ids.
    """
    path = "ablation/active_learning/embeddings.parquet"
    if os.path.exists(path) and not new:
        return pl.read_parquet(path)

    work_ids = list(set(df.select("id").to_numpy().flatten().tolist()))
    reader = DataReader()
    columns = ("id", "embedding", "title", "abstract")
    results = {column: [] for column in columns}
    for batch in reader.scan_batches():
        items = (
            batch.filter(pl.col("id").is_in(work_ids))
            .select(*columns)
            .collect()
        )
        for column in columns:
            results[column].extend(
                items.select(column).to_numpy().flatten().tolist()
            )

    embeddings_df = pl.DataFrame(results)
    if not new:
        # Write to the same location that is checked above, creating the
        # directory first so a fresh checkout does not fail on the write.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        embeddings_df.write_parquet(path)
    return embeddings_df

In [None]:
def get_topic_data(df: pl.DataFrame, topic: str, topic_items: dict[dict]) -> pl.DataFrame:
    """Build the labelled, score-sorted frame for one topic.

    Rows of ``df`` whose "id" is listed in ``topic_items[topic]["id"]`` are
    kept, given a 0/1 "label" column (1 when the id appears in the module-level
    ``eval_df`` for this topic), de-duplicated on ("title", "abstract"), joined
    with the topic's scores, and sorted by "score" descending. The "id" column
    is renamed to "url" in the result.
    """
    positive_ids = (
        eval_df.filter(pl.col("topic") == topic).select("id").to_numpy().flatten().tolist()
    )
    retrieved = topic_items[topic]

    label_expr = (
        pl.when(pl.col("id").is_in(positive_ids)).then(1).otherwise(0).alias("label")
    )
    labelled = (
        df.filter(pl.col("id").is_in(retrieved["id"]))
        .with_columns(label_expr)
        .unique(["title", "abstract"])
    )

    scores = pl.DataFrame({"id": retrieved["id"], "score": retrieved["score"]})
    with_scores = labelled.join(scores, "id").rename({"id": "url"})
    return with_scores.sort("score", descending=True)

def parse_data(
    df: pl.DataFrame, topics: dict[dict[str, list[str]]] = topic_items
) -> Generator[tuple[str, pl.DataFrame, tuple, tuple, tuple], None, None]:
    """
    Yield, per topic, the labelled frame and its (id, embedding) pairs.

    Parameters
    ----------
    df : pl.DataFrame
        Frame passed to ``get_topic_data`` for every topic.
    topics : dict
        Mapping iterated for topic names and forwarded to ``get_topic_data``.
        Defaults to the module-level ``topic_items`` as it was when this
        function was defined.

    Yields
    ------
    tuple
        ``(topic, topic_df, corpus, positive_pubs, negative_pubs)`` where the
        last three are tuples of ``(url, embedding)`` pairs for all rows, the
        rows with ``label == 1`` and the rows with ``label == 0`` respectively.

    Notes
    -----
    ``np.vstack`` is applied to each subset's embeddings, so a topic without
    any positive or without any negative rows is not supported.
    """

    def _id_vector_pairs(frame):
        # Stack embeddings first, then pair each row vector with its url.
        vectors = np.vstack(frame.select("embedding").to_numpy().flatten())
        urls = frame.select("url").to_numpy().flatten().tolist()
        return tuple(zip(urls, vectors))

    progress = tqdm(topics)
    for topic in progress:
        progress.set_description(topic)
        topic_df = get_topic_data(df, topic, topics)

        corpus = _id_vector_pairs(topic_df)
        positive_pubs = _id_vector_pairs(topic_df.filter(pl.col("label") == 1))
        negative_pubs = _id_vector_pairs(topic_df.filter(pl.col("label") == 0))

        yield topic, topic_df, corpus, positive_pubs, negative_pubs


In [None]:
def average_vector_strategy(
    positive_vectors: Sequence[np.ndarray],
    negative_vectors: Optional[Sequence[np.ndarray]] = None
) -> np.ndarray:
    """Build a query vector from labelled example vectors.

    Parameters
    ----------
    positive_vectors : sequence of vectors or np.ndarray
        One vector (1-D) or several vectors (list of 1-D arrays / 2-D array).
    negative_vectors : sequence of vectors, np.ndarray or None
        Same layout as ``positive_vectors``; ``None`` disables the negative
        term.

    Returns
    -------
    np.ndarray
        The mean positive vector, or ``2 * mean(positive) - mean(negative)``
        when negatives are given.

    Raises
    ------
    ValueError
        If ``positive_vectors`` is empty, or ``negative_vectors`` is given but
        empty.
    """
    if len(positive_vectors) == 0:
        raise ValueError("At least one positive vector is required.")

    # asarray accepts plain lists of vectors as well as ndarrays; atleast_2d
    # turns a single 1-D vector into one row so the mean is taken over rows.
    avg_pos = np.atleast_2d(np.asarray(positive_vectors)).mean(axis=0)

    if negative_vectors is None:
        return avg_pos

    if len(negative_vectors) == 0:
        raise ValueError("Negative vectors sequence provided but empty.")

    avg_neg = np.atleast_2d(np.asarray(negative_vectors)).mean(axis=0)
    return (avg_pos * 2) - avg_neg

In [None]:
def get_similiar_pubs(
    query_vector: np.ndarray,
    search_embeddings: np.ndarray,
    search_ids: list[str],
) -> str | None:
    hits = util.semantic_search(query_vector, search_embeddings, top_k=1)
    max_idx = hits[0][0]["corpus_id"]
    return search_ids[max_idx]

In [None]:
# Module-level SVC shared by the "svm" branch of predict_next_hit, which refits
# it only on some calls and otherwise predicts with the previous fit.
SVM = SVC(probability=True, verbose=False)

def predict_next_hit(
    positive_vectors: Sequence[np.ndarray],
    negative_vectors: Optional[Sequence[np.ndarray]],
    search_embeddings: np.ndarray,
    search_ids: list[str],
    prediction_method: Literal["catboost", "mlp", "lgbm", "avg-vector", "svm", "lda"] = "avg-vector",
):
    """Pick the id of the candidate most likely to be relevant.

    Parameters
    ----------
    positive_vectors, negative_vectors
        Embeddings of the publications labelled so far. ``negative_vectors``
        may be ``None`` only for ``"avg-vector"``.
    search_embeddings : np.ndarray
        Embeddings of the unlabelled candidates, one row per candidate.
    search_ids : list[str]
        Candidate ids aligned with the rows of ``search_embeddings``.
    prediction_method : str
        ``"avg-vector"`` runs a similarity search with the averaged query
        vector; every other supported value trains a classifier on the
        labelled vectors and returns the candidate with the highest predicted
        probability for class 1.

    Returns
    -------
    The element of ``search_ids`` selected by the chosen method.

    Raises
    ------
    ValueError
        If ``prediction_method`` is not supported, or a classifier method is
        requested without negative vectors.
    """
    # The similarity strategy needs no training set, so handle it before any
    # stacking; this is what allows negative_vectors to be None here.
    if prediction_method == "avg-vector":
        query_vector = average_vector_strategy(positive_vectors, negative_vectors)
        return get_similiar_pubs(query_vector, search_embeddings, search_ids)

    if prediction_method not in ("catboost", "mlp", "lgbm", "svm", "lda"):
        raise ValueError(f"Unsupported model type: {prediction_method}")
    if negative_vectors is None:
        raise ValueError(
            f"Negative vectors are required for prediction method: {prediction_method}"
        )

    # atleast_2d makes a single 1-D vector count as one sample, keeping the
    # label list the same length as the stacked training matrix.
    pos = np.atleast_2d(np.asarray(positive_vectors))
    neg = np.atleast_2d(np.asarray(negative_vectors))
    train_x = np.vstack([pos, neg])
    train_y = ([1] * pos.shape[0]) + ([0] * neg.shape[0])

    if prediction_method == "catboost":
        train_pool = Pool(train_x, train_y)
        model = CatBoostClassifier(iterations=50, verbose=False)
        model.fit(train_pool)
        y_preds = model.predict_proba(search_embeddings)[:, 1].flatten()

    elif prediction_method == "mlp":
        # Size the input layer from the data instead of assuming 1024 dims.
        model = MLPClassifier(input_dim=train_x.shape[1])
        model = model.fit(train_x, train_y)
        y_preds = model.predict_proba(search_embeddings)

    elif prediction_method == "lgbm":
        model = LGBMClassifier(verbose=-1)
        model.fit(train_x, train_y)
        y_preds = model.predict_proba(search_embeddings)[:, 1].flatten()

    elif prediction_method == "svm":
        # The shared SVM is refit on the first two labels and then every tenth
        # label; it is also fit whenever it has never been fitted, so the
        # predict_proba call below cannot hit an unfitted estimator.
        n_labelled = train_x.shape[0]
        if n_labelled % 10 == 0 or n_labelled == 2 or not hasattr(SVM, "classes_"):
            SVM.fit(train_x, train_y)
        y_preds = SVM.predict_proba(search_embeddings)[:, 1].flatten()

    else:  # "lda"
        model = LinearDiscriminantAnalysis()
        model.fit(train_x, train_y)
        y_preds = model.predict_proba(search_embeddings)[:, 1].flatten()

    best_idx = np.argmax(y_preds)
    return search_ids[best_idx]

In [None]:
def simulate(
    positive_pubs: list[tuple[str, np.ndarray]],
    negative_pubs: list[tuple[str, np.ndarray]],
    corpus: list[tuple[str, np.ndarray]],
    topic: str,
    max_iterations: Optional[int] = 10_000,
    predicition_method: Literal["avg-vector", "catboost", "mlp", "lgbm", "svm", "lda"] = "avg-vector",
    verbose: bool = False,
) -> list[float]:
    """Simulate an active-learning review of ``corpus`` for one topic.

    The review starts with the first positive and the first negative
    publication as labelled seeds. On every iteration ``predict_next_hit``
    selects one unlabelled publication, which is then added to the negative
    set if its id is among ``negative_pubs`` and to the positive set
    otherwise.

    Parameters
    ----------
    positive_pubs, negative_pubs, corpus
        Sequences of ``(id, embedding)`` pairs. Both ``positive_pubs`` and
        ``negative_pubs`` must be non-empty.
    topic : str
        Topic name; the number of rows of the module-level ``eval_df`` for
        this topic is the recall denominator.
    max_iterations : int or None
        Upper bound on iterations and target length of the returned curve.
        ``None`` means no bound and no padding.
    predicition_method : str
        Forwarded to ``predict_next_hit`` as ``prediction_method``.
    verbose : bool
        Print the seeds and a progress line every 50 iterations. The start
        and final summary lines are printed regardless.

    Returns
    -------
    list[float]
        Recall after each iteration. If the review ends before
        ``max_iterations`` the curve is extended with the final recall, which
        is below 1.0 when the corpus holds fewer positives than ``eval_df``
        lists for the topic.
    """
    # --- Pre-processing ---
    corpus_dict = {item[0]: item[1] for item in corpus}

    positive_target_ids = {pub[0] for pub in positive_pubs}
    negative_target_ids = {pub[0] for pub in negative_pubs}

    # Seed both labelled sets with their first element.
    selected_positive_set = {positive_pubs[0][0]}
    selected_negative_set = {negative_pubs[0][0]}

    # Everything not yet labelled is a candidate.
    available_ids = set(corpus_dict) - selected_positive_set - selected_negative_set
    results = []
    idx = 0

    topic_actual_cps = eval_df.filter(pl.col("topic") == topic).shape[0]
    print(f"Starting stimulation. Target: {topic_actual_cps} positive pubs.")
    if len(positive_target_ids) != topic_actual_cps:
        print("The Top@K retreived CPs is not equal to the actual CPS. Consider increasing K.")
        print(f"Target: {len(positive_target_ids)} CPS. Actual: {topic_actual_cps} CPS.")

    if verbose:
        print(f"Initial positive: {selected_positive_set}")
        print(f"Initial negative: {selected_negative_set}")
        print(f"Available for search: {len(available_ids)} pubs.")

    # Stop when every retrievable positive has been found; the extra
    # available_ids check prevents stacking an empty candidate list.
    while available_ids and not positive_target_ids.issubset(selected_positive_set):
        positive_vectors = np.vstack([corpus_dict[pid] for pid in selected_positive_set])
        negative_vectors = np.vstack([corpus_dict[nid] for nid in selected_negative_set])

        search_ids = list(available_ids)
        search_embeddings = np.vstack([corpus_dict[sid] for sid in search_ids])

        hit = predict_next_hit(
            positive_vectors,
            negative_vectors,
            search_embeddings,
            search_ids,
            prediction_method=predicition_method,
        )

        available_ids.remove(hit)

        if hit in negative_target_ids:
            selected_negative_set.add(hit)
        else:
            selected_positive_set.add(hit)

        idx += 1

        # Named current_recall so the imported recall() is not shadowed.
        current_recall = len(selected_positive_set) / topic_actual_cps
        results.append(current_recall)

        if verbose and idx % 50 == 0:
            print(
                f"Iteration: {idx}, Recall: {round(current_recall, 3)}, Available: {len(available_ids)}"
            )

        if max_iterations is not None and idx >= max_iterations:
            print(f"Stopping: Reached max iterations ({max_iterations})")
            break

    print(f"Final Pos Found: {len(selected_positive_set)}, Iterations: {idx}")

    # Pad short curves with the recall actually reached. This equals 1.0 only
    # when all CPs listed for the topic were present in the corpus.
    if max_iterations is not None and len(results) < max_iterations:
        final_recall = len(selected_positive_set) / topic_actual_cps
        results.extend([final_recall] * (max_iterations - len(results)))

    return results

In [None]:
def simulate_baseline(
    positive_pubs: list[tuple[str, np.ndarray]],
    corpus: list[tuple[str, np.ndarray]],
    n_steps: int = 10_000,
) -> list[float]:
    """Recall curve obtained by reading ``corpus`` in its given order.

    Parameters
    ----------
    positive_pubs : list of (id, embedding)
        Relevant publications; ``len(positive_pubs)`` is the recall
        denominator.
    corpus : list of (id, embedding)
        Publications in the order they are reviewed.
    n_steps : int
        Length of the returned curve (default 10 000).

    Returns
    -------
    list[float]
        Element ``i`` is the fraction of positives whose id occurs among the
        first ``i`` corpus entries, so element 0 is always 0.0. Once the
        corpus is exhausted the value stays constant.

    Raises
    ------
    ZeroDivisionError
        If ``positive_pubs`` is empty.
    """
    n_cps = len(positive_pubs)
    positive_ids = {pub[0] for pub in positive_pubs}
    corpus_ids = [pub[0] for pub in corpus]

    # Single pass with a running set of found positives instead of
    # re-scanning the first i entries on every step.
    found = set()
    results = []
    for step in range(n_steps):
        if 0 < step <= len(corpus_ids):
            candidate = corpus_ids[step - 1]
            if candidate in positive_ids:
                found.add(candidate)
        results.append(len(found) / n_cps)

    return results

In [None]:
def simulate_asreview(topic_df: pl.DataFrame) -> list[float]:
    """Run an ASReview simulation on one topic and return its recall curve.

    The "url", "title", "abstract" and "label" columns of ``topic_df`` are
    written to a CSV inside a fresh project directory under ``tmp_data``. The
    simulation uses a Naive Bayes classifier, TF-IDF features, max querying
    and double balancing, with one prior included and one prior excluded
    record and ``n_instances=10``. After the review the project is exported
    and ``recall`` is evaluated on the exported state at 10000 evenly spaced
    values between 0 and 1.

    NOTE(review): the project directories created under ``tmp_data`` are not
    removed by this function.
    """
    classifier = NaiveBayesClassifier()
    query_strategy = MaxQuery()
    balancer = DoubleBalance()
    features = Tfidf()

    workspace = Path("tmp_data")
    workspace.mkdir(exist_ok=True)

    # A random id keeps repeated simulations in separate directories.
    run_id = str(uuid4())
    project_root = workspace / run_id
    simulation_dir = project_root / "api_simulation"
    csv_path = simulation_dir / "data" / "topic_data.csv"
    export_path = project_root / "results.asreview"

    project = ASReviewProject.create(
        project_path=simulation_dir,
        project_id=run_id,
        project_mode="simulate",
        project_name=run_id,
    )
    topic_df.select("url", "title", "abstract", "label").write_csv(csv_path)
    dataset = ASReviewData.from_file(csv_path)

    reviewer = ReviewSimulate(
        as_data=dataset,
        model=classifier,
        query_model=query_strategy,
        balance_model=balancer,
        feature_model=features,
        n_instances=10,
        project=project,
        n_prior_included=1,
        n_prior_excluded=1,
    )
    reviewer.review()
    project.mark_review_finished()
    project.export(export_path)

    with open_state(export_path) as state_obj:
        grid = np.linspace(0, 1, 10000)
        recalls = [recall(state_obj, point) for point in tqdm(grid, leave=False)]

    return recalls

In [None]:
def run():
    """Simulate every topic with the enabled methods and collect the curves.

    Returns a dict mapping a display name (currently only "SVM") to a list
    holding one recall curve per topic, in the order ``parse_data`` yields
    the topics. Uses the module-level ``data`` frame as input to ``get_data``.
    """
    # Internal method key -> display name used as the result key.
    method_names = {
        "mlp": "Multilayer Perceptron",
        "avg-vector": "Avg. Vector",
        "svm": "SVM",
    }
    # Only the SVM strategy is enabled; the "Baseline", "Avg. Vector",
    # "Asreview" and "Multilayer Perceptron" series are currently switched off.
    enabled_methods = ["svm"]

    embeddings_df = get_data(data)
    results = {"SVM": []}

    for topic, topic_df, corpus, positive_pubs, negative_pubs in parse_data(embeddings_df):
        for method in enabled_methods:
            curve = simulate(
                positive_pubs,
                negative_pubs,
                corpus,
                topic,
                predicition_method=method,
                verbose=True,
            )
            results[method_names[method]].append(curve)

    return results

# NOTE(review): this import sits outside the notebook's top import cell.
from utils import save_data
# One recall curve per topic for each enabled method.
results = run()
# Average the curves position-wise across topics for each method.
aggergated_results = {k: np.mean(v, axis=0) for k, v in results.items()}
# The row index column is labelled "Percent Reviewed"; the curves returned by
# simulate hold one value per iteration — TODO confirm the intended unit.
plot_df = pl.DataFrame(aggergated_results).with_row_index("Percent Reviewed").to_pandas()
save_data(plot_df, "active_learnning_svm")

In [None]:
def simulate_under_5k(files: list[str]):
    """Run the SVM simulation on each file for the topics in TOPICS_UNDER_5K.

    For every parquet file, the rows of each topic are ranked by their order
    in the file (first row gets the highest score), the topic is simulated
    with ``max_iterations`` set to ``TOPICS_UNDER_5K[topic]``, and the highest
    recall within that many iterations is recorded.

    Returns a ``defaultdict`` mapping each topic to a list with one such value
    per file.

    NOTE(review): ``get_data`` is called with ``new=False`` and therefore may
    return its cached frame rather than one built from ``file`` — confirm the
    cache covers the ids of every file.
    """
    results = defaultdict(list)
    for file in tqdm(files):
        data = pl.read_parquet(file)
        df = get_data(data, new=False)

        topics = {}
        for topic in TOPICS_UNDER_5K:
            topic_rows = data.filter(pl.col("topic") == topic)
            n_rows = topic_rows.shape[0]
            topics[topic] = {
                "id": topic_rows.select("id").to_numpy().flatten().tolist(),
                # Descending rank scores: n_rows - 1 for the first row down to 0.
                "score": list(range(n_rows - 1, -1, -1)),
            }

        file_results = defaultdict(list)
        for topic, _topic_df, corpus, positive_pubs, negative_pubs in parse_data(df, topics):
            top_n = TOPICS_UNDER_5K[topic]
            print(f"{topic}: {top_n}")
            recalls = simulate(
                positive_pubs,
                negative_pubs,
                corpus,
                topic,
                max_iterations=top_n,
                predicition_method="svm",
                verbose=False,
            )
            file_results[topic].extend(recalls[:top_n])
            print("\n\n")

        for topic, recalls in file_results.items():
            results[topic].append(max(recalls))
    return results

# from utils import save_data
# Every parquet file directly under ablation/final is simulated.
files = glob("ablation/final/*.parquet")
# topic -> list with the best recall reached in each file.
results = simulate_under_5k(files)
# Mean over files, converted to a plain Python float per topic.
averaged_results = {k:np.mean(v).item() for k,v in results.items()}
# The same name is rebound from the dict to a pandas DataFrame built from it.
averaged_results = pl.DataFrame(averaged_results).to_pandas() 
# save_data(averaged_results, "activate_learning_under_5k_asreview")

In [None]:
# Load the saved under-5k results and display them with a row-wise mean.
df = pl.read_csv("PaperSeek-Report/data/active_learning_under_5k.csv")
# Every column except the first takes part in the mean.
clms = df.columns[1:]
# Last expression of the cell: shows the frame with an added "mean" column.
df.with_columns(mean=pl.mean_horizontal(clms))