In [1]:
# Cell 1

# If not installed, you may need (pandas is used from Cell 2 onwards):
# %pip install pathway transformers torch scikit-learn pandas

import json
import sqlite3
from typing import Optional

import torch
import pathway as pw

# Import VectorStoreServer + VectorStoreClient from Pathway
from pathway.xpacks.llm.vector_store import VectorStoreServer, VectorStoreClient

from transformers import AutoTokenizer, AutoModel

# SQLite database holding the `labelled_data` table.
# Uses the same "../db/..." location that the fetch helpers and the evaluation
# cell open, so that store_classification_results() writes to the database the
# rest of the notebook reads from.
DB_PATH = "../db/research_papers.db"  # Adjust if needed

# Conferences of interest
TARGET_CONFERENCES = ["CVPR", "NeurIPS", "EMNLP", "TMLR", "KDD"]


In [2]:
# Cell 2

import sqlite3
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

# Hugging Face checkpoint identifier shared by the tokenizer and the encoder.
model_name = "allenai/scibert_scivocab_uncased"
# Module-level tokenizer/model pair; scibert_embedder() below reads both as globals,
# so they are loaded once here rather than on every call.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def scibert_embedder(doc_text: str) -> list[float]:
    """
    Embed one piece of text with the module-level SciBERT `tokenizer`/`model`.

    The text is tokenized with truncation at 512 tokens, run through the model
    with gradient tracking disabled, and the resulting `last_hidden_state` is
    averaged over dim 1 (the sequence dimension).

    Args:
        doc_text: The text to embed.

    Returns:
        The pooled vector converted to a plain Python list via `.tolist()`.
    """
    encoded = tokenizer(
        doc_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state

    # Collapse the sequence dimension, then drop the remaining size-1 dims.
    pooled = token_states.mean(dim=1)
    return pooled.squeeze().tolist()

def fetch_reference_papers_df(db_path="../db/research_papers.db", conferences=None):
    """
    Load the reference papers from the `labelled_data` table.

    A row is selected when:
      1) publishable = 1
      2) its conference is one of `conferences`

    Args:
        db_path: Path of the SQLite database file. Defaults to the location
            used throughout this notebook.
        conferences: Iterable of conference names to keep. Defaults to
            ("NeurIPS", "KDD", "TMLR", "EMNLP", "CVPR").

    Returns:
        A Pandas DataFrame with columns: [id, conference, sections].

    Raises:
        Whatever `sqlite3.connect` / `pd.read_sql_query` raise (e.g. when the
        table does not exist). The connection is closed in every case.
    """
    if conferences is None:
        conf_list = ["NeurIPS", "KDD", "TMLR", "EMNLP", "CVPR"]
    else:
        conf_list = list(conferences)

    # One bound parameter per conference; values never get spliced into the SQL.
    placeholders = ", ".join(["?"] * len(conf_list))

    query = f"""
        SELECT id, conference, sections
        FROM labelled_data
        WHERE publishable = 1
          AND conference IN ({placeholders})
    """

    conn = sqlite3.connect(db_path)
    try:
        df_ref = pd.read_sql_query(query, conn, params=conf_list)
    finally:
        # Release the file handle even when the query fails.
        conn.close()
    return df_ref

# Load the reference set once; later cells build the Pathway table from this frame.
df_references = fetch_reference_papers_df()
print(f"Fetched {len(df_references)} references as a DataFrame.")
# Last expression of the cell: rendered as a preview of the first rows.
df_references.head()


Fetched 263 references as a DataFrame.


Unnamed: 0,id,conference,sections
0,3808_The_Distortion_of_Binomia,NeurIPS,"{""output"": ""The Distortion of Binomial Voting ..."
1,461_LithoBench_Benchmarking_AI,NeurIPS,"{""output"": ""LithoBench: Benchmarking AI Comput..."
2,9310_Multi_task_learning_with_,NeurIPS,"{""output"": ""Multi-Task Learning with Summary S..."
3,557_EmbodiedGPT_Vision_Languag,NeurIPS,"{""output"": ""EmbodiedGPT: Vision-Language Pre-T..."
4,10107_Finite_Population_Regres,NeurIPS,"{""output"": ""Finite Population Regression Adjus..."


In [3]:
# Cell 3 (Updated)

import json
import pandas as pd
import pathway as pw
from pathway.xpacks.llm.vector_store import VectorStoreServer

def create_reference_pathway_table_from_df(df_ref: pd.DataFrame) -> pw.Table:
    """
    Build a Pathway table with columns [data, _metadata] from the reference frame.

    For every row of `df_ref` (expected columns: id, conference, sections):
      * `sections` is parsed as JSON (an empty/None value is treated as {}),
      * `data` becomes the value stored under the "output" key ("" if absent),
      * `_metadata` becomes {"paper_id": <id>, "conference_label": <conference>}.

    Rows whose `sections` cannot be processed are reported with a printed
    message and left out of the table.
    """
    documents = []

    for _, record in df_ref.iterrows():
        paper_id = record["id"]
        conference = record["conference"]
        raw_sections = record["sections"]

        try:
            parsed = json.loads(raw_sections) if raw_sections else {}
            body_text = parsed.get("output", "")
        except Exception as e:
            print(f"Skipping paper {paper_id} due to error: {e}")
            continue

        # Column order matters: first the text ('data'), then its '_metadata'.
        documents.append(
            (body_text, {"paper_id": paper_id, "conference_label": conference})
        )

    frame = pd.DataFrame(documents, columns=["data", "_metadata"])
    return pw.debug.table_from_pandas(frame)

# Build the Pathway table (columns: data, _metadata) from the reference DataFrame;
# it is handed to VectorStoreServer further down in this cell.
reference_table = create_reference_pathway_table_from_df(df_references)

def no_op_parser(contents):
    """
    Pass-through parser: wraps the document text in the list-of-(text, metadata)
    shape without doing any real parsing.

    `str` input is used as is; anything else is decoded as UTF-8, with
    undecodable bytes replaced. The metadata part is always an empty dict.
    """
    text = (
        contents
        if isinstance(contents, str)
        else contents.decode("utf-8", errors="replace")
    )
    return [(text, {})]


# Now create the VectorStoreServer using the correct columns
# (reference_table carries 'data' and '_metadata'; embeddings come from scibert_embedder).
server = VectorStoreServer(
    reference_table,        # pass as a positional argument
    embedder=scibert_embedder,
    parser=no_op_parser,            # We already have 'data' in the doc rows
    splitter=None,
    doc_post_processors=None
)

# NOTE(review): host="0.0.0.0" binds on every network interface; if the store is only
# queried from this notebook, binding to "127.0.0.1" would avoid exposing it — confirm.
# NOTE(review): threaded=True presumably starts the server in a background thread so the
# cell can return; the message below is printed right after the call and does not by
# itself prove the server is ready to answer queries — TODO confirm.
server.run_server(host="0.0.0.0", port=8000, threaded=True, with_cache=True)
print("VectorStoreServer is running at http://0.0.0.0:8000")


    https://beartype.readthedocs.io/en/latest/api_roar/#pep-585-deprecations
  warn(


VectorStoreServer is running at http://0.0.0.0:8000


(Press CTRL+C to quit)


In [5]:
# Cell 4

def fetch_publishable_papers(db_path="../db/research_papers.db"):
    """
    Load only the 'publishable' papers (publishable=1) from labelled_data.

    Args:
        db_path: Path of the SQLite database file. Defaults to the location
            used throughout this notebook.

    Returns:
        A list of (id, file_name, sections) tuples, one per matching row.

    Raises:
        sqlite3.Error: if the database cannot be opened or queried. The
            connection is closed in every case.
    """
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        cur.execute("SELECT id, file_name, sections FROM labelled_data WHERE publishable = 1")
        return cur.fetchall()
    finally:
        # Always release the connection, including when the query raises.
        conn.close()

# Connect over loopback. "0.0.0.0" is the wildcard address a server binds to, not a
# destination; using it as a client target only happens to work on some platforms.
# The server in Cell 3 listens on 0.0.0.0:8000, which includes 127.0.0.1.
client = VectorStoreClient(host="127.0.0.1", port=8000, timeout=30)

def classify_publishable_paper(paper_text: str, k=3, store_client=None) -> tuple[Optional[str], str]:
    """
    Recommend a conference for one paper by majority vote over its nearest
    reference documents.

    The paper text is sent to the vector store, the `conference_label` stored in
    each returned hit's metadata is counted, and the most frequent label wins.
    On a tie the label that appears first in the returned hits is kept.
    NOTE(review): this favours the best match only if hits arrive best-first — confirm.

    Args:
        paper_text: Full text of the paper to classify.
        k: Number of nearest reference documents to request.
        store_client: Object exposing `query(query=..., k=...)`. Defaults to the
            module-level `client`.

    Returns:
        (conference_label, justification). `conference_label` is None when no
        hit carries a conference label; the justification then says so instead
        of naming a conference. The justification is at most 100 words.
    """
    active_client = client if store_client is None else store_client

    # Each hit is expected to be a dict with 'text', 'metadata' and a score.
    results = active_client.query(query=paper_text, k=k)

    conference_counts = {}
    for hit in results or []:
        metadata = hit.get("metadata") or {}
        conf_label = metadata.get("conference_label")
        if conf_label is None:
            # A hit without a label cannot vote; skip it rather than fail.
            continue
        conference_counts[conf_label] = conference_counts.get(conf_label, 0) + 1

    if not conference_counts:
        return None, (
            "No labelled reference documents were retrieved for this paper, "
            "so no conference can be recommended."
        )

    # max() returns the first key with the highest count (dicts keep insertion order).
    best_conference = max(conference_counts, key=conference_counts.get)

    # Provide a short justification (<=100 words)
    justification = (
        f"This paper shows strong similarity to {best_conference} reference documents. "
        f"The methods and findings appear aligned with the focus of {best_conference}, "
        f"making it a suitable conference choice."
    )

    # Truncate to 100 words if necessary
    words = justification.split()
    if len(words) > 100:
        justification = " ".join(words[:100])

    return best_conference, justification

def classify_all_publishable_papers():
    """
    Classify every paper returned by fetch_publishable_papers().

    For each (id, file_name, sections) row the `sections` JSON is decoded, the
    values of the resulting dict are joined with spaces, and the text is passed
    to classify_publishable_paper(..., k=3). Papers with no usable text are
    skipped silently; papers that raise are skipped with a printed message.

    NOTE(review): the reference table is built from publishable=1 rows of the
    same labelled_data table (restricted to the five target conferences), so
    every indexed reference paper is also among the papers queried here —
    confirm this overlap is intended before trusting downstream accuracy figures.

    Returns:
        A list of (paper_id, file_name, conf_label, justification) tuples.
    """
    classified = []

    for paper_id, file_name, sections_json in fetch_publishable_papers():
        try:
            sections = json.loads(sections_json)

            if isinstance(sections, dict):
                paper_text = " ".join(sections.values())
            else:
                paper_text = ""

            # Only query the store when there is something to embed.
            if paper_text.strip():
                conf_label, justification = classify_publishable_paper(paper_text, k=3)
                classified.append((paper_id, file_name, conf_label, justification))
        except Exception as e:
            print(f"Skipping paper {paper_id} due to error: {e}")

    return classified

# Run the classifier over every publishable paper; each entry of the result is a
# (paper_id, file_name, conf_label, justification) tuple. Papers that were skipped
# (empty text or an error) are not included in the count printed below.
classified_publishable = classify_all_publishable_papers()
print(f"Classified {len(classified_publishable)} 'publishable' papers.")


ERROR:pathway_engine.connectors:Parse error: cannot create a field "query" with type str from value Does Graph Distillation See Like Vision Dataset
Counterpart?
Beining Yang1,2∗ ∗, Kai Wang 3∗, Qingyun Sun 1,2† †, Cheng Ji 1,2, Xingcheng Fu 1,2,
Hao Tang4, Yang You3, Jianxin Li1,2‡ ‡
1School of Computer Science and Engineering, Beihang University
2Advanced Innovation Center for Big Data and Brain Computing, Beihang University
3National University of Singapore 4Carnegie Mellon University
Abstract
Training on large-scale graphs has achieved remarkable results in graph representa-
tion learnin...


Skipping paper 1279_Does_Graph_Distillation_S due to error: HTTPConnectionPool(host='0.0.0.0', port=8000): Read timed out. (read timeout=30)


ERROR:pathway_engine.connectors:Parse error: cannot create a field "query" with type str from value Low-Light Image Enhancement via Structure Modeling and Guidance
Xiaogang Xu1 Ruixing Wang2 Jiangbo Lu3∗
1 Zhejiang Lab 2 Honor Device Co., Ltd. 3 SmartMore Corporation
xgxu@zhejianglab.com, ruixingw@hustunique.com, jiangbo@smartmore.com
Abstract
This paper proposes a new framework for low-light im-
age enhancement by simultaneously conducting the appear-
ance as well as structure modeling. It employs the struc-
tural feature to guide the appearance enhancement, lead-
ing to sharp and realisti...


Skipping paper Xu_Low-Light_Image_Enhancement_via_Structure_Modeling_and_Guidance_CVPR_2023_paper due to error: HTTPConnectionPool(host='0.0.0.0', port=8000): Read timed out. (read timeout=30)


ERROR:pathway_engine.connectors:Parse error: cannot create a field "query" with type str from value 1000 FPS HDR Video with a Spike-RGB Hybrid Camera
Yakun Chang1,2 Chu Zhou3 Yuchen Hong1,2 Liwen Hu2 Chao Xu3 Tiejun Huang1,2 Boxin Shi1,2*
1 National Key Laboratory for Multimedia Information Processing, School of Computer Science, Peking University
2 National Engineering Research Center of Visual Technology, School of Computer Science, Peking University
3 National Key Laboratory of General AI, School of Intelligence Science and Technology, Peking University
{yakunchang, zhou chu, huliwen, tj...


Skipping paper Chang_1000_FPS_HDR_Video_With_a_Spike-RGB_Hybrid_Camera_CVPR_2023_paper due to error: HTTPConnectionPool(host='0.0.0.0', port=8000): Read timed out. (read timeout=30)


ERROR:pathway_engine.connectors:Parse error: cannot create a field "query" with type str from value DPF: Learning Dense Prediction Fields with Weak Supervision
Xiaoxue Chen1, Yuhang Zheng2, Yupeng Zheng3
Qiang Zhou1, Hao Zhao1, Guyue Zhou1, Ya-Qin Zhang1
1AIR, Tsinghua University 2BUAA 3CASIA
{chenxiaoxue, zhaohao}@air.tsinghua.edu.cn, zyh 021@buaa.edu.cn
Abstract
Nowadays, many visual scene understanding problems
are addressed by dense prediction networks. But pixel-wise
dense annotations are very expensive (e.g., for scene pars-
ing) or impossible (e.g., for intrinsic image decomposition)...


Skipping paper Chen_DPF_Learning_Dense_Prediction_Fields_With_Weak_Supervision_CVPR_2023_paper due to error: HTTPConnectionPool(host='0.0.0.0', port=8000): Read timed out. (read timeout=30)


ERROR:pathway_engine.connectors:Parse error: cannot create a field "query" with type str from value Autoregressive Visual Tracking
Xing Wei† Yifan Bai† Yongchao Zheng† Dahu Shi‡§ Yihong Gong† 
†Xi’an Jiaotong University ‡Zhejiang University §Hikvision Research Institute
{weixing, ygong}@mail.xjtu.edu.cn {yfbai, zyc}@stu.xjtu.edu.cn shidahu@zju.edu.cn
Abstract
We presentARTrack, an autoregressive framework for
visual object tracking. ARTrack tackles tracking as a co-
ordinate sequence interpretation task that estimates object
trajectories progressively, where the current estimate is in-
duce...


Skipping paper Wei_Autoregressive_Visual_Tracking_CVPR_2023_paper due to error: HTTPConnectionPool(host='0.0.0.0', port=8000): Read timed out. (read timeout=30)


ERROR:pathway_engine.connectors:Parse error: cannot create a field "query" with type str from value Multiplex Heterogeneous Graph Neural Network with Behavior
Pattern Modeling
Chaofan Fu
Ocean University of China
Qingdao, China
fuchaofan@stu.ouc.edu.cn
Guanjie Zheng
Shanghai Jiao Tong University
Shanghai, China
gjzheng@sjtu.edu.cn
Chao Huang
The University of Hong Kong
Hong Kong, China
chaohuang75@gmail.com
Yanwei Yu∗
Ocean University of China
Qingdao, China
yuyanwei@ouc.edu.cn
Junyu Dong
Ocean University of China
Qingdao, China
dongjunyu@ouc.edu.cn
ABSTRACT
Heterogeneous graph neural netwo...


Skipping paper 3580305.3599441 due to error: HTTPConnectionPool(host='0.0.0.0', port=8000): Read timed out. (read timeout=30)
Classified 257 'publishable' papers.


In [6]:
# Cell 5

# Preview the first five classification results, separated by a 50-character rule.
for (paper_id, file_name, conf_label, justification) in classified_publishable[:5]:
    print("="*50)
    print(f"Paper ID: {paper_id}, File: {file_name}")
    print(f"Recommended Conference: {conf_label}")
    print(f"Justification: {justification}\n")

# (Optional) Write to a new DB table
def store_classification_results(results, db_path=None):
    """
    Append classification results to the `recommended_conferences` table,
    creating the table first if it does not exist.

    Args:
        results: Iterable of (paper_id, file_name, conf_label, justification)
            tuples.
        db_path: Path of the SQLite database file. Defaults to the module-level
            DB_PATH.

    Notes:
        * `paper_id` is declared TEXT because the ids in this dataset are
          strings; with an INTEGER column SQLite would coerce numeric-looking
          ids (e.g. "3580305.3599441") to numbers.
        * CREATE TABLE IF NOT EXISTS does not alter a table created earlier
          with the old INTEGER declaration.
        * Every call inserts new rows; calling it twice stores the results twice.

    Raises:
        sqlite3.Error: on database failures. Nothing is committed in that case
            and the connection is closed.
    """
    target_path = DB_PATH if db_path is None else db_path

    conn = sqlite3.connect(target_path)
    try:
        cur = conn.cursor()
        cur.execute("""
            CREATE TABLE IF NOT EXISTS recommended_conferences (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                paper_id TEXT,
                file_name TEXT,
                recommended_conf TEXT,
                justification TEXT
            )
        """)
        cur.executemany(
            """
            INSERT INTO recommended_conferences (paper_id, file_name, recommended_conf, justification)
            VALUES (?, ?, ?, ?)
            """,
            # Unpack explicitly so a malformed row fails loudly before reaching SQLite.
            (
                (paper_id, file_name, conf_label, justification)
                for paper_id, file_name, conf_label, justification in results
            ),
        )
        conn.commit()
    finally:
        # Close even on failure; uncommitted inserts are discarded.
        conn.close()

# Uncomment to store the classification in DB:
# store_classification_results(classified_publishable)


Paper ID: 3808_The_Distortion_of_Binomia, File: 3808_The_Distortion_of_Binomia.pdf
Recommended Conference: NeurIPS
Justification: This paper shows strong similarity to NeurIPS reference documents. The methods and findings appear aligned with the focus of NeurIPS, making it a suitable conference choice.

Paper ID: 461_LithoBench_Benchmarking_AI, File: 461_LithoBench_Benchmarking_AI.pdf
Recommended Conference: CVPR
Justification: This paper shows strong similarity to CVPR reference documents. The methods and findings appear aligned with the focus of CVPR, making it a suitable conference choice.

Paper ID: 9310_Multi_task_learning_with_, File: 9310_Multi_task_learning_with_.pdf
Recommended Conference: NeurIPS
Justification: This paper shows strong similarity to NeurIPS reference documents. The methods and findings appear aligned with the focus of NeurIPS, making it a suitable conference choice.

Paper ID: 557_EmbodiedGPT_Vision_Languag, File: 557_EmbodiedGPT_Vision_Languag.pdf
Recommended

In [7]:
# Cell 6: Evaluate classification accuracy on publishable papers
import sqlite3
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

# Database holding the gold labels (same file the fetch helpers read from)
DB_PATH = "../db/research_papers.db"

# Conference labels: used both to filter the gold labels and to fix the row/column
# order of the confusion matrix and classification report below.
TARGET_CONFERENCES = ["NeurIPS","KDD","TMLR","EMNLP","CVPR"]

# NOTE(review): the reference index was built from these same publishable papers, so
# each evaluated paper can retrieve itself as a nearest neighbour. The figures below
# may therefore be optimistic — confirm, or hold out the queried paper.

# 1) Convert your classification results list into a DataFrame
# classified_publishable is of the form: [(paper_id, file_name, conf_label, justification), ...]
df_pred = pd.DataFrame(
    classified_publishable, 
    columns=["paper_id", "file_name", "predicted_conf", "justification"]
)

# 2) Fetch true conference labels for these papers
# Conference names are passed as bound parameters: in SQL, double quotes denote
# identifiers, and SQLite only falls back to treating them as strings as a legacy quirk.
_conf_placeholders = ", ".join(["?"] * len(TARGET_CONFERENCES))
conn = sqlite3.connect(DB_PATH)
try:
    df_gold = pd.read_sql_query(
        f"""
        SELECT id AS paper_id, conference AS true_conf
        FROM labelled_data
        WHERE publishable = 1
          AND conference IN ({_conf_placeholders})
        """,
        conn,
        params=TARGET_CONFERENCES,
    )
finally:
    # Close the connection even if the query fails.
    conn.close()

# 3) Merge the predicted DataFrame (df_pred) with df_gold on paper_id
# Inner join: papers that were skipped during classification, or whose true conference
# is outside TARGET_CONFERENCES, are not evaluated.
df_merged = pd.merge(df_pred, df_gold, on="paper_id", how="inner")

# 4) Compute accuracy
df_merged["correct"] = df_merged["predicted_conf"] == df_merged["true_conf"]
accuracy = df_merged["correct"].mean()  # True = 1.0, so mean is accuracy

print(f"Total Papers Evaluated: {len(df_merged)}")
print(f"Accuracy: {accuracy*100:.2f}%")

# 5) Print confusion matrix & classification report
y_true = df_merged["true_conf"]
y_pred = df_merged["predicted_conf"]

print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred, labels=TARGET_CONFERENCES))

print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=TARGET_CONFERENCES))


Total Papers Evaluated: 257
Accuracy: 93.39%

Confusion Matrix:
[[40  0  4  2  4]
 [ 0 44  1  2  0]
 [ 2  2 49  0  0]
 [ 0  0  0 48  0]
 [ 0  0  0  0 59]]

Classification Report:
              precision    recall  f1-score   support

     NeurIPS       0.95      0.80      0.87        50
         KDD       0.96      0.94      0.95        47
        TMLR       0.91      0.92      0.92        53
       EMNLP       0.92      1.00      0.96        48
        CVPR       0.94      1.00      0.97        59

    accuracy                           0.93       257
   macro avg       0.94      0.93      0.93       257
weighted avg       0.93      0.93      0.93       257

