In [1]:
# Cell 1

import sqlite3
import json
import pandas as pd

# Connect to the SQLite database
db_path = "../db/research_papers.db"  # <-- Adjust to your actual path
conn = sqlite3.connect(db_path)

# Load table into a Pandas DataFrame
df = pd.read_sql_query("SELECT * FROM labelled_data", conn)

# Close the connection
conn.close()

df.head()

Unnamed: 0,id,file_name,publishable,conference,sections
0,3808_The_Distortion_of_Binomia,3808_The_Distortion_of_Binomia.pdf,1,NeurIPS,"{""output"": ""The Distortion of Binomial Voting ..."
1,461_LithoBench_Benchmarking_AI,461_LithoBench_Benchmarking_AI.pdf,1,NeurIPS,"{""output"": ""LithoBench: Benchmarking AI Comput..."
2,9310_Multi_task_learning_with_,9310_Multi_task_learning_with_.pdf,1,NeurIPS,"{""output"": ""Multi-Task Learning with Summary S..."
3,557_EmbodiedGPT_Vision_Languag,557_EmbodiedGPT_Vision_Languag.pdf,1,NeurIPS,"{""output"": ""EmbodiedGPT: Vision-Language Pre-T..."
4,10107_Finite_Population_Regres,10107_Finite_Population_Regres.pdf,1,NeurIPS,"{""output"": ""Finite Population Regression Adjus..."


In [2]:
# Cell 2

def extract_text_from_json(json_str):
    """Extract the 'output' field from the JSON string."""
    try:
        data = json.loads(json_str)
        return data.get('output', '')
    except (json.JSONDecodeError, TypeError):
        return ''

# Apply extraction
df["text"] = df["sections"].apply(extract_text_from_json)

# Optional cleanup for Unicode errors (if needed):
import re
df["text"] = df["text"].fillna("")
df["text"] = df["text"].apply(lambda x: re.sub(r'[\ud800-\udfff]', '', x))

df[["sections", "text"]].head()


Unnamed: 0,sections,text
0,"{""output"": ""The Distortion of Binomial Voting ...",The Distortion of Binomial Voting Defies Expec...
1,"{""output"": ""LithoBench: Benchmarking AI Comput...",LithoBench: Benchmarking AI Computational\nLit...
2,"{""output"": ""Multi-Task Learning with Summary S...",Multi-Task Learning with Summary Statistics\nP...
3,"{""output"": ""EmbodiedGPT: Vision-Language Pre-T...",EmbodiedGPT: Vision-Language Pre-Training via\...
4,"{""output"": ""Finite Population Regression Adjus...",Finite Population Regression Adjustment and\nN...


In [3]:
# Cell 3

from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["publishable"]  # optional if your classes are imbalanced
)

print("Train set size:", len(train_df))
print("Validation set size:", len(valid_df))


Train set size: 245
Validation set size: 62


In [None]:
# Cell 4

from transformers import AutoTokenizer

model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def chunk_text(text, tokenizer, max_tokens=512, overlap=50):
    """
    Splits the text into overlapping chunks of up to `max_tokens` tokens.
    Returns a list of raw text chunks.
    """
    tokens = tokenizer.tokenize(text)
    chunks = []
    start = 0
    
    while start < len(tokens):
        end = start + max_tokens
        chunk_tokens = tokens[start:end]
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text)
        
        start += (max_tokens - overlap)
        if start >= len(tokens):
            break
    
    return chunks

def chunk_dataset(df, tokenizer, max_tokens=512, overlap=50):
    """
    Convert each row in `df` into multiple chunks (if text > max_tokens).
    Duplicate the label for each chunk.
    """
    new_texts = []
    new_labels = []
    
    for _, row in df.iterrows():
        text = row["text"]
        label = row["publishable"]
        
        # Skip if text is NaN or empty
        if not isinstance(text, str) or not text.strip():
            continue
        
        chunks = chunk_text(text, tokenizer, max_tokens=max_tokens, overlap=overlap)
        
        # Duplicate the label for every chunk of the same document
        new_texts.extend(chunks)
        new_labels.extend([label] * len(chunks))
    
    return pd.DataFrame({"text": new_texts, "publishable": new_labels})

train_df_chunked = chunk_dataset(train_df, tokenizer, max_tokens=512, overlap=50)
valid_df_chunked = chunk_dataset(valid_df, tokenizer, max_tokens=512, overlap=50)

print("Chunked train size:", len(train_df_chunked))
print("Chunked valid size:", len(valid_df_chunked))
train_df_chunked.head()


Chunked train size: 8284
Chunked valid size: 2046


Unnamed: 0,text,publishable
0,proceedings of the 2023 conference on empirica...,1
1,"candidates presented in fig. 1, a vwsd frame -...",1
2,and image - to - image ) and question - answer...,1
3,as enriching the short phrases of the vwsd dat...,1
4,"( pre - trained on coco ( lin et al., 2015 ) a...",1


In [5]:
# Cell 5

from datasets import Dataset

# If you have Unicode issues, ensure you clean `train_df_chunked["text"]` and `valid_df_chunked["text"]`

train_df_chunked.rename(columns={"publishable": "labels"}, inplace=True)
valid_df_chunked.rename(columns={"publishable": "labels"}, inplace=True)

train_dataset = Dataset.from_pandas(train_df_chunked, preserve_index=False)
valid_dataset = Dataset.from_pandas(valid_df_chunked, preserve_index=False)

train_dataset, valid_dataset


(Dataset({
     features: ['text', 'labels'],
     num_rows: 8284
 }),
 Dataset({
     features: ['text', 'labels'],
     num_rows: 2046
 }))

In [6]:
# Cell 6

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(
    type="torch", 
    columns=["input_ids", "attention_mask", "labels"]
)
valid_dataset.set_format(
    type="torch", 
    columns=["input_ids", "attention_mask", "labels"]
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/8284 [00:00<?, ? examples/s]

Map:   0%|          | 0/2046 [00:00<?, ? examples/s]

In [7]:
# Cell 7

import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=2  # binary classification
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
# Cell 8

from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

training_args = TrainingArguments(
    output_dir="./scibert_finetuned_chunks",  # directory to store model checkpoints
    evaluation_strategy="epoch",             # evaluate at the end of each epoch
    save_strategy="epoch",                   # save a checkpoint every epoch
    per_device_train_batch_size=4,           # adjust based on GPU memory
    per_device_eval_batch_size=4,
    num_train_epochs=3,                      # feel free to increase
    learning_rate=2e-5,                      # typical fine-tuning LR
    logging_dir="./logs",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

trainer




<transformers.trainer.Trainer at 0x15b838990>

In [9]:
# Cell 9

trainer.train()


  0%|          | 0/6213 [00:00<?, ?it/s]

{'loss': 0.1394, 'grad_norm': 0.004926782101392746, 'learning_rate': 1.8390471591823598e-05, 'epoch': 0.24}
{'loss': 0.066, 'grad_norm': 0.00410446198657155, 'learning_rate': 1.6780943183647193e-05, 'epoch': 0.48}
{'loss': 0.0509, 'grad_norm': 0.002024067332968116, 'learning_rate': 1.5171414775470788e-05, 'epoch': 0.72}
{'loss': 0.0497, 'grad_norm': 0.0037560127675533295, 'learning_rate': 1.3561886367294384e-05, 'epoch': 0.97}


  0%|          | 0/512 [00:00<?, ?it/s]

{'eval_loss': 0.057358015328645706, 'eval_accuracy': 0.9897360703812317, 'eval_precision': 0.9973944762897342, 'eval_recall': 0.9917098445595854, 'eval_f1': 0.9945440374123149, 'eval_runtime': 95.4047, 'eval_samples_per_second': 21.445, 'eval_steps_per_second': 5.367, 'epoch': 1.0}
{'loss': 0.0188, 'grad_norm': 0.0035171944182366133, 'learning_rate': 1.195235795911798e-05, 'epoch': 1.21}
{'loss': 0.0126, 'grad_norm': 0.004475266672670841, 'learning_rate': 1.0342829550941576e-05, 'epoch': 1.45}
{'loss': 0.0184, 'grad_norm': 0.0004950168658979237, 'learning_rate': 8.73330114276517e-06, 'epoch': 1.69}
{'loss': 0.0007, 'grad_norm': 0.0005273022106848657, 'learning_rate': 7.1237727345887665e-06, 'epoch': 1.93}


  0%|          | 0/512 [00:00<?, ?it/s]

{'eval_loss': 0.07458073645830154, 'eval_accuracy': 0.9882697947214076, 'eval_precision': 0.9922520661157025, 'eval_recall': 0.9953367875647668, 'eval_f1': 0.9937920331091568, 'eval_runtime': 94.935, 'eval_samples_per_second': 21.552, 'eval_steps_per_second': 5.393, 'epoch': 2.0}
{'loss': 0.0077, 'grad_norm': 0.0002459415409248322, 'learning_rate': 5.514244326412362e-06, 'epoch': 2.17}
{'loss': 0.0029, 'grad_norm': 0.00020651030354201794, 'learning_rate': 3.9047159182359575e-06, 'epoch': 2.41}
{'loss': 0.0002, 'grad_norm': 0.00029924773843958974, 'learning_rate': 2.2951875100595526e-06, 'epoch': 2.66}
{'loss': 0.0014, 'grad_norm': 0.00030411899206228554, 'learning_rate': 6.856591018831483e-07, 'epoch': 2.9}


  0%|          | 0/512 [00:00<?, ?it/s]

{'eval_loss': 0.08893232047557831, 'eval_accuracy': 0.989247311827957, 'eval_precision': 0.9963579604578564, 'eval_recall': 0.9922279792746114, 'eval_f1': 0.994288681204569, 'eval_runtime': 85.7385, 'eval_samples_per_second': 23.863, 'eval_steps_per_second': 5.972, 'epoch': 3.0}
{'train_runtime': 4250.8224, 'train_samples_per_second': 5.846, 'train_steps_per_second': 1.462, 'train_loss': 0.02968739089837099, 'epoch': 3.0}


TrainOutput(global_step=6213, training_loss=0.02968739089837099, metrics={'train_runtime': 4250.8224, 'train_samples_per_second': 5.846, 'train_steps_per_second': 1.462, 'total_flos': 6538835947806720.0, 'train_loss': 0.02968739089837099, 'epoch': 3.0})

In [10]:
# Cell 10

eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)


  0%|          | 0/512 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.057358015328645706, 'eval_accuracy': 0.9897360703812317, 'eval_precision': 0.9973944762897342, 'eval_recall': 0.9917098445595854, 'eval_f1': 0.9945440374123149, 'eval_runtime': 87.3384, 'eval_samples_per_second': 23.426, 'eval_steps_per_second': 5.862, 'epoch': 3.0}


In [11]:
# Detailed classification metrics
from sklearn.metrics import confusion_matrix, classification_report

predictions = trainer.predict(valid_dataset)
y_preds = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_preds))

print("\nClassification Report:")
print(classification_report(y_true, y_preds, digits=3))

  0%|          | 0/512 [00:00<?, ?it/s]


Confusion Matrix:
[[ 111    5]
 [  16 1914]]

Classification Report:
              precision    recall  f1-score   support

           0      0.874     0.957     0.914       116
           1      0.997     0.992     0.995      1930

    accuracy                          0.990      2046
   macro avg      0.936     0.974     0.954      2046
weighted avg      0.990     0.990     0.990      2046



In [15]:
# Cell 11: Doc-level evaluation with aggregator

import torch
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

def classify_with_aggregation(text, tokenizer, model, max_tokens=512, overlap=50, aggregation="mean"):
    # chunk_text function is assumed to exist already
    chunks = chunk_text(text, tokenizer, max_tokens=max_tokens, overlap=overlap)

    # Make sure the model is on the correct device
    device = torch.device("mps")  # or detect automatically as shown above
    model.to(device)
    model.eval()

    all_logits = []
    with torch.no_grad():
        for chunk in chunks:
            inputs = tokenizer(
                chunk,
                return_tensors='pt',
                truncation=True,
                padding='max_length',
                max_length=max_tokens
            )
            # Move inputs to MPS (or CPU)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            outputs = model(**inputs)
            logits = outputs.logits[0].cpu().numpy()
            all_logits.append(logits)

    all_logits = np.array(all_logits)  # shape: (#chunks, 2)
    if aggregation == "mean":
        agg_logits = np.mean(all_logits, axis=0)
    else:  # "max"
        agg_logits = np.max(all_logits, axis=0)
    
    return int(np.argmax(agg_logits))

# Evaluate doc-level predictions on the original valid_df
doc_preds = []
doc_labels = []

for i, row in valid_df.iterrows():
    if isinstance(row["text"], str) and row["text"].strip():
        pred_label = classify_with_aggregation(
            row["text"], 
            tokenizer, 
            model, 
            max_tokens=512, 
            overlap=50, 
            aggregation="mean"  # or "max"
        )
        doc_preds.append(pred_label)
        doc_labels.append(row["publishable"])
    else:
        # For empty/invalid text, skip or assign None
        doc_preds.append(None)
        doc_labels.append(row["publishable"])

# Filter out any None predictions
pairs = [(p, l) for p, l in zip(doc_preds, doc_labels) if p is not None]
preds_filtered, labels_filtered = zip(*pairs)

acc = accuracy_score(labels_filtered, preds_filtered)
prec = precision_score(labels_filtered, preds_filtered)
rec = recall_score(labels_filtered, preds_filtered)
f1 = f1_score(labels_filtered, preds_filtered)

print(f"Doc-Level Aggregation Results (Mean):")
print(f"Accuracy:   {acc:.2f}")
print(f"Precision:  {prec:.2f}")
print(f"Recall:     {rec:.2f}")
print(f"F1 Score:   {f1:.2f}\n")

# Confusion matrix + classification report
cm = confusion_matrix(labels_filtered, preds_filtered)
cr = classification_report(labels_filtered, preds_filtered, digits=3)

print("Confusion Matrix:")
print(cm)
print("\nClassification Report:")
print(cr)


Doc-Level Aggregation Results (Mean):
Accuracy:   1.00
Precision:  1.00
Recall:     1.00
F1 Score:   1.00

Confusion Matrix:
[[ 9  0]
 [ 0 53]]

Classification Report:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000         9
           1      1.000     1.000     1.000        53

    accuracy                          1.000        62
   macro avg      1.000     1.000     1.000        62
weighted avg      1.000     1.000     1.000        62



In [None]:
# Database connection
db_path = "../db/research_papers.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Create results table if it doesn't exist
create_table_sql = """
CREATE TABLE IF NOT EXISTS results (
    paper_id TEXT PRIMARY KEY,
    publishable INTEGER,
    conference TEXT,
    rationale TEXT
)
"""
cursor.execute(create_table_sql)
conn.commit()

# Load unlabelled data
unlabelled_df = pd.read_sql_query("SELECT * FROM unlabelled_data", conn)

# Process each document and collect results
results = []
for _, row in unlabelled_df.iterrows():
    text = extract_text_from_json(row["sections"])
    if isinstance(text, str) and text.strip():
        pred_label = classify_with_aggregation(
            text,
            tokenizer,
            model,
            max_tokens=512,
            overlap=50,
            aggregation="mean"
        )
        results.append({
            "paper_id": (row["file_name"]).replace(".pdf", ""),
            "publishable": int(pred_label),
            "conference": "",
            "rationale": ""
        })

# Create DataFrame with results
results_df = pd.DataFrame(results)

try:
    # Clear existing results if any
    cursor.execute("DELETE FROM results")
    
    # Insert new results using SQL directly
    for _, row in results_df.iterrows():
        cursor.execute("""
            INSERT INTO results (paper_id, publishable, conference, rationale)
            VALUES (?, ?, ?, ?)
        """, (row['paper_id'], row['publishable'], row['conference'], row['rationale']))
    
    conn.commit()
    print(f"Successfully processed and stored {len(results)} papers")
    
except sqlite3.Error as e:
    print(f"Database error: {e}")
    conn.rollback()
finally:
    cursor.close()
    conn.close()

# Display sample of results
print("\nSample of results:")
print(results_df.head())