In [1]:
import sqlite3
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np


In [2]:
# Connect to the SQLite database
db_path = '../db/research_papers.db'  # Adjust to your actual path
conn = sqlite3.connect(db_path)

# Load table into a Pandas DataFrame
df = pd.read_sql_query("SELECT * FROM labelled_data", conn)

# Close the connection
conn.close()

df.head()

Unnamed: 0,id,file_name,publishable,conference,sections
0,3808_The_Distortion_of_Binomia,3808_The_Distortion_of_Binomia.pdf,1,NeurIPS,"{""output"": ""The Distortion of Binomial Voting ..."
1,461_LithoBench_Benchmarking_AI,461_LithoBench_Benchmarking_AI.pdf,1,NeurIPS,"{""output"": ""LithoBench: Benchmarking AI Comput..."
2,9310_Multi_task_learning_with_,9310_Multi_task_learning_with_.pdf,1,NeurIPS,"{""output"": ""Multi-Task Learning with Summary S..."
3,557_EmbodiedGPT_Vision_Languag,557_EmbodiedGPT_Vision_Languag.pdf,1,NeurIPS,"{""output"": ""EmbodiedGPT: Vision-Language Pre-T..."
4,10107_Finite_Population_Regres,10107_Finite_Population_Regres.pdf,1,NeurIPS,"{""output"": ""Finite Population Regression Adjus..."


In [3]:
def extract_text_from_json(json_str):
    """Extract the 'output' field from the JSON string."""
    try:
        data = json.loads(json_str)
        return data.get('output', '')
    except (json.JSONDecodeError, TypeError):
        return ''

# Apply extraction
df['text'] = df['sections'].apply(extract_text_from_json)

df.head()

Unnamed: 0,id,file_name,publishable,conference,sections,text
0,3808_The_Distortion_of_Binomia,3808_The_Distortion_of_Binomia.pdf,1,NeurIPS,"{""output"": ""The Distortion of Binomial Voting ...",The Distortion of Binomial Voting Defies Expec...
1,461_LithoBench_Benchmarking_AI,461_LithoBench_Benchmarking_AI.pdf,1,NeurIPS,"{""output"": ""LithoBench: Benchmarking AI Comput...",LithoBench: Benchmarking AI Computational\nLit...
2,9310_Multi_task_learning_with_,9310_Multi_task_learning_with_.pdf,1,NeurIPS,"{""output"": ""Multi-Task Learning with Summary S...",Multi-Task Learning with Summary Statistics\nP...
3,557_EmbodiedGPT_Vision_Languag,557_EmbodiedGPT_Vision_Languag.pdf,1,NeurIPS,"{""output"": ""EmbodiedGPT: Vision-Language Pre-T...",EmbodiedGPT: Vision-Language Pre-Training via\...
4,10107_Finite_Population_Regres,10107_Finite_Population_Regres.pdf,1,NeurIPS,"{""output"": ""Finite Population Regression Adjus...",Finite Population Regression Adjustment and\nN...


In [4]:
# Load the tokenizer and model
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# For binary classification, we specify num_labels=2
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def chunk_text(text, tokenizer, max_tokens=512, overlap=50):
    """
    Splits the text into overlapping chunks of up to max_tokens (tokenized).
    Returns a list of raw text chunks.
    """
    # Tokenize the entire text
    tokens = tokenizer.tokenize(text)
    
    # We will store the text chunks here
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        # Slice the token list to get the chunk
        chunk_tokens = tokens[start:end]
        # Convert back to text
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text)
        # Move the start index by (max_tokens - overlap)
        start += (max_tokens - overlap)
        
        if start >= len(tokens):
            break
    return chunks


In [16]:
def classify_with_aggregation(text, tokenizer, model, max_tokens=512, overlap=120, aggregation='mean'):
    """
    Classify a text by splitting into chunks and aggregating the logits.
    aggregation: 'mean' or 'max'.
    Returns predicted label (0 or 1).
    """
    # Get all chunks
    chunks = chunk_text(text, tokenizer, max_tokens, overlap)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    all_logits = []
    model.eval()
    with torch.no_grad():
        for chunk in chunks:
            # Prepare model inputs
            inputs = tokenizer(
                chunk, 
                return_tensors="pt", 
                truncation=True,  # Truncate inputs longer than the max model length
                max_length=max_tokens,  # Ensure the length matches max_tokens
                padding="max_length"   # Add padding to meet max_tokens length
            )
            # Move inputs to device
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = model(**inputs)
            logits = outputs.logits  # shape: [batch_size, num_labels], here batch_size=1
            all_logits.append(logits.cpu().numpy())

    # all_logits is a list of numpy arrays, each of shape (1, 2)
    all_logits = np.vstack(all_logits)  # shape => (#chunks, 2)

    if aggregation == 'mean':
        agg_logits = np.mean(all_logits, axis=0)  # shape => (2, )
    elif aggregation == 'max':
        agg_logits = np.max(all_logits, axis=0)
    else:
        raise ValueError("Aggregation must be 'mean' or 'max'.")

    # Predicted label is the index of the max logit
    predicted_label = int(np.argmax(agg_logits))
    return predicted_label


In [17]:
# Example usage on a sample document
sample_index = 0  # you can pick any valid index
sample_text = df.loc[sample_index, 'text']

pred_label = classify_with_aggregation(sample_text, tokenizer, model, max_tokens=512, overlap=50, aggregation='mean')
print(f"Predicted label for document at index {sample_index}: {pred_label}")


Predicted label for document at index 0: 1


In [19]:
predictions = []
skipped_indices = []  # To track rows that caused errors

for idx, row in df.iterrows():
    text_data = row['text']
    try:
        # Skip rows with empty or invalid text
        if not text_data or not isinstance(text_data, str) or text_data.strip() == "":
            raise ValueError("Invalid or empty text")
        
        # Classify the text
        label = classify_with_aggregation(text_data, tokenizer, model, max_tokens=512, overlap=50, aggregation='mean')
        predictions.append(label)
    except Exception as e:
        print(f"Skipping index {idx} due to error: {e}")
        predictions.append(None)  # Use None or a default value
        skipped_indices.append(idx)

# Add predictions to the DataFrame
df['predicted_label'] = predictions


Skipping index 46 due to error: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
Skipping index 60 due to error: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
Skipping index 79 due to error: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
Skipping index 104 due to error: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
Skipping index 109 due to error: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
Skipping index 136 due to error: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]


In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Drop rows where predictions are None to ensure valid comparisons
valid_df = df.dropna(subset=['predicted_label'])

# Ground truth and predictions
y_true = valid_df['publishable'].astype(int)  # Ensure integer type
y_pred = valid_df['predicted_label'].astype(int)

# Calculate metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Display results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred))


Accuracy: 0.85
Precision: 0.85
Recall: 1.00
F1 Score: 0.92

Confusion Matrix:
[[  0  44]
 [  0 257]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        44
           1       0.85      1.00      0.92       257

    accuracy                           0.85       301
   macro avg       0.43      0.50      0.46       301
weighted avg       0.73      0.85      0.79       301



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
