In [1]:
import inspect
import json
import sqlite3

import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the SQLite database that holds the labelled papers
db_path = "../db/research_papers.db"  # <-- Adjust to your actual path

# Read the whole `labelled_data` table into a DataFrame. The try/finally makes
# sure the connection is released even when the query raises (for example when
# the table is missing); previously a failure left the handle open.
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query("SELECT * FROM labelled_data", conn)
finally:
    conn.close()

# Quick check
df.head()


Unnamed: 0,id,file_name,publishable,conference,sections
0,3808_The_Distortion_of_Binomia,3808_The_Distortion_of_Binomia.pdf,1,NeurIPS,"{""output"": ""The Distortion of Binomial Voting ..."
1,461_LithoBench_Benchmarking_AI,461_LithoBench_Benchmarking_AI.pdf,1,NeurIPS,"{""output"": ""LithoBench: Benchmarking AI Comput..."
2,9310_Multi_task_learning_with_,9310_Multi_task_learning_with_.pdf,1,NeurIPS,"{""output"": ""Multi-Task Learning with Summary S..."
3,557_EmbodiedGPT_Vision_Languag,557_EmbodiedGPT_Vision_Languag.pdf,1,NeurIPS,"{""output"": ""EmbodiedGPT: Vision-Language Pre-T..."
4,10107_Finite_Population_Regres,10107_Finite_Population_Regres.pdf,1,NeurIPS,"{""output"": ""Finite Population Regression Adjus..."


In [2]:
def extract_text_from_json(json_str):
    """Extract the 'output' field from a JSON-encoded string.

    Args:
        json_str: JSON text expected to decode to an object with an
            'output' key.

    Returns:
        The 'output' value when it is a string. An empty string is returned
        when the input is not valid JSON, is not a string at all (e.g. None),
        decodes to something other than a JSON object, or when 'output' is
        missing or not a string.
    """
    try:
        data = json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        return ''

    # Valid JSON may also be a list, number or bare string; none of those
    # has .get(), which used to surface as an uncaught AttributeError.
    if not isinstance(data, dict):
        return ''

    text = data.get('output', '')
    # The result is treated as text later on (regex cleaning, tokenization),
    # so null / non-text payloads are normalised to the empty string.
    return text if isinstance(text, str) else ''

# Derive a plain-text column from the JSON stored in 'sections'
df['text'] = df['sections'].map(extract_text_from_json)

# Preview the raw JSON next to the text pulled out of it
df.loc[:, ['sections', 'text']].head()


Unnamed: 0,sections,text
0,"{""output"": ""The Distortion of Binomial Voting ...",The Distortion of Binomial Voting Defies Expec...
1,"{""output"": ""LithoBench: Benchmarking AI Comput...",LithoBench: Benchmarking AI Computational\nLit...
2,"{""output"": ""Multi-Task Learning with Summary S...",Multi-Task Learning with Summary Statistics\nP...
3,"{""output"": ""EmbodiedGPT: Vision-Language Pre-T...",EmbodiedGPT: Vision-Language Pre-Training via\...
4,"{""output"": ""Finite Population Regression Adjus...",Finite Population Regression Adjustment and\nN...


In [3]:
# Hold out 20% of the rows for validation. Stratifying on the label column
# keeps the class ratio the same in both parts, and the fixed seed makes the
# partition repeatable.
train_df, valid_df = train_test_split(
    df,
    stratify=df['publishable'],
    random_state=42,
    test_size=0.2,
)

print("Train set size:", train_df.shape[0])
print("Validation set size:", valid_df.shape[0])


Train set size: 245
Validation set size: 62


In [4]:
from datasets import Dataset

import re

# Matches lone UTF-16 surrogate code points; compiled once and shared by both splits.
_SURROGATE_PATTERN = re.compile(r'[\ud800-\udfff]')


def _clean_text_column(frame):
    """Return a copy of `frame` with a sanitised 'text' column.

    Missing values become empty strings and surrogate code points are removed.
    The input frame is left untouched, so re-running the cell is safe and the
    frames returned by the split are never mutated in place.
    """
    cleaned = frame['text'].fillna("").map(lambda value: _SURROGATE_PATTERN.sub('', value))
    return frame.assign(text=cleaned)


train_df = _clean_text_column(train_df)
valid_df = _clean_text_column(valid_df)

# Now convert (the pandas index is dropped rather than stored as a column)
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
valid_dataset = Dataset.from_pandas(valid_df, preserve_index=False)
train_dataset, valid_dataset


(Dataset({
     features: ['id', 'file_name', 'publishable', 'conference', 'sections', 'text'],
     num_rows: 245
 }),
 Dataset({
     features: ['id', 'file_name', 'publishable', 'conference', 'sections', 'text'],
     num_rows: 62
 }))

In [5]:
from transformers import AutoTokenizer

# Hugging Face checkpoint id. The same name is passed to the model loader further
# down, so the tokenizer and the classifier come from one checkpoint.
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the "text" entries of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",  # or 'longest' or dynamic if you prefer
        "max_length": 512,
    }
    return tokenizer(examples["text"], **encode_options)

# Tokenize both splits batch-wise (training split first, then validation)
train_dataset, valid_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, valid_dataset)
)

# Return PyTorch tensors and expose only the model inputs plus the
# "publishable" label column
torch_columns = ["input_ids", "attention_mask", "publishable"]
train_dataset.set_format(type="torch", columns=torch_columns)
valid_dataset.set_format(type="torch", columns=torch_columns)


Map:   0%|          | 0/245 [00:00<?, ? examples/s]

Map:   0%|          | 0/62 [00:00<?, ? examples/s]

In [6]:
def _with_labels_column(dataset):
    """Rename "publishable" to "labels" and expose the model columns as torch tensors.

    The rename is skipped when it has already happened, so re-running this
    cell no longer fails on the missing "publishable" column.
    """
    if "publishable" in dataset.column_names:
        dataset = dataset.rename_column("publishable", "labels")
    # The format is re-applied after the rename so it refers to the new column name.
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return dataset


# The sequence-classification model reads its targets from a column named "labels".
train_dataset = _with_labels_column(train_dataset)
valid_dataset = _with_labels_column(valid_dataset)

In [7]:
import torch
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a two-class sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Prefer the GPU when CUDA is usable, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
! pip install -U 'transformers[torch]'

from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    arg-max over axis 1 of the logits. Returns a dict holding accuracy and
    the binary-averaged precision, recall and F1.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted, average='binary'
    )
    scores = {"accuracy": accuracy_score(labels, predicted)}
    scores["precision"] = precision
    scores["recall"] = recall
    scores["f1"] = f1
    return scores

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old spelling. Use whichever keyword the installed
# version accepts so the cell works before and after an upgrade.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./scibert_finetuned",  # directory to store model checkpoints
    save_strategy="epoch",             # save a checkpoint at each epoch
    per_device_train_batch_size=4,     # adjust based on your GPU memory
    per_device_eval_batch_size=4,
    num_train_epochs=3,                # feel free to increase
    learning_rate=2e-5,                # typical fine-tuning LR
    logging_dir="./logs",
    load_best_model_at_end=True,       # load best model according to eval metric
    **{_eval_strategy_key: "epoch"},   # evaluate at the end of each epoch
)

# Wire the model, the hyper-parameters, both splits and the metric callback together
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=valid_dataset,
    train_dataset=train_dataset,
)

trainer


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m



<transformers.trainer.Trainer at 0x15f376190>

In [9]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  0%|          | 0/186 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.29239562153816223, 'eval_accuracy': 0.8870967741935484, 'eval_precision': 0.9423076923076923, 'eval_recall': 0.9245283018867925, 'eval_f1': 0.9333333333333333, 'eval_runtime': 3.3738, 'eval_samples_per_second': 18.377, 'eval_steps_per_second': 4.742, 'epoch': 1.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.22644363343715668, 'eval_accuracy': 0.9354838709677419, 'eval_precision': 0.9298245614035088, 'eval_recall': 1.0, 'eval_f1': 0.9636363636363636, 'eval_runtime': 2.8423, 'eval_samples_per_second': 21.814, 'eval_steps_per_second': 5.629, 'epoch': 2.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.19934281706809998, 'eval_accuracy': 0.9354838709677419, 'eval_precision': 0.9298245614035088, 'eval_recall': 1.0, 'eval_f1': 0.9636363636363636, 'eval_runtime': 2.9597, 'eval_samples_per_second': 20.948, 'eval_steps_per_second': 5.406, 'epoch': 3.0}
{'train_runtime': 148.4414, 'train_samples_per_second': 4.951, 'train_steps_per_second': 1.253, 'train_loss': 0.1693200244698473, 'epoch': 3.0}


TrainOutput(global_step=186, training_loss=0.1693200244698473, metrics={'train_runtime': 148.4414, 'train_samples_per_second': 4.951, 'train_steps_per_second': 1.253, 'total_flos': 193386625689600.0, 'train_loss': 0.1693200244698473, 'epoch': 3.0})

In [10]:
# Detailed classification metrics
from sklearn.metrics import confusion_matrix, classification_report

# A single inference pass over the validation split provides both the aggregate
# metrics and the raw logits. Calling `trainer.evaluate()` first would run the
# same forward pass a second time. The "eval" prefix keeps the `eval_*` key names.
predictions = trainer.predict(valid_dataset, metric_key_prefix="eval")
eval_results = predictions.metrics
print("Evaluation Results:", eval_results)

y_preds = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_preds))

print("\nClassification Report:")
print(classification_report(y_true, y_preds, digits=3))

  0%|          | 0/16 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.19934281706809998, 'eval_accuracy': 0.9354838709677419, 'eval_precision': 0.9298245614035088, 'eval_recall': 1.0, 'eval_f1': 0.9636363636363636, 'eval_runtime': 3.9271, 'eval_samples_per_second': 15.788, 'eval_steps_per_second': 4.074, 'epoch': 3.0}


  0%|          | 0/16 [00:00<?, ?it/s]


Confusion Matrix:
[[ 5  4]
 [ 0 53]]

Classification Report:
              precision    recall  f1-score   support

           0      1.000     0.556     0.714         9
           1      0.930     1.000     0.964        53

    accuracy                          0.935        62
   macro avg      0.965     0.778     0.839        62
weighted avg      0.940     0.935     0.927        62



In [None]:
# Print the path of the Python interpreter backing this kernel
# (handy for checking which environment packages get installed into).
import sys
print(sys.executable)