# Leveraging SciBERT for Enhanced Named Entity Recognition in Computer Science Literature

## Preprocessing

In [1]:
import arxiv

In [2]:
client = arxiv.Client()

# categories:

# Artificial Intelligence (cs.AI)
# Machine Learning (cs.LG)
# Computer Vision and Pattern Recognition (cs.CV)
# Robotics (cs.RO)
# Information Retrieval (cs.IR)
# Emerging Technologies (cs.ET)
# Neural and Evolutionary Computing (cs.NE)
# Multi-agent systems (cs.MA)
# Computational Linguistics (cs.CL)
# Social and Information Networks (cs.SI)


search = arxiv.Search(
    query = "cat:cs.AI OR cat:cs.LG OR cat:cs.CV OR cat:cs.RO OR cat:cs.IR OR cat:cs.ET OR cat:cs.NE OR cat:cs.MA OR cat:cs.CL OR cat:cs.SI",
    max_results=1000,
    sort_by=arxiv.SortCriterion.SubmittedDate
)

results = client.results(search)


In [3]:
with open ('papers.txt', 'w') as f:
    for r in client.results(search):
        f.write(r.title + '\n' + r.summary + '\n\n')

In [6]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, DatasetDict
import numpy as np
from sklearn.metrics import accuracy_score

# Load the SciBERT model and tokenizer
model = AutoModelForMaskedLM.from_pretrained("allenai/scibert_scivocab_uncased")
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

# Load the dataset
dataset = load_dataset('text', data_files={'train': 'papers.txt'})

# Split the dataset into train and validation (e.g., 90% train, 10% validation)
split_dataset = dataset['train'].train_test_split(test_size=0.1)

# Rename the split to 'train' and 'validation'
split_dataset = DatasetDict({
    'train': split_dataset['train'],
    'validation': split_dataset['test']
})

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_datasets = split_dataset.map(tokenize_function, batched=True)

# Create a data collator for dynamic masking
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=True, 
    mlm_probability=0.15  # 15% of tokens will be masked
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Smaller batch size
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Accumulates gradients over 2 steps (effectively doubling batch size)
    num_train_epochs=3,
    weight_decay=0.01,
)


# Custom metrics function to compute perplexity and accuracy
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)

    # Mask the labels where they are -100 (those shouldn't count in accuracy)
    mask = labels != -100
    accuracy = accuracy_score(labels[mask], predictions[mask])
    
    # Compute perplexity as exponential of the loss
    loss = np.mean(eval_pred.loss) if hasattr(eval_pred, "loss") else 0.0
    perplexity = np.exp(loss) if loss else float("inf")
    
    return {
        "accuracy": accuracy,
        "perplexity": perplexity,
        "loss": loss
    }

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  # Add custom evaluation metrics
)

# Fine-tune the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")



Map:   0%|          | 0/1934 [00:00<?, ? examples/s]

  0%|          | 0/3264 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 4.43 GB, other allocations: 2.27 GB, max allowed: 6.77 GB). Tried to allocate 91.08 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).