# Leveraging SciBERT for Enhanced Named Entity Recognition in Computer Science Literature

## Preprocessing

In [1]:
import arxiv

In [2]:
# arXiv subject categories included in the query:
#   cs.AI  Artificial Intelligence
#   cs.LG  Machine Learning
#   cs.CV  Computer Vision and Pattern Recognition
#   cs.RO  Robotics
#   cs.IR  Information Retrieval
#   cs.ET  Emerging Technologies
#   cs.NE  Neural and Evolutionary Computing
#   cs.MA  Multi-agent systems
#   cs.CL  Computational Linguistics
#   cs.SI  Social and Information Networks
arxiv_categories = [
    "cs.AI", "cs.LG", "cs.CV", "cs.RO", "cs.IR",
    "cs.ET", "cs.NE", "cs.MA", "cs.CL", "cs.SI",
]

client = arxiv.Client()

# One "cat:<code>" clause per category, OR-ed together into a single query.
category_query = " OR ".join(f"cat:{code}" for code in arxiv_categories)

search = arxiv.Search(
    query=category_query,
    max_results=1000,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)

# Handle on the results of the search, as returned by the client.
results = client.results(search)


In [3]:
# Write every paper returned by the search as "title\nsummary\n\n".
# The encoding is pinned to UTF-8: titles and summaries may contain non-ASCII
# characters, and the platform default encoding cannot be relied on to
# represent them.
with open('papers.txt', 'w', encoding='utf-8') as f:
    for paper in client.results(search):
        f.write(paper.title + '\n' + paper.summary + '\n\n')

In [13]:
import kagglehub

# Download the latest version of the Cornell arXiv dataset through kagglehub
# and keep the local directory it reports.
dataset_handle = "Cornell-University/arxiv"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)


Download already complete (1444409270 bytes).
Extracting files...
Path to dataset files: /Users/jarenbresnick/.cache/kagglehub/datasets/Cornell-University/arxiv/versions/201


In [18]:
import json

# Collect the full metadata record of every article tagged with at least one
# category of interest, then persist the selection as a single JSON document.
filtered_articles = []
categories_of_interest = ["cs.AI", "cs.LG", "cs.CV", "cs.RO", "cs.IR", "cs.ET", "cs.NE", "cs.MA", "cs.CL", "cs.SI"]

# Set form of the list above, for exact per-category membership tests.
wanted_categories = set(categories_of_interest)

# The snapshot holds one JSON object per line. Read it as UTF-8 explicitly so
# decoding does not depend on the platform's default encoding.
with open(path+"/arxiv-metadata-oai-snapshot.json", 'r', encoding='utf-8') as file:
    for line in file:
        # Only the parse itself is guarded, so unrelated errors are not
        # reported as JSON problems.
        try:
            article = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            continue  # Skip lines that can't be parsed

        # `categories` is a whitespace-separated string of category codes
        # (e.g. "cs.AI cs.LG"). Compare whole codes instead of substrings,
        # and treat a missing/null field as "no categories".
        article_categories = (article.get('categories') or '').split()
        if not wanted_categories.isdisjoint(article_categories):
            filtered_articles.append(article)

# Save the filtered articles to a new JSON file (optional)
with open('filtered_articles.json', 'w', encoding='utf-8') as outfile:
    json.dump(filtered_articles, outfile, indent=4)

# Print the number of filtered articles
print(f"Number of filtered articles: {len(filtered_articles)}")



Number of filtered articles: 428211


In [25]:
import json

# Stream the snapshot and write "title\nabstract\n\n" for every article tagged
# with at least one category of interest.

# Not populated in this cell; the assignment is kept so the name stays bound
# to an empty list after the cell runs, as before.
filtered_articles = []
categories_of_interest = ["cs.AI", "cs.LG", "cs.CV", "cs.RO", "cs.IR", "cs.ET", "cs.NE", "cs.MA", "cs.CL", "cs.SI"]

# Set form of the list above, for exact per-category membership tests.
wanted_categories = set(categories_of_interest)

# Open the file and process line by line
file_path = path + '/arxiv-metadata-oai-snapshot.json'
output_file = 'filtered_articles.txt'

# Both files are opened as UTF-8 explicitly: the metadata contains non-ASCII
# text and the platform default encoding is not guaranteed to handle it.
with open(file_path, 'r', encoding='utf-8') as file, open(output_file, 'w', encoding='utf-8') as output:
    for line in file:
        # Only the parse itself is guarded, so unrelated errors are not
        # reported as JSON problems.
        try:
            article = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            continue  # Skip lines that can't be parsed

        # `categories` is a whitespace-separated string of category codes
        # (e.g. "cs.AI cs.LG"). Compare whole codes instead of substrings,
        # and treat a missing/null field as "no categories".
        article_categories = (article.get('categories') or '').split()
        if wanted_categories.isdisjoint(article_categories):
            continue

        # Collapse all internal whitespace (including hard-wrapped line
        # breaks) so the title and the abstract each occupy exactly one line.
        # The file is consumed line by line downstream, so an embedded newline
        # would otherwise split one abstract into several separate examples.
        # Missing, null or empty fields fall back to the placeholder text.
        title = " ".join((article.get('title') or 'No Title').split())
        abstract = " ".join((article.get('abstract') or 'No Abstract').split())

        # Write the title and abstract to the output text file
        output.write(f"{title}\n{abstract}\n\n")

# Inform the user that the file has been written
print(f"Filtered articles have been written to {output_file}")


Filtered articles have been written to filtered_articles.txt


In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, DatasetDict
import numpy as np
from sklearn.metrics import accuracy_score

# Load the SciBERT masked-language model and its matching tokenizer.
model = AutoModelForMaskedLM.from_pretrained("allenai/scibert_scivocab_uncased")
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

# Load the filtered corpus with the plain-text loader.
dataset = load_dataset('text', data_files={'train': 'filtered_articles.txt'})

# Hold out 10% of the examples for validation. The seed is fixed so the same
# examples land in the validation set on every run; without it the split
# (and therefore every reported evaluation number) changes between runs.
split_dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)

# Expose the held-out part under the name 'validation' instead of 'test'.
split_dataset = DatasetDict({
    'train': split_dataset['train'],
    'validation': split_dataset['test']
})

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level tokenizer.

    Args:
        examples: Mapping that provides the raw text under the key ``"text"``.

    Returns:
        Whatever the tokenizer returns for that text, with truncation enabled
        and a maximum length of 512.
    """
    raw_text = examples["text"]
    encoded = tokenizer(raw_text, truncation=True, max_length=512)
    return encoded

# Run the tokenizer over both splits, one batch of examples at a time.
tokenized_datasets = split_dataset.map(tokenize_function, batched=True)

# Collator that applies masking when batches are assembled (dynamic masking);
# 15% of the tokens are selected for masking.
mlm_collator_options = {
    "tokenizer": tokenizer,
    "mlm": True,
    "mlm_probability": 0.15,
}
data_collator = DataCollatorForLanguageModeling(**mlm_collator_options)

# Hyper-parameters for the fine-tuning run, gathered in one place.
training_config = {
    "output_dir": "./results",
    "evaluation_strategy": "epoch",
    "learning_rate": 2e-5,
    # Small per-device batches ...
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    # ... with gradients accumulated over 2 steps, which effectively doubles
    # the training batch size.
    "gradient_accumulation_steps": 2,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
}
training_args = TrainingArguments(**training_config)


# Custom metrics function to compute perplexity and accuracy
def compute_metrics(eval_pred):
    """Compute masked-token accuracy, cross-entropy loss and perplexity.

    Only positions whose label is not ``-100`` are scored.

    The loss is computed here from the logits. ``eval_pred`` is only unpacked
    into ``(predictions, labels)``, so nothing guarantees it also carries a
    ``loss`` attribute; relying on one made this function report a loss of
    ``0.0`` and an infinite perplexity whenever the attribute was absent.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            raw logits with the vocabulary on the last axis and ``labels``
            holds the target ids with the same leading shape
            (assumed ``(batch, seq, vocab)`` and ``(batch, seq)``).

    Returns:
        dict with
            ``"accuracy"``: fraction of scored positions whose arg-max
                prediction equals the label,
            ``"loss"``: mean negative log-likelihood over scored positions,
            ``"perplexity"``: ``exp(loss)``.
        If no position is scored, accuracy is ``0.0`` and loss/perplexity are
        ``nan``.

    NOTE(review): the Trainer is expected to report these keys with an
    ``eval_`` prefix, so ``"loss"`` surfaces as ``eval_loss`` — confirm
    against the installed transformers version.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Positions labelled -100 are not prediction targets and must not count.
    scored = labels != -100
    if not scored.any():
        return {"accuracy": 0.0, "perplexity": float("nan"), "loss": float("nan")}

    # Keep only the scored rows: (num_scored, vocab) and (num_scored,).
    scored_logits = logits[scored].astype(np.float64)
    scored_labels = labels[scored]

    predicted_ids = scored_logits.argmax(axis=-1)
    accuracy = float(np.mean(predicted_ids == scored_labels))

    # Numerically stable log-softmax: subtract the row maximum before exp.
    shifted = scored_logits - scored_logits.max(axis=-1, keepdims=True)
    log_normalizer = np.log(np.exp(shifted).sum(axis=-1))
    target_scores = shifted[np.arange(scored_labels.shape[0]), scored_labels]

    # Cross-entropy averaged over scored tokens; perplexity is its exponential.
    loss = float(np.mean(log_normalizer - target_scores))
    perplexity = float(np.exp(loss))

    return {
        "accuracy": accuracy,
        "perplexity": perplexity,
        "loss": loss
    }

# Pick the two tokenized splits up front so the Trainer call reads cleanly.
train_split = tokenized_datasets["train"]
validation_split = tokenized_datasets["validation"]

# Wire the model, data, collator and custom metrics into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=validation_split,
    compute_metrics=compute_metrics,  # custom evaluation metrics
)

# Run the fine-tuning loop.
trainer.train()

# Final evaluation on the validation split.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")