# Leveraging SciBERT for Enhanced Named Entity Recognition in Computer Science Literature

## Preprocessing

In [1]:
import arxiv

In [2]:
client = arxiv.Client()

# categories:

# Artificial Intelligence (cs.AI)
# Machine Learning (cs.LG)
# Computer Vision and Pattern Recognition (cs.CV)
# Robotics (cs.RO)
# Information Retrieval (cs.IR)
# Emerging Technologies (cs.ET)
# Neural and Evolutionary Computing (cs.NE)
# Multi-agent systems (cs.MA)
# Computational Linguistics (cs.CL)
# Social and Information Networks (cs.SI)


search = arxiv.Search(
    query = "cat:cs.AI OR cat:cs.LG OR cat:cs.CV OR cat:cs.RO OR cat:cs.IR OR cat:cs.ET OR cat:cs.NE OR cat:cs.MA OR cat:cs.CL OR cat:cs.SI",
    max_results=1000,
    sort_by=arxiv.SortCriterion.SubmittedDate
)

results = client.results(search)


In [3]:
with open ('papers.txt', 'w') as f:
    for r in client.results(search):
        f.write(r.title + '\n' + r.summary + '\n\n')

In [4]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, DatasetDict

# Load the SciBERT model and tokenizer
model = AutoModelForMaskedLM.from_pretrained("allenai/scibert_scivocab_uncased")
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

# Load the dataset
dataset = load_dataset('text', data_files={'train': 'papers.txt'})

# Split the dataset into train and validation (e.g., 90% train, 10% validation)
split_dataset = dataset['train'].train_test_split(test_size=0.1)

# Rename the split to 'train' and 'validation'
split_dataset = DatasetDict({
    'train': split_dataset['train'],
    'validation': split_dataset['test']
})

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_datasets = split_dataset.map(tokenize_function, batched=True)

# Create a data collator for dynamic masking
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=True, 
    mlm_probability=0.15  # 15% of tokens will be masked
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,  # Ensure data collator is used to create MLM masking
)

# Fine-tune the model
trainer.train()



Map:   0%|          | 0/17402 [00:00<?, ? examples/s]

Map:   0%|          | 0/1934 [00:00<?, ? examples/s]

  0%|          | 0/3264 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.