# ModernBERT

In [1]:
from transformers import AutoTokenizer, ModernBertModel
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler


# Tokenizer and bare encoder are loaded from the same Hub checkpoint.
checkpoint = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = ModernBertModel.from_pretrained(checkpoint)

# Smoke test: encode one sentence as PyTorch tensors and run it through the
# encoder.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

# Keep the encoder's final hidden states for inspection.
last_hidden_states = outputs.last_hidden_state

  from .autonotebook import tqdm as notebook_tqdm


## Load data, prepare for training


In [2]:
import pandas as pd

# Read the three tab-separated splits; rows pandas cannot parse are skipped.
train_set = pd.read_csv("../data/event_pairs.train", sep="\t", on_bad_lines="skip")
dev_set = pd.read_csv("../data/event_pairs.dev", sep="\t", on_bad_lines="skip")
test_set = pd.read_csv("../data/event_pairs.test", sep="\t", on_bad_lines="skip")


def _span_columns(event, roles):
    """Return the ``<event>_<role>_start`` / ``_end`` column names, role by role."""
    return [f"{event}_{role}_{edge}" for role in roles for edge in ("start", "end")]


# Span roles that are dropped again below; the trigger span is kept.
_DROPPED_ROLES = ("participant1", "participant2", "time", "loc")
_ALL_ROLES = ("trigger", *_DROPPED_ROLES)

# Column layout applied to every split: sentence, its five spans, then the
# same for the second event, then the label.
col_names = [
    "sentence1",
    *_span_columns("e1", _ALL_ROLES),
    "sentence2",
    *_span_columns("e2", _ALL_ROLES),
    "label",
]

train_set.columns = col_names
dev_set.columns = col_names
# The test split is given two extra leading id columns.
test_set.columns = ["event_id_1", "event_id_2", *col_names]

# Remove the participant/time/location spans in place on every split.
_unused_columns = _span_columns("e1", _DROPPED_ROLES) + _span_columns("e2", _DROPPED_ROLES)
for dataset in (train_set, dev_set, test_set):
    dataset.drop(columns=_unused_columns, inplace=True)

In [None]:
import torch
from torch.utils.data import Dataset
import pandas as pd
from transformers import BertTokenizer

class CDECDataset(Dataset):
    """Map-style dataset that tokenizes rows of a tab-separated file on demand.

    Each item is a dict holding the flattened ``input_ids``,
    ``attention_mask`` and ``token_type_ids`` tensors returned by the
    tokenizer, plus a scalar ``label`` tensor of dtype ``torch.long``.
    """

    def __init__(self, data_path, tokenizer, max_len=128, *,
                 text_col='text', text_pair_col=None, label_col='label'):
        """
        Args:
            data_path (str): Path to the tab-separated data file; it is read
                with pandas defaults, so its first row supplies the column
                names.
            tokenizer: Hugging Face tokenizer; it is called directly on the
                text.
            max_len (int): Length every example is padded/truncated to.
            text_col (str): Column holding the (first) text.
            text_pair_col (str | None): Optional column holding a second
                text. When given, each item is encoded as a text pair
                (e.g. ``sentence1`` / ``sentence2``).
            label_col (str): Column holding the integer label.
        """
        self.data = pd.read_csv(data_path, sep='\t')
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_col = text_col
        self.text_pair_col = text_pair_col
        self.label_col = label_col

    def __len__(self):
        """Return the number of rows in the loaded file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors together with the label.

        Raises:
            KeyError: If a configured column is missing from the file.
        """
        # Look the row up once and read every field from it.
        row = self.data.iloc[idx]
        text = str(row[self.text_col])
        text_pair = None
        if self.text_pair_col is not None:
            text_pair = str(row[self.text_pair_col])

        # Call the tokenizer itself: `encode_plus` is deprecated in favour of
        # `__call__`, which takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            text_pair=text_pair,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields batched tensors; flatten them to 1-D so
        # a DataLoader can stack items into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'label': torch.tensor(row[self.label_col], dtype=torch.long),
        }

In [8]:
from transformers import AutoTokenizer, ModernBertForMaskedLM, ModernBertForSequenceClassification
import torch

# Fill-mask demo: predict the token hidden behind [MASK], then compute the
# masked-LM loss against the reference sentence.
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

# The steps below index per-token vocabulary logits and pass token-level
# labels, so the checkpoint is loaded with its masked-LM head. A
# sequence-classification head returns one logit vector per sequence and
# expects one label per sequence, which does not fit this workflow.
# FlashAttention-2 is not requested: it needs a CUDA device and fp16/bf16
# weights, and this cell never moves the model or the inputs to a GPU nor
# selects a half-precision dtype. To enable it, add
# attn_implementation="flash_attention_2" together with a half-precision
# dtype, and move both the model and the inputs to the GPU.
model = ModernBertForMaskedLM.from_pretrained("answerdotai/ModernBERT-base")

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")

# Inference only: no autograd graph is needed for the prediction.
with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of [MASK]
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

# Highest-scoring vocabulary id at the masked position.
predicted_token_id = logits[0, mask_token_index].argmax(dim=-1)

labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
# mask labels of non-[MASK] tokens (-100 is ignored by the loss)
labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)

outputs = model(**inputs, labels=labels)

tensor([6])
