In [1]:
import pandas as pd

<font size=4> Loading The Data </font>

In [2]:
column_names=[
        'id',                # Column 1: the ID of the statement ([ID].json).
        'label',             # Column 2: the label.
        'statement',         # Column 3: the statement.
        'subjects',          # Column 4: the subject(s).
        'speaker',           # Column 5: the speaker.
        'speaker_job_title', # Column 6: the speaker's job title.
        'state_info',        # Column 7: the state info.
        'party_affiliation', # Column 8: the party affiliation.
        
        # Column 9-13: the total credit history count, including the current statement.
        'count_1', # barely true counts.
        'count_2', # false counts.
        'count_3', # half true counts.
        'count_4', # mostly true counts.
        'count_5', # pants on fire counts.

        'context' # Column 14: the context (venue / location of the speech or statement).
]

train_data = pd.read_csv('./liar_dataset/train.tsv', sep='\t', header=None, names=column_names)
test_data  = pd.read_csv('./liar_dataset/test.tsv',  sep='\t', header=None, names=column_names)
valid_data = pd.read_csv('./liar_dataset/valid.tsv', sep='\t', header=None, names=column_names)

<font size=4> Preprocessing the Data </font>

In [3]:
from datasets import Dataset

def preprocess_data(data:pd.DataFrame, six_way:bool=True):

    # these data are not usable
    data.drop(columns=[f'count_{i+1}' for i in range(5)], inplace=True)

    # encoding output labels 
    if six_way:
        numerical={'pants-fire': 0, 'false': 1, 'barely-true': 2, 'half-true': 3, 'mostly-true': 4, 'true': 5}
        data['label'] = data['label'].map(numerical)
    else:
        true_labels= ['true', 'mostly-true', 'half-true']
        data['label'] = data['label'].apply(lambda x: 1 if x in true_labels else 0)

    # fill missing columns
    data.fillna('',inplace=True)

    # adding metadata
    data['statement'] = data['statement'] + ' ' + data['speaker'] + ' ' + data['speaker_job_title'] + ' ' + data['state_info'] + ' ' + data['party_affiliation'] + ' ' + data['context']

    # dropping every column other than label and statement
    data.drop(columns=data.columns.difference(['label', 'statement']), inplace=True)

    # data_dict= data.to_dict(orient='records')
    # return data_dict
    return Dataset.from_pandas(data)


In [4]:
train_data= preprocess_data(train_data)
test_data= preprocess_data(test_data)
valid_data= preprocess_data(valid_data)

<font size=4> I like it better when it is a single dataset </font>

In [5]:
from datasets import dataset_dict

dataset_dict= dataset_dict.DatasetDict()

dataset_dict['train']= train_data
dataset_dict['test']= test_data
dataset_dict['valid']= valid_data

In [6]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'statement'],
        num_rows: 10240
    })
    test: Dataset({
        features: ['label', 'statement'],
        num_rows: 1267
    })
    valid: Dataset({
        features: ['label', 'statement'],
        num_rows: 1284
    })
})

### Tokenizer

In [7]:
from transformers import AutoTokenizer
import torch

model_id = "answerdotai/ModernBERT-base"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.model_max_length=1024


# Tokenize function
def tokenizer_helper(sample):
    return tokenizer(sample['statement'], padding='max_length', truncation=True, return_tensors="pt")

In [8]:
# Tokenize the data

tokenized_dataset= dataset_dict.map(tokenizer_helper, batched=True, remove_columns=['statement'])

Map:   0%|          | 0/10240 [00:00<?, ? examples/s]

Map:   0%|          | 0/1267 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

In [9]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 10240
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1267
    })
    valid: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1284
    })
})

### Classifier

In [10]:
from transformers import AutoModelForSequenceClassification

model_id = "answerdotai/ModernBERT-base"


labels_2_id={'pants-fire': 0, 'false': 1, 'barely-true': 2, 'half-true': 3, 'mostly-true': 4, 'true': 5}
id_2_labels={v:k for k,v in labels_2_id.items()}


model=AutoModelForSequenceClassification.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, label2id=labels_2_id, id2label=id_2_labels, num_labels=6
)


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Setting the model to run on device

device = "cuda:0" if torch.cuda.is_available() else "cpu"

model=model.to(device)

In [22]:
import numpy as np
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score
 
# Metric helper method
def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [24]:
from transformers import Trainer, TrainingArguments
 
# Define training args
training_args = TrainingArguments(
    output_dir= "modernbert-llm-router",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-5,
    num_train_epochs=3,
    bf16=True, # bfloat16 training 
    optim="adamw_torch_fused", # improved optimizer 
    weight_decay=0.01,
    # logging & evaluation strategies
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)
 
# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['valid'],
    compute_metrics=compute_metrics,
)

```python
learning rate 2e-5
```

In [25]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.6806,1.714929,0.248442,0.227584,0.264276,0.248442
2,1.6922,1.709943,0.257788,0.236033,0.262224,0.257788
3,1.669,1.710153,0.257788,0.233415,0.262121,0.257788


TrainOutput(global_step=1920, training_loss=1.6872058868408204, metrics={'train_runtime': 790.0439, 'train_samples_per_second': 38.884, 'train_steps_per_second': 2.43, 'total_flos': 2.093674263478272e+16, 'train_loss': 1.6872058868408204, 'epoch': 3.0})

In [26]:
results=trainer.evaluate(tokenized_dataset['test'])

In [27]:
results

{'eval_loss': 1.70524001121521,
 'eval_accuracy': 0.24546172059984214,
 'eval_f1': 0.21830524330216378,
 'eval_precision': 0.23530852189219606,
 'eval_recall': 0.24546172059984214,
 'eval_runtime': 9.9723,
 'eval_samples_per_second': 127.052,
 'eval_steps_per_second': 8.022,
 'epoch': 3.0}

```python 
 learning rate 3e-5 
 ```


In [28]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.6632,1.707707,0.253894,0.234658,0.262722,0.253894
2,1.6811,1.705522,0.259346,0.241116,0.270224,0.259346
3,1.6557,1.70475,0.260903,0.241256,0.26934,0.260903


TrainOutput(global_step=1920, training_loss=1.672683842976888, metrics={'train_runtime': 786.1779, 'train_samples_per_second': 39.075, 'train_steps_per_second': 2.442, 'total_flos': 2.093674263478272e+16, 'train_loss': 1.672683842976888, 'epoch': 3.0})

In [29]:
results=trainer.evaluate(tokenized_dataset['test'])
results

{'eval_loss': 1.7024528980255127,
 'eval_accuracy': 0.24704025256511444,
 'eval_f1': 0.22468780805268285,
 'eval_precision': 0.24214275311439318,
 'eval_recall': 0.24704025256511444,
 'eval_runtime': 9.9392,
 'eval_samples_per_second': 127.475,
 'eval_steps_per_second': 8.049,
 'epoch': 3.0}