In [1]:
import pandas as pd

In [2]:
column_names=[
        'id',                # Column 1: the ID of the statement ([ID].json).
        'label',             # Column 2: the label.
        'statement',         # Column 3: the statement.
        'subjects',          # Column 4: the subject(s).
        'speaker',           # Column 5: the speaker.
        'speaker_job_title', # Column 6: the speaker's job title.
        'state_info',        # Column 7: the state info.
        'party_affiliation', # Column 8: the party affiliation.
        
        # Column 9-13: the total credit history count, including the current statement.
        'count_1', # barely true counts.
        'count_2', # false counts.
        'count_3', # half true counts.
        'count_4', # mostly true counts.
        'count_5', # pants on fire counts.

        'context' # Column 14: the context (venue / location of the speech or statement).
]

train_data = pd.read_csv('./liar_dataset/train.tsv', sep='\t', header=None, names=column_names)
test_data  = pd.read_csv('./liar_dataset/test.tsv',  sep='\t', header=None, names=column_names)
valid_data = pd.read_csv('./liar_dataset/valid.tsv', sep='\t', header=None, names=column_names)

In [3]:
from datasets import Dataset

def preprocess_data(data:pd.DataFrame, six_way:bool=True):

    # these data are not usable
    data.drop(columns=[f'count_{i+1}' for i in range(5)], inplace=True)

    # encoding output labels 
    if six_way:
        numerical={'pants-fire': 0, 'false': 1, 'barely-true': 2, 'half-true': 3, 'mostly-true': 4, 'true': 5}
        data['label'] = data['label'].map(numerical)
    else:
        true_labels= ['true', 'mostly-true', 'half-true']
        data['label'] = data['label'].apply(lambda x: 1 if x in true_labels else 0)

    # fill missing columns
    data.fillna('',inplace=True)

    # adding metadata
    data['statement'] = data['statement'] + ' ' + data['speaker'] + ' ' + data['speaker_job_title'] + ' ' + data['state_info'] + ' ' + data['party_affiliation'] + ' ' + data['context']

    # dropping every column other than label and statement
    data.drop(columns=data.columns.difference(['label', 'statement']), inplace=True)

    # data_dict= data.to_dict(orient='records')
    # return data_dict
    return Dataset.from_pandas(data)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
train_data= preprocess_data(train_data)
test_data= preprocess_data(test_data)
valid_data= preprocess_data(valid_data)

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Step 0: Import the model and tokenizer

model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model=AutoModelForSequenceClassification.from_pretrained(model_id, torch_dtype=torch.bfloat16)

def tokenize(sample):
    return tokenizer(sample['statement'], padding=True, truncation=True, max_length=1024)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Tokenize the data

train_data= train_data.map(tokenize, batched=True)
test_data= test_data.map(tokenize, batched=True)
valid_data= valid_data.map(tokenize, batched=True)

Map: 100%|██████████| 10240/10240 [00:00<00:00, 14631.31 examples/s]
Map: 100%|██████████| 1267/1267 [00:00<00:00, 7748.95 examples/s]
Map: 100%|██████████| 1284/1284 [00:00<00:00, 23818.73 examples/s]


In [7]:
from pprint import pprint

print(train_data[1]['input_ids'])

[50281, 3039, 858, 253, 10343, 273, 10089, 1265, 32, 733, 3053, 672, 3626, 3678, 2335, 745, 326, 3053, 281, 3135, 275, 313, 20349, 6086, 411, 2698, 11301, 84, 5286, 15, 660, 1519, 14, 84, 1822, 39702, 2418, 24565, 9385, 8738, 255, 247, 5254, 6519, 15, 50282, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283]


In [18]:
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_recall_fscore_support
from transformers import TrainingArguments, Trainer
import numpy as np

def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

training_args = TrainingArguments(
    output_dir="./results",           # Directory to save the model
    evaluation_strategy="epoch",     # Evaluate at the end of each epoch
    save_strategy="epoch",           # Save model at the end of each epoch
    #learning_rate=2e-5,              # Learning rate
    learning_rate=1e-4,                 # Learning rate
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay
    logging_dir="./logs",            # Directory for logging
    logging_steps=10,                # Log every 10 steps
    load_best_model_at_end=True,     # Load the best model at the end of training
    save_total_limit=2               # Limit the number of saved models
)

# Step 8: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=valid_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [19]:
trainer.train()


  0%|          | 10/3840 [01:43<8:09:32,  7.67s/it]
  0%|          | 10/3840 [01:43<8:09:32,  7.67s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 9.973958333333334e-05, 'epoch': 0.01}


  0%|          | 17/3840 [02:52<11:05:55, 10.45s/it]