In [None]:
import datasets 
from transformers import PreTrainedTokenizerFast
from transformers import Trainer, TrainingArguments
from transformers import RobertaForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import os 
import warnings
from collections import defaultdict

# Suppress every Python warning for the rest of the session so cell output
# stays compact.
warnings.simplefilter("ignore")

# Shared sequence-length setting: used as the tokenizer's max_length and to
# size the model's position embeddings.
MAX_LENGTH = 64

In [None]:
def handle_sample(sample):
    """Tokenize a batch of texts and flatten overflow chunks into rows.

    Each text is tokenized on its own with truncation and padding to
    ``MAX_LENGTH`` while keeping the overflowing tokens, so one input text can
    yield several encoded chunks. Every chunk becomes one output row and
    inherits the label of the text it came from.

    Args:
        sample: Batch mapping with parallel ``'text'`` and ``'label'`` sequences.

    Returns:
        dict: Column name -> list of per-chunk values. Contains every key the
        tokenizer returned plus ``'label'``; empty when the batch is empty.
    """
    columns = {}

    for text, label in zip(sample['text'], sample['label']):
        encoding = tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=MAX_LENGTH,
            return_overflowing_tokens=True,
            return_special_tokens_mask=True,
        )

        chunk_count = len(encoding['input_ids'])
        for chunk_idx in range(chunk_count):
            # Copy this chunk's value out of every tokenizer field ...
            for field in encoding:
                columns.setdefault(field, []).append(encoding[field][chunk_idx])
            # ... and repeat the source text's label once per chunk.
            columns.setdefault('label', []).append(label)

    return columns

# Subsampling settings. A fixed seed makes the shuffled subsample identical
# on every "Restart & Run All", so results are comparable between runs.
SEED = 42
SAMPLE_FRACTION = 0.1

tokenizer = PreTrainedTokenizerFast.from_pretrained("../MalBERTa")
dataset = datasets.load_from_disk("../data/raw")

# Keep a random SAMPLE_FRACTION of the test and train splits.
for split in ('test', 'train'):
    keep = int(len(dataset[split]) * SAMPLE_FRACTION)
    dataset[split] = dataset[split].shuffle(seed=SEED).select(range(keep))

# Tokenize in batches; the original columns are dropped because
# handle_sample returns a different number of rows than it receives.
processed_dataset = dataset.map(
    handle_sample,
    remove_columns=dataset['test'].column_names,
    batch_size=64,
    batched=True,
    num_proc=8,
)
del dataset  # the raw splits are not used after tokenization
processed_dataset

In [None]:
from transformers import RobertaConfig 

def compute_metrics(eval_pred):
    """Score evaluation predictions against the reference labels.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits along the last axis.

    Returns:
        dict: ``accuracy`` plus weighted-average ``precision``, ``recall`` and
        ``f1`` (classes with no predictions score 0 instead of warning).
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)

    results = {"accuracy": accuracy_score(gold, predicted)}
    weighted_metrics = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, metric_fn in weighted_metrics:
        results[metric_name] = metric_fn(
            gold, predicted, average="weighted", zero_division=0
        )
    return results

# Build a freshly initialised RoBERTa sequence classifier from an explicit
# config (no pretrained weights are loaded here).
# NOTE(review): num_labels is left at the config default; confirm it matches
# the number of classes in the dataset's 'label' column.
model = RobertaForSequenceClassification(
    config=RobertaConfig(
        # Size the embedding table from the tokenizer that produced the
        # input ids. Without this the config default vocabulary size is used,
        # which fails with out-of-range ids if the tokenizer is larger and
        # wastes parameters if it is smaller.
        vocab_size=len(tokenizer),
        attention_probs_dropout_prob=0.001, 
        # NOTE(review): special-token ids are hard-coded; confirm they agree
        # with the tokenizer's bos/eos/pad ids.
        bos_token_id=0, 
        eos_token_id=2, 
        hidden_act='gelu_new',
        hidden_dropout_prob=0.01, 
        hidden_size=512, 
        initializer_range=0.02, 
        intermediate_size=2048, 
        layer_norm_eps=0.01, 
        # Presumably the +2 accounts for RoBERTa's position-id offset on top
        # of the MAX_LENGTH tokens per sequence -- confirm.
        max_position_embeddings=MAX_LENGTH + 2, 
        num_attention_heads=4, 
        num_hidden_layers=7, 
        pad_token_id=1, 
    )
)

# Training configuration: a single epoch, evaluation at the end of the epoch,
# no intermediate checkpoints, and per-step loss logging sent to Weights & Biases.
train_args = TrainingArguments(
    output_dir="./MalBERTa-classifier",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=256, 
    per_device_eval_batch_size=512, 
    save_strategy="no",
    eval_strategy="epoch",
    logging_steps=1,
    # Valid strategies are "no", "steps" and "epoch"; the previous value
    # "step" is rejected when TrainingArguments is constructed.
    logging_strategy="steps",
    report_to="wandb",
)

# Wire the model, data, tokenizer and metric callback into a Trainer.
trainer = Trainer(
    # what to train and how
    args=train_args,
    model=model,
    processing_class=tokenizer,
    # data: tokenized train split for fitting, test split for evaluation
    eval_dataset=processed_dataset['test'],
    train_dataset=processed_dataset['train'],
    # metrics reported at each evaluation
    compute_metrics=compute_metrics,
)

# Run the training loop; the returned summary is shown as the cell output.
trainer.train()

In [None]:
# Persist the trained model into the Trainer's configured output directory.
# Trainer exposes `save_model()`; the previous `trainer.save()` call raised
# AttributeError, which the bare `except` hid, so nothing was written.
try:
    trainer.save_model()
except OSError as err:
    # Still best-effort, but only filesystem errors are tolerated and they
    # are reported. print() is used because this notebook silences warnings.
    print(f"Could not save the model to {trainer.args.output_dir!r}: {err}")

In [None]:
# Export the model's weights and config to a directory one level above the notebook.
model.save_pretrained(save_directory='../MalBERTa-classifier')

In [None]:
# Re-imported so this cell can be run on its own to reload the saved model.
from transformers import RobertaForSequenceClassification

# Sanity check: reload the classifier from the directory that
# `model.save_pretrained('../MalBERTa-classifier')` writes to. The previous
# path "./MalBERTa-classifier" is not where that export is stored.
RobertaForSequenceClassification.from_pretrained("../MalBERTa-classifier")