In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Locations of the labeled CSV input and of the fine-tuned model output
dataset_path = "./data/csic_database_labeled.csv"
output_dir = "./codebert_finetuned_csic"

# Read the whole CSV into a DataFrame
df = pd.read_csv(dataset_path)

# Fail fast unless every column the rest of the notebook reads is present
required_columns = ['llm_input', 'label']
if not set(required_columns).issubset(df.columns):
    raise ValueError(f"Dataset must contain columns: {required_columns}")


In [3]:

# Two-stage stratified split: first hold out 10% as the test set ...
train_val, test = train_test_split(
    df, test_size=0.1, stratify=df['label'], random_state=42
)
# ... then take 0.1111 (~10/90) of the remainder as validation,
# which leaves roughly 80% train / 10% validation / 10% test overall.
train, val = train_test_split(
    train_val, test_size=0.1111, stratify=train_val['label'], random_state=42
)

# Wrap each split (input text and label columns only) as a Hugging Face Dataset
train_dataset, val_dataset, test_dataset = [
    Dataset.from_pandas(frame[['llm_input', 'label']])
    for frame in (train, val, test)
]

# Tokenizer belonging to the CodeBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# Tokenize function
def tokenize_function(examples):
    """Tokenize the 'llm_input' text of a batch of examples.

    Args:
        examples: Mapping with an 'llm_input' entry holding the text(s) to
            encode (a list of strings when invoked through
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's encoding, with every sequence truncated or padded
        to exactly 512 tokens.
    """
    # Deliberately no `return_tensors`: Dataset.map stores its results as
    # Arrow data, so tensors built here would just be converted straight back.
    # Conversion to torch tensors is done once, via `set_format('torch', ...)`.
    return tokenizer(
        examples['llm_input'],
        # Fixed-length padding keeps all examples the same length, so batches
        # can be stacked without a padding-aware collator.
        padding="max_length",
        truncation=True,
        max_length=512,
    )


In [4]:

# Run the tokenizer over every split, one batch of rows at a time
train_dataset, val_dataset, test_dataset = [
    raw_split.map(tokenize_function, batched=True)
    for raw_split in (train_dataset, val_dataset, test_dataset)
]

# Expose only the model inputs and the label, returned as PyTorch tensors
model_columns = ['input_ids', 'attention_mask', 'label']
for tokenized_split in (train_dataset, val_dataset, test_dataset):
    tokenized_split.set_format('torch', columns=model_columns)


Map: 100%|██████████| 48852/48852 [00:07<00:00, 6129.13 examples/s]
Map: 100%|██████████| 6106/6106 [00:00<00:00, 6421.32 examples/s]
Map: 100%|██████████| 6107/6107 [00:00<00:00, 6327.40 examples/s]


In [5]:
# Load the pretrained CodeBERT encoder with a 2-class sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/codebert-base", num_labels=2
)

# Pick the accelerator at runtime instead of assuming Apple's MPS backend:
# a hard-coded "mps" makes `model.to(device)` raise on machines without it.
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
model.to(device)


# Metric callback handed to the Trainer for evaluation
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        Dict with keys "accuracy", "precision", "recall" and "f1";
        precision/recall/F1 use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": prf_scores[0],
        "recall": prf_scores[1],
        "f1": prf_scores[2],
    }


# Hyper-parameters and bookkeeping for the fine-tuning run
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir=output_dir,
    logging_dir="./logs",
    logging_steps=100,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # Batching: 8 examples per step, accumulated over 2 steps (effective batch size 16)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint when training ends
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Reproducibility and numeric precision
    seed=42,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is present
)

# Tie the model, the data splits and the metric callback together
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Train model: run the fine-tuning loop configured on `trainer`.
# NOTE(review): the output recorded for this cell ends in KeyboardInterrupt, so the
# saved notebook does not show a completed training run — re-run this cell to
# completion before relying on the evaluation/saving cell that follows.
print("Starting fine-tuning...")
trainer.train()

Starting fine-tuning...




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:

# --- Held-out evaluation: report every metric returned for the test split ---
print("\nEvaluating on test set...")
test_results = trainer.evaluate(test_dataset)
print("Test set results:")
for metric, value in test_results.items():
    print(f"{metric}: {value:.4f}")

# --- Persist the model and the tokenizer side by side in the output directory ---
print(f"\nSaving model to {output_dir}")
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

# --- Smoke test: classify one hand-written request ---
sample_input = "Method: GET\nURL: /tienda1/index.jsp\nContent: None"
sample_tokenized = tokenizer(
    sample_input,
    truncation=True,
    max_length=512,
    padding=True,
    return_tensors="pt",
)
model.eval()
with torch.no_grad():
    # Move every encoded tensor onto the model's device before the forward pass
    device_inputs = {name: tensor.to(model.device) for name, tensor in sample_tokenized.items()}
    outputs = model(**device_inputs)
    predicted_label = outputs.logits.argmax().item()
print(f"\nSample input: {sample_input}")
# Label 0 is reported as 'Normal', anything else as 'Anomalous'
print(f"Predicted label: {'Normal' if predicted_label == 0 else 'Anomalous'}")