# Importing the data and preprocessing it

In [1]:
import pandas as pd
import re
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.nn.utils.rnn import pad_sequence
from sklearn.utils import resample

# Load the log file
log_file_path = 'cisco_log.txt'

# Define a function to parse the log entries
def parse_log(log_content):
    log_entries = []
    log_pattern = re.compile(r'(?P<date>\w+ +\d+ \d+:\d+:\d+).+:(?P<message>.+)')
    for line in log_content.split('\n'):
        if line.strip():
            match = log_pattern.match(line)
            if match:
                log_entries.append(match.groupdict())
            else:
                log_entries.append({'date': None, 'message': line.strip()})
    return log_entries

# Read and parse the log file
with open(log_file_path, 'r') as file:
    log_content = file.read()
    log_entries = parse_log(log_content)

# Convert to a DataFrame
df_logs = pd.DataFrame(log_entries)

# Labeling logic
error_keywords = ['deny', 'fail', 'error', 'denied']
df_logs['label'] = df_logs['message'].apply(lambda x: 1 if any(kw in x.lower() for kw in error_keywords) else 0)

# Separate majority and minority classes
df_majority = df_logs[df_logs.label == 0]
df_minority = df_logs[df_logs.label == 1]

# Upsample minority class if it exists
if not df_minority.empty:
    df_minority_upsampled = resample(df_minority, 
                                     replace=True,    
                                     n_samples=len(df_majority),  
                                     random_state=42)

    df_upsampled = pd.concat([df_majority, df_minority_upsampled])
else:
    df_upsampled = df_logs  

# Display new class counts
print(df_upsampled['label'].value_counts())


label
0    68
1    68
Name: count, dtype: int64


# Tokenizing

In [3]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the log messages
df_upsampled['tokens'] = df_upsampled['message'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))


# Dataset and Dataloader

In [4]:
class LogDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return torch.tensor(self.df['tokens'].iloc[idx]), torch.tensor(self.df['label'].iloc[idx])

def collate_fn(batch):
    inputs, labels = zip(*batch)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    labels = torch.stack(labels)
    return inputs_padded, labels

# Create dataset and dataloader
dataset = LogDataset(df_upsampled)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)


# Model Initialization and Training

In [5]:
# Initialize the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader) * 3)

# Training loop
model.train()
for epoch in range(3):  # Example: 3 epochs
    for batch in dataloader:
        inputs, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
    print(f"Epoch {epoch + 1} completed")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1 completed
Epoch 2 completed
Epoch 3 completed


# Model Evaluation

In [6]:
# Assuming a separate validation set is available
validation_dataset = LogDataset(df_upsampled)
validation_loader = DataLoader(validation_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

model.eval()
preds = []
true_labels = []
with torch.no_grad():
    for batch in validation_loader:
        inputs, labels = batch
        outputs = model(inputs)
        preds.extend(torch.argmax(outputs.logits, dim=1).tolist())
        true_labels.extend(labels.tolist())

accuracy = accuracy_score(true_labels, preds)
precision = precision_score(true_labels, preds, zero_division=1)
recall = recall_score(true_labels, preds, zero_division=1)
f1 = f1_score(true_labels, preds, zero_division=1)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.6397058823529411
Precision: 0.6301369863013698
Recall: 0.6764705882352942
F1 Score: 0.652482269503546


In [7]:
model_save_path = 'trained_model_from_SCRATCH'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)


('trained_model_from_SCRATCH/tokenizer_config.json',
 'trained_model_from_SCRATCH/special_tokens_map.json',
 'trained_model_from_SCRATCH/vocab.txt',
 'trained_model_from_SCRATCH/added_tokens.json')