# Now Developing the LLM to analyze the file and produce the human readable output

In [1]:
import pandas as pd
import re

# Load the log file
log_file_path = 'cisco_log.txt'

# Define a function to parse the log entries
def parse_log(log_content):
    log_entries = []
    log_pattern = re.compile(r'(?P<date>\w+ +\d+ \d+:\d+:\d+).+:(?P<message>.+)')
    for line in log_content.split('\n'):
        if line.strip():
            match = log_pattern.match(line)
            if match:
                log_entries.append(match.groupdict())
            else:
                # Handle cases where the line does not match the expected format
                log_entries.append({'date': None, 'message': line.strip()})
    return log_entries

# Read and parse the log file
with open(log_file_path, 'r') as file:
    log_content = file.read()
    log_entries = parse_log(log_content)

# Convert to a DataFrame
df_logs = pd.DataFrame(log_entries)

# Save to CSV
csv_file_path = 'structured_logs.csv'
df_logs.to_csv(csv_file_path, index=False)
csv_file_path


'structured_logs.csv'

# Parsing 

In [2]:
import pandas as pd
from transformers import BertTokenizer

# Load the structured log file
csv_file_path = 'structured_logs.csv'
df_logs = pd.read_csv(csv_file_path)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the log messages
df_logs['tokens'] = df_logs['message'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
df_logs.head()


Unnamed: 0,date,message,tokens
0,,{\rtf1\ansi\ansicpg1252\cocoartf2761,"[101, 1063, 1032, 19387, 2546, 2487, 1032, 201..."
1,,\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\...,"[101, 1032, 22940, 18209, 15782, 2989, 2692, 1..."
2,,{\colortbl;\red255\green255\blue255;},"[101, 1063, 1032, 3609, 2102, 16558, 1025, 103..."
3,,{\*\expandedcolortbl;;},"[101, 1063, 1032, 1008, 1032, 4423, 18717, 210..."
4,,\margl1440\margr1440\vieww11520\viewh8400\view...,"[101, 1032, 9388, 23296, 16932, 12740, 1032, 9..."


# Initializing and training the bert model

In [3]:
import torch
from transformers import BertModel, BertConfig

# Configure the BERT model
config = BertConfig.from_pretrained('bert-base-uncased')
model = BertModel(config)

# Example of how the model can be used
sample_input = df_logs['tokens'].iloc[0]
sample_input_tensor = torch.tensor(sample_input).unsqueeze(0)  # Add batch dimension

# Pass the input through the model
outputs = model(sample_input_tensor)
print(outputs)


BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 1.1657, -0.0562, -1.1300,  ..., -0.0024,  0.0442, -1.2355],
         [-0.3658,  0.1583, -0.5717,  ...,  0.6249, -1.3246,  0.5423],
         [-0.1087,  0.8095, -0.8744,  ...,  0.3942, -1.1411, -0.5234],
         ...,
         [ 1.1914,  0.3439, -1.5971,  ...,  0.4967, -0.1702, -0.6801],
         [ 0.3705,  0.2289, -0.6759,  ..., -0.1672, -1.6718, -1.7447],
         [ 0.9166,  0.9594, -1.5302,  ...,  0.9869,  0.0999, -1.5372]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-2.9979e-01, -1.5988e-01, -7.4318e-01,  1.5011e-01,  3.7035e-01,
          6.7062e-01,  8.5490e-02, -3.8273e-01, -1.6664e-01, -1.8253e-01,
         -3.7210e-01, -3.1502e-01,  5.0265e-01,  4.0950e-02, -1.7972e-01,
          3.0770e-01, -4.7946e-01,  8.0233e-02, -6.9517e-01,  2.7487e-01,
         -5.0945e-01, -4.3295e-01, -7.1375e-01, -8.8962e-01, -4.1633e-01,
          2.6902e-01, -5.2506e-01,  4.4854e-02,  2.4527e-01, -3.957

In [4]:
import torch
from transformers import AdamW, get_linear_schedule_with_warmup, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

class LogDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return torch.tensor(self.df['tokens'].iloc[idx]), torch.tensor(self.df['label'].iloc[idx])

def collate_fn(batch):
    inputs, labels = zip(*batch)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    labels = torch.stack(labels)
    return inputs_padded, labels

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the structured log file
csv_file_path = 'structured_logs.csv'
df_logs = pd.read_csv(csv_file_path)

# Tokenize the log messages
df_logs['tokens'] = df_logs['message'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

# Assuming we have labels for supervised learning
df_logs['label'] = df_logs['message'].apply(lambda x: 1 if 'error' in x.lower() else 0)  # Example labeling
dataset = LogDataset(df_logs)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# Initialize the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader) * 3)

# Training loop
model.train()
for epoch in range(3):  # Example: 3 epochs
    for batch in dataloader:
        inputs, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

    print(f"Epoch {epoch + 1} completed")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1 completed
Epoch 2 completed
Epoch 3 completed


In [5]:
# Function to decode tokens to human-readable text
def decode_tokens(tokens):
    return tokenizer.decode(tokens, skip_special_tokens=True)

# Example usage
sample_output_tokens = outputs.logits[0].argmax(dim=-1).detach().numpy()  # Replace with actual output tokens
human_readable_output = decode_tokens(sample_output_tokens)
print(human_readable_output)


[ P A D ]


In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming we have a validation set
validation_loader = DataLoader(dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

model.eval()
preds = []
true_labels = []
with torch.no_grad():
    for batch in validation_loader:
        inputs, labels = batch
        outputs = model(inputs)
        preds.extend(torch.argmax(outputs.logits, dim=1).tolist())
        true_labels.extend(labels.tolist())

accuracy = accuracy_score(true_labels, preds)
precision = precision_score(true_labels, preds)
recall = recall_score(true_labels, preds)
f1 = f1_score(true_labels, preds)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 1.0
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


# Handling the issues 

In [7]:
# Labeling logic
df_logs['label'] = df_logs['message'].apply(lambda x: 1 if 'error' in x.lower() else 0)

# Check label distribution
label_counts = df_logs['label'].value_counts()
print(label_counts)


label
0    96
Name: count, dtype: int64


In [8]:
# Check the label distribution again
label_counts = df_logs['label'].value_counts()
print(label_counts)


label
0    96
Name: count, dtype: int64


In [9]:
# Adjusted labeling criteria
error_keywords = ['deny', 'fail', 'error', 'denied']  # Add more keywords if needed
df_logs['label'] = df_logs['message'].apply(lambda x: 1 if any(kw in x.lower() for kw in error_keywords) else 0)

# Check the label distribution again
label_counts = df_logs['label'].value_counts()
print(label_counts)


label
0    68
1    28
Name: count, dtype: int64


In [10]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df_logs[df_logs.label == 0]
df_minority = df_logs[df_logs.label == 1]

# Upsample minority class if it exists
if not df_minority.empty:
    df_minority_upsampled = resample(df_minority, 
                                     replace=True,    # sample with replacement
                                     n_samples=len(df_majority),  # to match majority class
                                     random_state=42)  # reproducible results

    # Combine majority class with upsampled minority class
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])
else:
    df_upsampled = df_logs  # If no minority samples, use the original dataframe

# Display new class counts
print(df_upsampled['label'].value_counts())


label
0    68
1    68
Name: count, dtype: int64


In [11]:
# Redefine dataset and dataloader
dataset = LogDataset(df_upsampled)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# Assuming a separate validation set is available
validation_dataset = LogDataset(df_logs)
validation_loader = DataLoader(validation_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)


In [12]:
# Initialize the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader) * 3)

# Training loop
model.train()
for epoch in range(3):  # Example: 3 epochs
    for batch in dataloader:
        inputs, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

    print(f"Epoch {epoch + 1} completed")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed
Epoch 2 completed
Epoch 3 completed


# Evaluate the model performance 

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model.eval()
preds = []
true_labels = []
with torch.no_grad():
    for batch in validation_loader:
        inputs, labels = batch
        outputs = model(inputs)
        preds.extend(torch.argmax(outputs.logits, dim=1).tolist())
        true_labels.extend(labels.tolist())

accuracy = accuracy_score(true_labels, preds)
precision = precision_score(true_labels, preds, zero_division=1)
recall = recall_score(true_labels, preds, zero_division=1)
f1 = f1_score(true_labels, preds, zero_division=1)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.5625
Precision: 0.36538461538461536
Recall: 0.6785714285714286
F1 Score: 0.475


In [14]:
# Assuming `model` is your trained model
model_save_path = 'Trained Model'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)


('Trained Model/tokenizer_config.json',
 'Trained Model/special_tokens_map.json',
 'Trained Model/vocab.txt',
 'Trained Model/added_tokens.json')