# In [12]:
import argparse
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import sys
import re

def load_model(model_path):
    """Load the sequence classifier and its tokenizer from ``model_path``.

    Args:
        model_path: Location handed unchanged to both ``from_pretrained``
            calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Tuple elements are evaluated left to right, so the model is still
    # loaded before the tokenizer.
    return (
        BertForSequenceClassification.from_pretrained(model_path),
        BertTokenizer.from_pretrained(model_path),
    )

# ASA-style line: "<Mon> <day> <year> <HH:MM:SS>: <message>".
_ASA_LINE = re.compile(r'(\w+ \d+ \d{4} \d{2}:\d{2}:\d{2}): (.+)')
# Message identifier embedded in an ASA message, e.g. "%ASA-6-302013:".
_ASA_MESSAGE_ID = re.compile(r'%ASA-\d+-\d+:')
# Generic line: "<YYYY-MM-DD HH:MM:SS>, <Info|Warning|Error> <message>".
_GENERIC_LINE = re.compile(
    r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}), (Info|Warning|Error)\s*(.+)'
)


def _parse_log_line(line):
    """Return the parsed entry for one log line, or ``None`` if unrecognised."""
    asa_match = _ASA_LINE.match(line)
    if asa_match:
        timestamp, message = asa_match.groups()
        id_match = _ASA_MESSAGE_ID.search(message)
        if id_match:
            message_id = id_match.group(0)
            readable_message = message.replace(message_id, "").strip()
        else:
            # No identifier present: keep the message untouched. Replacing the
            # "Unknown" placeholder would delete that word from real messages.
            message_id = "Unknown"
            readable_message = message.strip()
        return {
            'timestamp': timestamp,
            'message_id': message_id,
            'readable_message': readable_message,
        }

    # The two formats cannot both match: the generic timestamp has a "-"
    # where the ASA pattern requires a space.
    generic_match = _GENERIC_LINE.match(line)
    if generic_match:
        timestamp, _level, message = generic_match.groups()
        return {
            'timestamp': timestamp,
            'message_id': "Unknown",
            'readable_message': message.strip(),
        }

    return None


def parse_log_file(file_path):
    """Parse a log file into a DataFrame of recognised entries.

    Two line formats are recognised (ASA-style and a generic
    ``timestamp, level message`` style); any other line is skipped.

    Args:
        file_path: Path of the text log file to read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``timestamp``,
        ``message_id`` and ``readable_message``, one row per recognised line
        in file order.

    Raises:
        ValueError: If no line in the file matches either format.
    """
    # Decode explicitly so the result does not depend on the platform default;
    # undecodable bytes are replaced instead of aborting the whole parse.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        logs = [
            entry
            for entry in (_parse_log_line(line) for line in file)
            if entry is not None
        ]

    if not logs:
        raise ValueError("No valid log entries found. Please check the log file format.")

    return pd.DataFrame(logs)

def process_csv(file_path, tokenizer):
    """Parse the log file and add a ``tokens`` column of encoded messages.

    Args:
        file_path: Path of the log file, forwarded to ``parse_log_file``.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...,
            truncation=...)``; presumably the Hugging Face tokenizer returned
            by ``load_model``.

    Returns:
        The parsed DataFrame with an extra ``tokens`` column holding the
        tokenizer output for each ``readable_message``.

    Raises:
        KeyError: If a ``KeyError`` occurs while parsing or tokenizing.
        Exception: For any other failure. In both cases the original
            exception is chained as ``__cause__``.
    """
    def encode(text):
        # truncation=True caps each sequence at the tokenizer's configured
        # maximum length so over-long log lines cannot overflow the model's
        # position embeddings.
        return tokenizer.encode(text, add_special_tokens=True, truncation=True)

    try:
        df = parse_log_file(file_path)
        df['tokens'] = df['readable_message'].apply(encode)
    except KeyError as e:
        raise KeyError(f"Error processing CSV file: {e}") from e
    except Exception as e:
        raise Exception(f"An error occurred while processing the CSV file: {e}") from e

    return df

def infer(model, tokenizer, df, batch_size=32):
    """Classify every tokenized message in ``df`` and store the predictions.

    Sequences are padded per batch and passed to the model together with an
    attention mask, so padding positions do not influence the prediction for
    shorter messages.

    Args:
        model: Callable module accepting ``input_ids`` and ``attention_mask``
            keyword tensors and returning an object with a ``logits``
            attribute of shape ``(batch, num_classes)``.
        tokenizer: Used only for its ``pad_token_id``; when the attribute is
            missing or ``None`` the padding id falls back to ``0``.
        df: DataFrame with a ``tokens`` column of integer id sequences.
        batch_size: Number of messages per forward pass.

    Returns:
        The same ``df`` object, with a ``category`` column holding the argmax
        class index for each row (the input frame is modified in place).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    sequences = [
        torch.tensor(tokens, dtype=torch.long) for tokens in df['tokens'].tolist()
    ]
    pad_sequence = torch.nn.utils.rnn.pad_sequence

    preds = []
    with torch.no_grad():
        # Bounded batches keep memory flat regardless of how many lines the
        # log file contains.
        for start in range(0, len(sequences), batch_size):
            batch = sequences[start:start + batch_size]
            input_ids = pad_sequence(
                batch, batch_first=True, padding_value=pad_id
            ).to(device)
            # Build the mask from the true lengths (1 = real token, 0 = pad)
            # rather than from the pad id, so it stays correct even if a
            # real token shares that id.
            attention_mask = pad_sequence(
                [torch.ones_like(seq) for seq in batch],
                batch_first=True,
                padding_value=0,
            ).to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())

    df['category'] = preds
    return df

def human_readable_output(df):
    """Return the report columns with numeric categories turned into labels.

    A category equal to ``1`` becomes ``'Error'``; every other value becomes
    ``'Non-Error'``.

    Args:
        df: DataFrame containing the columns ``timestamp``, ``message_id``,
            ``readable_message`` and ``category``.

    Returns:
        A new DataFrame with exactly those four columns. The input frame is
        left untouched, so calling this function again on the same data gives
        the same result.

    Raises:
        KeyError: If any of the four columns is missing from ``df``.
    """
    # Work on a copy: relabelling the caller's column in place would make a
    # second call see strings instead of class indices and mark every row as
    # 'Non-Error'.
    report = df[['timestamp', 'message_id', 'readable_message', 'category']].copy()
    report['category'] = report['category'].apply(
        lambda label: 'Error' if label == 1 else 'Non-Error'
    )
    return report

def _run_pipeline(csv_file, model_path, output_csv_path):
    """Load the model, classify the log file and write the report CSV."""
    model, tokenizer = load_model(model_path)
    df = process_csv(csv_file, tokenizer)
    df = infer(model, tokenizer, df)
    readable_df = human_readable_output(df)

    # Save the structured output to a CSV file
    readable_df.to_csv(output_csv_path, index=False)
    print(f"Structured output saved to {output_csv_path}")


def main(csv_file=None, model_path=None, output_csv_path='structured_output.csv'):
    """Run the log categorisation pipeline.

    When both ``csv_file`` and ``model_path`` are given they are used
    directly; otherwise both values are read from the command line
    (positional arguments ``csv_file`` and ``model_path``).

    Args:
        csv_file: Path of the log file to classify, or ``None``.
        model_path: Path of the pre-trained model, or ``None``.
        output_csv_path: Where the structured report is written.

    Any exception raised by the pipeline is caught and reported with a
    printed message; nothing is re-raised.
    """
    if not (csv_file and model_path):
        parser = argparse.ArgumentParser(description="Log Categorization Script")
        parser.add_argument('csv_file', type=str, help='Path to the CSV file containing logs')
        parser.add_argument('model_path', type=str, help='Path to the pre-trained model')
        args = parser.parse_args()
        csv_file, model_path = args.csv_file, args.model_path

    # Top-level boundary: report the failure instead of surfacing a traceback.
    try:
        _run_pipeline(csv_file, model_path, output_csv_path)
    except Exception as e:
        print(f"An error occurred: {e}")

# Default inputs used when the script is executed from a notebook kernel.
csv_file_path = 'cisco_log.txt'  # Change this to point at the log file to classify
model_path = '/Users/nithinrajulapati/Downloads/LLM for Logging/trained_model_from_SCRATCH'

if __name__ == "__main__":
    if 'ipykernel_launcher' not in sys.argv[0]:
        # Started from the command line: main() reads its paths from argv.
        main()
    else:
        # Started by a Jupyter kernel: argv belongs to the kernel, so use the
        # defaults defined above.
        main(csv_file=csv_file_path, model_path=model_path)


# Output: Structured output saved to structured_output.csv
