In [None]:
# Core
import time
import re
import numpy as np
import pandas as pd
import torch

# Transformers / Unsloth
from transformers import AutoTokenizer, AutoModelForCausalLM
from unsloth import FastLanguageModel

# LangChain pieces you actually use
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Metrics
from sklearn.metrics.pairwise import cosine_similarity


**Step 1: Preprocessing**

In [3]:
# Load the context corpus (later chunked and indexed) and the evaluation logs

TestDoc = pd.read_csv("Context.csv")    # context dataset
TestLogs = pd.read_csv("TestLogs.csv")  # test dataset

# Only these columns are consumed by the inference loop further down
EVAL_COLUMNS = ['Content', 'EventTemplate', 'Source', 'Category']
TestLogs_Cols = TestLogs[EVAL_COLUMNS]


def _context_row_to_document(row):
    """Wrap one context row as a Document pairing the raw log with its template."""
    page_text = f"Log: {row['Content']}\nTemplate: {row['EventTemplate']}"
    return Document(page_content=page_text)


documents = [_context_row_to_document(row) for _, row in TestDoc.iterrows()]


In [4]:
# Load Rule Dataset

rules_df = pd.read_csv("rules.csv")


def _rule_entry(row):
    """Collect the descriptive fields of one rule row into a plain dict."""
    return {
        'rule': row['Rule'],
        'example': row['Example'],
        'template': row['Template'],
    }


# Keyed by the RuleNumber column; if a RuleNumber repeats, the last row wins.
# NOTE(review): rule_based_retrieval indexes this dict with names such as
# 'rule1' .. 'rule10', so RuleNumber presumably holds those exact strings -- confirm.
rules_dict = {
    row['RuleNumber']: _rule_entry(row)
    for _, row in rules_df.iterrows()
}

In [5]:
# Chunking

# Chunk length and the overlap shared by neighbouring chunks; lengths are
# measured by the splitter's tiktoken encoder.
CHUNK_SIZE = 300
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split every context Document into retrieval-sized chunks
splits = text_splitter.split_documents(documents)

In [None]:
# Load Embedding Model

EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Embed the context chunks and index them in a Chroma vector store
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

**Step 2: Retrieval**

In [7]:
# Similarity-based Retrieval

def similarity_retrieval(query, threshold=0.7, k=3):
    """Retrieve context chunks whose cosine similarity to the query passes a threshold.

    The query is embedded, the ``k`` nearest chunks are fetched from the
    module-level ``vectorstore``, and each candidate is re-embedded so its
    cosine similarity to the query can be checked against ``threshold``.

    Args:
        query: Log message to search for.
        threshold: Minimum cosine similarity a chunk needs in order to be kept.
        k: Number of nearest neighbours requested from the vector store.

    Returns:
        A 3-tuple ``(context, score, found)``: the kept chunk texts joined by
        newlines ("" when none pass), the mean similarity of the kept chunks
        (0.0 when none pass), and whether at least one chunk was kept.
    """
    query_vec = embeddings.embed_query(query)
    candidates = vectorstore.similarity_search_by_vector(query_vec, k=k)

    kept = []  # (chunk text, cosine similarity) pairs, in retrieval order
    for candidate in candidates:
        candidate_vec = embeddings.embed_query(candidate.page_content)
        cosine = cosine_similarity([query_vec], [candidate_vec])[0][0]
        # Written as a negated ">=" so a NaN score is rejected
        if not (cosine >= threshold):
            continue
        kept.append((candidate.page_content, cosine))

    if not kept:
        return "", 0.0, False

    kept_texts = [text for text, _ in kept]
    kept_scores = [score for _, score in kept]
    return "\n".join(kept_texts), float(np.mean(kept_scores)), True

In [17]:
# Rule-matching

def match_with_priority(text):
    """Match ``text`` against the ten rule regexes in priority order.

    Rules are tried from ``rule1`` to ``rule10``. A match is recorded only when
    its character span does not overlap a span already claimed (by an earlier
    rule, or by an earlier match of the same rule), so each part of the text is
    attributed to at most one rule.

    Args:
        text: The log message to scan.

    Returns:
        A 2-tuple ``(final_output, patterns)``. ``final_output`` maps every
        rule name to ``{'match': 'yes' | 'no', 'patterns': [matched strings]}``;
        ``patterns`` maps every rule name to its ``(regex, weight)`` pair.
    """
    # Priority-ordered (regex, weight) pairs; dict order is the matching order
    patterns = {
        'rule1': (r'/(?:\d{1,3}\.){3}\d{1,3}:\d+\b', 0.9),  # IP with Port
        'rule2': (r'(/[^\s]+(?:\.[^\s]+)?)', 0.7),         # File Path
        'rule3': (r'\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b', 0.9),  # MAC address
        'rule4': (r'\b(?:\d{1,3}\.){3}\d{1,3}\b', 0.8),   # IP address
        'rule5': (r'\b(?:\d{1,3}\.){3}\d{1,3}:\d+\b|\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}:\d+\b', 0.8),  #Domain:Port
        'rule6': (r'\b0x[0-9A-Fa-f]+\b', 0.6),            # Hexadecimal Token
        'rule7': (r'\b[\w]+(:[\w]+){2,}\b', 0.6),         # Timestamp
        'rule8': (r'(?<![\d:.])\b\d+\b(?![:.])', 0.5),    # Numeric Identifier
        'rule9': (r'\b(?=[\w.-]*[A-Za-z])(?=[\w.-]*\d)[\w.-]+\b', 0.5),  # Alphanumeric Token
        'rule10': (r'\b[A-Za-z ]+\b', 0.3),               # Static Message
    }

    claimed_spans = []                      # half-open (start, end) spans already taken
    hits = {name: [] for name in patterns}  # matched substrings per rule

    for name, (regex, _weight) in patterns.items():
        for found in re.finditer(regex, text):
            lo, hi = found.span()
            # Two half-open spans intersect iff each starts before the other ends
            if any(lo < taken_hi and hi > taken_lo for taken_lo, taken_hi in claimed_spans):
                continue
            claimed_spans.append((lo, hi))
            hits[name].append(found.group())

    final_output = {
        name: {'match': 'yes' if found_strings else 'no', 'patterns': found_strings}
        for name, found_strings in hits.items()
    }
    return final_output, patterns




In [20]:
# Rule-based Retrieval
def rule_based_retrieval(query):
    """Build few-shot context from the rules whose regex fires on ``query``.

    Args:
        query: Log message to analyse.

    Returns:
        A 3-tuple ``(context, score, matched)``: the example/template snippets
        of at most the first three matching rules (in rule order) joined by
        blank lines, the mean weight over *all* matching rules (0.0 when none
        match), and whether any rule matched.

    Raises:
        KeyError: if a matching rule name is absent from ``rules_dict``.
    """
    rule_matches, patterns = match_with_priority(query)
    fired = [name for name, info in rule_matches.items() if info['match'] == 'yes']

    snippets = []
    weight_total = 0.0
    for name in fired:
        entry = rules_dict[name]
        snippets.append(f"Example: {entry['example']}\nTemplate: {entry['template']}")
        weight_total += patterns[name][1]

    fired_count = len(fired)
    # Only the first three snippets are kept; the score still averages every fired rule
    rule_context_str = "\n\n".join(snippets[:3])
    rule_score = weight_total / fired_count if fired_count else 0.0

    return rule_context_str, rule_score, fired_count > 0

In [10]:
# Metric-based Retrieval

def retrieve_with_threshold(query, rule_weight=0.5, similarity_weight=0.5):
    """Choose between rule-based and similarity-based context for ``query``.

    Both retrievers are run and their scores are scaled by the given weights.
    The rule context is selected when its weighted score is strictly higher and
    a rule matched; otherwise the similarity context is selected when something
    was retrieved; otherwise both contexts are concatenated under headings.

    Args:
        query: Log message to retrieve context for.
        rule_weight: Multiplier applied to the rule-based score.
        similarity_weight: Multiplier applied to the similarity-based score.

    Returns:
        A 2-tuple ``(selected_context, info)`` where ``info`` holds the raw
        (unweighted) ``rule_score`` and ``similarity_score`` plus the
        ``context_source`` label ("Rule-Based", "Similarity-Based" or "Combined").
    """
    rule_context, rule_score, rules_matched = rule_based_retrieval(query)
    similarity_context, similarity_score, similarity_found = similarity_retrieval(query)

    weighted_rule = rule_weight * rule_score
    weighted_similarity = similarity_weight * similarity_score

    def _package(context, source):
        # The scores reported to the caller are the raw ones, not the weighted ones
        return context, {
            "rule_score": rule_score,
            "similarity_score": similarity_score,
            "context_source": source,
        }

    if rules_matched and weighted_rule > weighted_similarity:
        return _package(rule_context, "Rule-Based")

    if similarity_found and weighted_similarity >= weighted_rule:
        return _package(similarity_context, "Similarity-Based")

    # Neither retriever qualified on its own: hand both contexts to the LLM
    combined_context = (
        f"### Rule-Based Context:\n{rule_context}\n\n"
        f"### Similarity-Based Context:\n{similarity_context}"
    )
    return _package(combined_context, "Combined")

**Step 3: Generation**

In [None]:
# Load model and tokenizer
MODEL_NAME = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"

# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): the checkpoint name suggests bitsandbytes 4-bit weights; whether
# `.to(device)` is accepted for such a model depends on the installed
# transformers / bitsandbytes versions -- confirm in the target environment.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16)
model = model.to(device)

# Switch the model into Unsloth's inference mode
FastLanguageModel.for_inference(model)

EOS_TOKEN = tokenizer.eos_token

In [22]:
# Inference

# Prompt template for template extraction; filled via
# prompt.format(selected_context=..., log=...).
# Built by implicit string concatenation so the instruction text reaches the
# model as one clean paragraph. The earlier triple-quoted version embedded
# stray double quotes and 12-space indents in the instruction.
prompt = (
    "You are a log parsing assistant. Your task is to extract the template of the given log message "
    "by replacing dynamic parts like timestamps, IDs, IP addresses, or numeric values with the '<*>' placeholder. "
    "Use the examples in the context to guide your output. "
    "If the log message is fully static and has no dynamic parts, return it as-is with no placeholders. "
    "Return ONLY the extracted template with no explanations, reasoning, or additional text.\n"
    "### Context:\n"
    "{selected_context}\n"
    "\n"
    "### Log Message to Parse:\n"
    "```\n"
    "{log}\n"
    "```\n"
    "\n"
    "### Response:\n"
)

def extract_log_templates(new_logs, retriever, tokenizer, model, device,
                          max_new_tokens=64, prompt_template=None):
    """Generate a log template for every row of ``new_logs`` with the LLM.

    For each row the retriever supplies context, the prompt template is filled
    in, the model generates a continuation, and the continuation is cleaned
    into a template string. Progress is printed for every log.

    Args:
        new_logs: Object exposing ``itertuples()`` whose rows carry ``Content``,
            ``Category`` and ``Source`` attributes (e.g. a pandas DataFrame).
        retriever: Callable ``log -> (context, scores)`` where ``scores`` has the
            keys ``context_source``, ``rule_score`` and ``similarity_score``.
        tokenizer: Tokenizer; called on a list of prompts, and must provide
            ``decode`` and (optionally) ``eos_token``.
        model: Model exposing ``generate(**inputs, ...)``.
        device: Device the tokenized inputs are moved to.
        max_new_tokens: Generation budget per log (default 64).
        prompt_template: ``str.format`` template with ``{selected_context}`` and
            ``{log}`` fields; defaults to the module-level ``prompt``.

    Returns:
        A list with one dict per log holding the keys ``Log``,
        ``Extracted Template``, ``Category``, ``Source``, ``Context_Source``,
        ``Rule_Score`` and ``Similarity_Score``.
    """
    # Resolve the module-level template only when the caller did not supply one
    template = prompt if prompt_template is None else prompt_template
    eos_token = getattr(tokenizer, "eos_token", None)
    template_prefix = "template:"

    results = []
    for idx, row in enumerate(new_logs.itertuples(), start=1):
        log = row.Content if isinstance(row.Content, str) else str(row.Content)
        category = row.Category
        source = row.Source

        # retriever returns TWO things
        selected_context, scores = retriever(log)

        # No EOS is appended here: an end-of-sequence marker belongs after a
        # training target. At inference it would ask the model to continue
        # *after* the sequence has already ended.
        formatted_prompt = template.format(
            selected_context=selected_context, log=log
        )

        print(f"\nProcessing Log {idx}...")
        print("=" * 100)
        print(f"**Log Message:** {log}")
        print(f"**Context Source:** {scores['context_source']}")
        print(f"**Scores:** Rule={scores['rule_score']:.2f}, Similarity={scores['similarity_score']:.2f}")
        print(f"**LLM Input Prompt:**\n{formatted_prompt}\n")

        inputs = tokenizer([formatted_prompt], return_tensors='pt').to(device)
        prompt_length = len(inputs["input_ids"][0])
        start_time = time.time()
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)
        elapsed_time = time.time() - start_time

        # Decode only the newly generated tokens. Decoding the echoed prompt and
        # splitting on the response marker breaks when the log or the retrieved
        # context itself contains "### Response:".
        new_tokens = outputs[0][prompt_length:]
        output_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
        # Still honour a marker the model may have re-emitted in its own output
        llm_response = output_text.split("### Response:")[-1].strip()

        extracted_template = llm_response.replace("`", "").strip()
        if eos_token and eos_token in extracted_template:
            extracted_template = extracted_template.split(eos_token)[0].strip()
        # Drop a leading "Template:" label regardless of its letter case
        if extracted_template[:len(template_prefix)].lower() == template_prefix:
            extracted_template = extracted_template[len(template_prefix):].strip()

        print(f"**LLM Response:**\n{llm_response}\n")
        print(f"**Extracted Template:** {extracted_template}")
        print(f"Inference Time: {elapsed_time:.2f} seconds")
        print("=" * 100)

        results.append({
            "Log": log,
            "Extracted Template": extracted_template,
            "Category": category,
            "Source": source,
            "Context_Source": scores['context_source'],
            "Rule_Score": scores['rule_score'],
            "Similarity_Score": scores['similarity_score'],
        })
    return results


In [None]:

# Run the Pipeline

def process_datasets(dataset, retriever, tokenizer, model, device,
                     output_path="MetRAG_Results.csv"):
    """Extract templates for ``dataset``, save them as CSV and return them.

    Args:
        dataset: Logs to parse; forwarded to ``extract_log_templates``.
        retriever: Context retriever; forwarded to ``extract_log_templates``.
        tokenizer: Tokenizer; forwarded to ``extract_log_templates``.
        model: Language model; forwarded to ``extract_log_templates``.
        device: Device for the tokenized inputs; forwarded likewise.
        output_path: Destination of the results CSV (default
            "MetRAG_Results.csv", the previously hard-coded location).

    Returns:
        The results as a pandas DataFrame, so the notebook can inspect them
        without re-reading the CSV.
    """
    results = extract_log_templates(dataset, retriever, tokenizer, model, device)
    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False)  # Save Results Dataset
    print("Extracted templates saved successfully.")
    return df

# Run the pipeline; the measured time covers retrieval, generation and the CSV write
start_time = time.time()  # Start timer
results_df = process_datasets(TestLogs_Cols, retrieve_with_threshold, tokenizer, model, device)
end_time = time.time()  # End timer
total_inference_time = end_time - start_time  # Inference Time
print(f"Total inference time: {total_inference_time:.2f} seconds")