## ZeekAIAnalyzer
This tool analyzes Zeek logs (conn.log, http.log, and dns.log) to look for anomalous spikes in traffic, potential beaconing, and other suspicious behavior. It computes summary statistics from the log and sends them to Azure OpenAI, which returns a written analysis of the behavior.

In [None]:
## Enhanced ZeekAIAnalyzer - Multi-Log Support
## Supports conn.log, http.log, and dns.log analysis
## TODO: Add Chain of Thought (CoT) reasoning to the prompts
## TODO: Add known-bad traffic as a reference, if possible

import pandas as pd
from openai import AzureOpenAI
import os
import json

def detect_log_type(df):
    """Classify a Zeek log DataFrame by the column names it contains.

    Each known log type has a set of signature columns; the first type whose
    signature is fully present in ``df.columns`` wins. Types are tried in the
    order conn, http, dns.

    Returns:
        'conn', 'http', or 'dns' on a match, otherwise 'unknown'.
    """
    present = set(df.columns)

    # Ordered (label, signature) pairs; order defines match priority.
    signatures = (
        ('conn', {'id.orig_h', 'id.resp_h', 'duration',
                  'orig_bytes', 'resp_bytes', 'conn_state'}),
        ('http', {'method', 'host', 'uri',
                  'user_agent', 'status_code', 'status_msg'}),
        ('dns', {'query', 'qclass', 'qtype',
                 'rcode', 'answers', 'TTLs'}),
    )

    for label, required in signatures:
        if required <= present:
            return label

    return 'unknown'

def analyze_conn_log(df):
    """Build the AI analysis prompt for a Zeek conn.log DataFrame.

    Summarizes the connection records (record count, unique and most frequent
    endpoints, destination ports, connection states, and ``describe()``
    statistics for duration and byte counts) and embeds the summary in a
    prompt string.

    Every column is optional: a missing column contributes 0 (unique counts)
    or an empty dict (distributions and statistics) instead of raising
    KeyError, so the function can be called on partial conn data.

    Args:
        df: pandas DataFrame with one row per conn.log record.

    Returns:
        str: The prompt text describing the summary statistics.
    """
    columns = df.columns

    def unique_count(column):
        # Number of distinct values, or 0 when the column is absent.
        return df[column].nunique() if column in columns else 0

    def frequencies(column, limit=None):
        # Value -> count mapping (most frequent first), optionally truncated.
        if column not in columns:
            return {}
        counts = df[column].value_counts()
        if limit is not None:
            counts = counts.head(limit)
        return counts.to_dict()

    def describe(column):
        # pandas describe() output as a plain dict, or {} when absent.
        return df[column].describe().to_dict() if column in columns else {}

    summary_stats = {
        "total_records": len(df),
        "unique_source_ips": unique_count('id.orig_h'),
        "unique_destination_ips": unique_count('id.resp_h'),
        "top_source_ips": frequencies('id.orig_h', 5),
        "top_destination_ips": frequencies('id.resp_h', 5),
        "top_ports": frequencies('id.resp_p', 10),
        "connection_states": frequencies('conn_state'),
        "duration_stats": describe('duration'),
        "orig_bytes_stats": describe('orig_bytes'),
        "resp_bytes_stats": describe('resp_bytes'),
    }

    prompt = f"""
You are a network traffic security analyst analyzing Zeek connection logs. Focus on identifying:
- Beaconing behavior (regular intervals, consistent data sizes)
- Anomalous traffic patterns
- Suspicious connection states
- Port scanning activities
- Data exfiltration patterns

Connection Log Analysis:
- Total records: {summary_stats['total_records']}
- Unique source IPs: {summary_stats['unique_source_ips']}
- Unique destination IPs: {summary_stats['unique_destination_ips']}
- Top source IPs by frequency: {summary_stats['top_source_ips']}
- Top destination IPs by frequency: {summary_stats['top_destination_ips']}
- Top destination ports: {summary_stats['top_ports']}
- Connection states distribution: {summary_stats['connection_states']}
- Duration statistics: {summary_stats['duration_stats']}
- Originator bytes statistics: {summary_stats['orig_bytes_stats']}
- Responder bytes statistics: {summary_stats['resp_bytes_stats']}

Please provide a detailed analysis focusing on potential security threats, beaconing indicators, and anomalous behaviors.
"""
    return prompt

def analyze_http_log(df):
    """Build the AI analysis prompt for a Zeek http.log DataFrame.

    Collects counts and value distributions for the HTTP-related columns that
    are present in ``df`` and formats them into a prompt string. A column that
    is absent contributes 0 (for unique counts) or an empty dict (for
    distributions).

    Returns:
        str: The prompt text describing the summary statistics.
    """
    available = df.columns

    def distinct(column):
        # Number of distinct values, or 0 when the column is absent.
        if column in available:
            return df[column].nunique()
        return 0

    def tally(column, limit=None):
        # Value -> count mapping (most frequent first), optionally truncated.
        if column not in available:
            return {}
        counts = df[column].value_counts()
        if limit is not None:
            counts = counts.head(limit)
        return counts.to_dict()

    # MIME types are exploded first so list-valued cells are counted per item.
    if 'resp_mime_types' in available:
        mime_types = df['resp_mime_types'].explode().value_counts().to_dict()
    else:
        mime_types = {}

    stats = {
        "log_type": "HTTP Log (http.log)",
        "total_records": len(df),
        "unique_source_ips": distinct('id.orig_h'),
        "unique_destination_ips": distinct('id.resp_h'),
        "unique_hosts": distinct('host'),
        "http_methods": tally('method'),
        "status_codes": tally('status_code'),
        "status_messages": tally('status_msg'),
        "top_hosts": tally('host', 10),
        "top_user_agents": tally('user_agent', 5),
        "top_uris": tally('uri', 10),
        "top_referrers": tally('referrer', 5),
        "response_mime_types": mime_types,
        "top_destination_ports": tally('id.resp_p', 5),
    }

    return f"""
You are a web traffic security analyst analyzing Zeek HTTP logs. Focus on identifying:
- Malicious web requests and responses
- Command and control (C2) communication
- Data exfiltration via HTTP
- Suspicious user agents
- Anomalous HTTP methods or status codes
- Potential web attacks (SQL injection, XSS, etc.)

HTTP Log Analysis:
- Total records: {stats['total_records']}
- Unique source IPs: {stats['unique_source_ips']}
- Unique destination IPs: {stats['unique_destination_ips']}
- Unique hosts contacted: {stats['unique_hosts']}
- HTTP methods distribution: {stats['http_methods']}
- Status codes distribution: {stats['status_codes']}
- Status messages distribution: {stats['status_messages']}
- Top contacted hosts: {stats['top_hosts']}
- Top user agents: {stats['top_user_agents']}
- Top URIs accessed: {stats['top_uris']}
- Top referrers: {stats['top_referrers']}
- Response MIME types: {stats['response_mime_types']}
- Top destination ports: {stats['top_destination_ports']}

Please provide a detailed analysis focusing on potential web-based threats, C2 communications, and suspicious HTTP behaviors.
"""

def analyze_dns_log(df):
    """Build the AI analysis prompt for a Zeek dns.log DataFrame.

    Collects counts, value distributions, and TTL statistics for the DNS
    columns that are present in ``df`` and formats them into a prompt string.
    A column that is absent contributes 0 (for unique counts) or an empty
    dict (for distributions and statistics).

    Args:
        df: pandas DataFrame with one row per dns.log record. ``TTLs`` may
            hold a list of TTL values per row.

    Returns:
        str: The prompt text describing the summary statistics.
    """
    columns = df.columns

    def unique_count(column):
        # Number of distinct values, or 0 when the column is absent.
        return df[column].nunique() if column in columns else 0

    def frequencies(column, limit=None):
        # Value -> count mapping (most frequent first), optionally truncated.
        if column not in columns:
            return {}
        counts = df[column].value_counts()
        if limit is not None:
            counts = counts.head(limit)
        return counts.to_dict()

    if 'query' in columns:
        # Drop missing queries and force str so the .str accessor is valid
        # even when the column holds no strings at all.
        queries = df['query'].dropna().astype(str)
        # Keeps only the last two labels of each name. This is an
        # approximation of the registered domain: multi-label public
        # suffixes such as "co.uk" are not handled.
        top_domains = (
            queries.str.extract(r'([^.]+\.[^.]+)$')[0]
            .value_counts()
            .head(10)
            .to_dict()
        )
    else:
        top_domains = {}

    if 'TTLs' in columns:
        # explode() returns an object-dtype Series, for which describe()
        # reports count/unique/top/freq. Convert to numeric first so the
        # summary carries mean/std/min/quartiles/max.
        ttl_values = pd.to_numeric(df['TTLs'].explode(), errors='coerce')
        ttl_stats = ttl_values.describe().to_dict()
    else:
        ttl_stats = {}

    summary_stats = {
        "total_records": len(df),
        "unique_source_ips": unique_count('id.orig_h'),
        "unique_destination_ips": unique_count('id.resp_h'),
        "unique_queries": unique_count('query'),
        "query_classes": frequencies('qclass'),
        "query_types": frequencies('qtype'),
        "response_codes": frequencies('rcode'),
        "top_queries": frequencies('query', 10),
        "top_domains": top_domains,
        "ttl_stats": ttl_stats,
        "authoritative_responses": frequencies('AA'),
        "truncated_responses": frequencies('TC'),
        "recursion_desired": frequencies('RD'),
        "recursion_available": frequencies('RA'),
        "rejected_queries": frequencies('rejected'),
        "unique_transaction_ids": unique_count('trans_id'),
    }

    prompt = f"""
You are a DNS traffic security analyst analyzing Zeek DNS logs. Focus on identifying:
- DNS tunneling activities
- Domain Generation Algorithm (DGA) domains
- DNS beaconing patterns
- Suspicious domain queries
- DNS over HTTPS (DoH) or DNS over TLS (DoT) usage
- Fast flux domains
- Typosquatting attempts

DNS Log Analysis:
- Total records: {summary_stats['total_records']}
- Unique source IPs: {summary_stats['unique_source_ips']}
- Unique destination IPs (DNS servers): {summary_stats['unique_destination_ips']}
- Unique DNS queries: {summary_stats['unique_queries']}
- Unique transaction IDs: {summary_stats['unique_transaction_ids']}
- Query classes distribution: {summary_stats['query_classes']}
- Query types distribution (numeric): {summary_stats['query_types']}
- Response codes distribution (numeric): {summary_stats['response_codes']}
- Top DNS queries: {summary_stats['top_queries']}
- Top domains queried: {summary_stats['top_domains']}
- TTL statistics: {summary_stats['ttl_stats']}
- Authoritative responses (AA flag): {summary_stats['authoritative_responses']}
- Truncated responses (TC flag): {summary_stats['truncated_responses']}
- Recursion desired (RD flag): {summary_stats['recursion_desired']}
- Recursion available (RA flag): {summary_stats['recursion_available']}
- Rejected queries: {summary_stats['rejected_queries']}

Note: Query types and response codes are in numeric format (qtype: 1=A, 2=NS, 5=CNAME, etc.; rcode: 0=NOERROR, 1=FORMERR, 2=SERVFAIL, 3=NXDOMAIN, etc.)

Please provide a detailed analysis focusing on potential DNS-based threats, tunneling activities, and suspicious domain behaviors.
"""
    return prompt

# Main execution
#
# Flow: ask for a log path, load it as line-delimited JSON, build a
# log-type-specific prompt, send it to Azure OpenAI, and print the reply.
# Loading/summarizing and the API call are handled in separate try blocks so
# that an API or credential failure is not reported as a bad log file.

# Step 1: Configuration - specify your log file here
log_file = input("Enter the path to your Zeek log file (conn.log, http.log, or dns.log): ").strip()

if not os.path.exists(log_file):
    print(f"Error: File '{log_file}' not found. Please check the file path.")
else:
    # Stays None unless loading and summarizing both succeed.
    analysis_prompt = None

    # Step 2: Load the Zeek log JSON file
    try:
        df = pd.read_json(log_file, lines=True)
        print(f"Successfully loaded {len(df)} records from {log_file}")

        # Step 3: Detect log type and analyze accordingly
        log_type = detect_log_type(df)
        print(f"Detected log type: {log_type}")

        if log_type == 'conn':
            analysis_prompt = analyze_conn_log(df)
        elif log_type == 'http':
            analysis_prompt = analyze_http_log(df)
        elif log_type == 'dns':
            analysis_prompt = analyze_dns_log(df)
        else:
            print(f"Warning: Unknown log type. Available columns: {list(df.columns)}")
            analysis_prompt = f"""
            Unknown Zeek log type with {len(df)} records.
            Available columns: {list(df.columns)}
            Please analyze this data and identify any potential security concerns.
            """
    except Exception as e:
        # Top-level boundary for anything wrong with the file or its contents.
        analysis_prompt = None
        print(f"Error processing log file: {str(e)}")
        print("Make sure the file is a valid Zeek JSON log file.")

    if analysis_prompt is not None:
        try:
            # Step 4: Initialize the AzureOpenAI client
            # Settings are read from environment variables so secrets do not
            # have to be saved in the notebook; the fallbacks are the original
            # placeholder values.
            client = AzureOpenAI(
                api_key=os.environ.get("AZURE_OPENAI_API_KEY", ""),
                api_version=os.environ.get("OPENAI_API_VERSION", ""),
                azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", "")
            )

            # Step 5: Request analysis from Azure OpenAI
            print("\nAnalyzing log data with AI...")
            response = client.chat.completions.create(
                model=os.environ.get("AZURE_OPENAI_DEPLOYMENT", "HogwartsDeployment"),
                messages=[
                    {"role": "system", "content": "You are an expert cybersecurity analyst specializing in network traffic analysis and threat detection."},
                    {"role": "user", "content": analysis_prompt}
                ],
                temperature=0.3,
                max_tokens=2000
            )
        except Exception as e:
            # Top-level boundary for client configuration and API failures.
            print(f"Error requesting analysis from Azure OpenAI: {str(e)}")
            print("Check the Azure OpenAI endpoint, API version, API key, and deployment name.")
        else:
            # Step 6: Print the analysis
            print("\n" + "="*80)
            print("ZEEK LOG ANALYSIS REPORT")
            print("="*80)
            print(response.choices[0].message.content)
            print("="*80)