#### Imports

In [None]:
import pandas as pd
from collections import defaultdict
import re

#### Data

In [None]:
# Column layouts of the two pipe-separated input files; both are read with
# header=None, so these names are assigned rather than read from the files.
int_column_names = ['interval_name', 'interval_begin', 'interval_end', 'keys', 'values']
event_column_names = ['event_id', 'timestamp', 'keys', 'values']

intervals = pd.read_csv('lanl_intervals', sep='|', header=None, names=int_column_names)
events = pd.read_csv('lanl_10k.events', sep='|', header=None, names=event_column_names)

# Map each name found left of ":-" in the spec file to the list of rule
# bodies found right of it (one entry per matching line).
spec_data = defaultdict(list)
with open('lanl.nfer', 'r') as file:
    lines = file.readlines()

for line in lines:
    line = line.strip()
    if ":-" in line:
        # partition() splits at the first ":-" only, so a line containing the
        # separator more than once no longer fails tuple unpacking.
        interval_name, _separator, values = (
            piece.strip() for piece in line.partition(":-")
        )
        spec_data[interval_name].append(values)

# Distinct interval names present in the interval file, as a plain list.
unique_intervals = intervals['interval_name'].unique().tolist()


#### Example interval instance

In [None]:
# Row position (not label) of the interval instance to inspect.
instance_id = 3
selected_interval = intervals.iloc[instance_id]

# Pull the three fields of that row that the later cells work with.
interval_name, interval_begin, interval_end = (
    selected_interval[column]
    for column in ('interval_name', 'interval_begin', 'interval_end')
)

#### Retrieve associated rule

In [None]:
# Operator words on which a rule body is split
keywords = ['during', 'before']

# Only the first rule recorded for the selected interval name is used
rules_for_interval = spec_data[interval_name]
rule = rules_for_interval[0]

def split_rule_by_keywords(rule, keywords):
    """Split a rule body into operands and keyword operators.

    The rule is cut at every occurrence of one of ``keywords`` (the keywords
    themselves are kept as separate items, surrounding whitespace is
    removed). Pieces that belong to the same parenthesised group are then
    re-joined with single spaces so that a group is returned as one item.

    Args:
        rule: Rule text, e.g. ``"a before (b during c)"``.
        keywords: Iterable of operator words to split on.

    Returns:
        List of strings in their original order, e.g.
        ``["a", "before", "(b during c)"]``. A group whose parentheses are
        never closed is returned as-is instead of being dropped.
    """
    keywords = [keyword for keyword in keywords if keyword]
    if not keywords:
        # An empty alternation would match the empty string at every
        # position and shred the rule into single characters.
        return [rule] if rule else []

    alternation = '|'.join(re.escape(keyword) for keyword in keywords)
    # The lookarounds stop a keyword from matching inside a longer word
    # (e.g. "during" inside "during_boot"); the capturing group makes
    # re.split keep the matched keyword in its output.
    pattern = rf'\s*((?<!\w)(?:{alternation})(?!\w))\s*'
    parts = [part for part in re.split(pattern, rule) if part]

    result = []
    group = []   # pieces of the parenthesised group currently being collected
    depth = 0    # number of "(" seen so far that are still unmatched

    for part in parts:
        depth += part.count('(') - part.count(')')
        group.append(part)
        if depth <= 0:
            # Parentheses are balanced again (or were never opened):
            # emit what has been collected as a single item.
            result.append(' '.join(group))
            group = []
            depth = 0  # a stray ")" must not offset later groups

    if group:
        # Unclosed "(": keep the text rather than silently losing it.
        result.append(' '.join(group))

    return result

# Break the selected rule into its operands and the keyword operators between them
split_rules  = split_rule_by_keywords(rule, keywords)

#### Retrieve interval creation DS

In [None]:
# Check if an element is an event
def is_event(element):
    """Look for an event id of the form ``USREVENT_EVENT-<digits>`` in *element*.

    Returns a ``(found, event_id)`` pair: ``(True, <first matched text>)``
    when the pattern occurs anywhere in the string, ``(False, None)``
    otherwise.
    """
    found = re.search(r'USREVENT_EVENT-\d+', element)
    if found is None:
        return False, None
    return True, found.group(0)

# Check if an element is an interval
def is_interval(element):
    """Return whether *element* is one of the names in ``intervals['interval_name']``.

    The comparison is against the whole value, so an element that merely
    contains an interval name does not count.
    """
    known_names = intervals['interval_name'].unique()
    return element in known_names

# Retrieve the timestamp for an event
def get_event_timestamp(event_id):
    """Return the timestamp of the first ``events`` row whose id is *event_id*.

    Returns ``None`` when no row of the module-level ``events`` frame has
    that ``event_id``.
    """
    matching_rows = events.loc[events['event_id'] == event_id]
    if matching_rows.empty:
        return None
    first_row = matching_rows.iloc[0]
    return first_row['timestamp']

# Retrieve instances of an interval from the interval dataframe
def get_interval_instances(interval_name):
    """Return the rows of ``intervals`` whose ``interval_name`` equals *interval_name*.

    The result is a (possibly empty) DataFrame with the same columns as
    the module-level ``intervals`` frame.
    """
    name_matches = intervals['interval_name'] == interval_name
    return intervals[name_matches]

def validate_syntax(i1_id, i1_start, i1_end, i2, i2_id, i_start, i_end, syntax):
    """Find the instances of ``i2`` that, combined with ``i1``, yield the target interval.

    Args:
        i1_id: Identifier reported for the first operand.
        i1_start, i1_end: Begin and end of the first operand.
        i2: Second operand. Either a tuple describing a single event,
            ``(start, end)`` or ``(id, start, end)``, or a DataFrame with
            ``interval_begin`` / ``interval_end`` columns holding every
            candidate instance of an interval.
        i2_id: Identifier reported for the second operand. When it is
            ``None`` and ``i2`` is an ``(id, start, end)`` tuple, the id
            from the tuple is used.
        i_start, i_end: Begin and end of the interval being explained.
        syntax: Operator joining the operands. ``'before'`` and
            ``'during'`` are implemented; anything else matches nothing.

    Returns:
        A list of ``{'i1': {...}, 'i2': {...}}`` dicts (each inner dict has
        ``id``, ``start`` and ``end``), one per matching ``i2`` instance,
        or ``None`` when nothing matches.
    """
    if isinstance(i2, tuple):
        # Only the last two items are the time bounds; this accepts both the
        # (start, end) and the (id, start, end) layouts.
        if i2_id is None and len(i2) == 3:
            i2_id = i2[0]
        i2_start, i2_end = i2[-2:]
        candidates = [(i2_start, i2_end)]
    else:
        candidates = zip(i2['interval_begin'], i2['interval_end'])

    valid_instances = []
    for i2_start, i2_end in candidates:
        if syntax == 'before':
            # i1 ends strictly before i2 starts, and the target spans from
            # the start of i1 to the end of i2.
            holds = i1_end < i2_start and i_start == i1_start and i_end == i2_end
        elif syntax == 'during':
            # i1 lies within i2, and the target has exactly i2's bounds.
            holds = (i1_start >= i2_start and i1_end <= i2_end and
                     i_start == i2_start and i_end == i2_end)
        else:
            # Add other syntaxes here as needed
            holds = False

        if holds:
            valid_instances.append({
                'i1': {'id': i1_id, 'start': i1_start, 'end': i1_end},
                'i2': {'id': i2_id, 'start': i2_start, 'end': i2_end},
            })

    return valid_instances if valid_instances else None

# Process the rule with dynamic assignment of i1 (event) and i2 (event or interval set)
def process_general_rule(final_rule, interval_begin=None, interval_end=None):
    """Identify the event/interval instances that account for a target interval.

    Args:
        final_rule: List of rule elements (operands and an operator), such
            as the output of ``split_rule_by_keywords``.
        interval_begin: Begin of the interval being explained.
        interval_end: End of the interval being explained.

    Returns:
        * For a one-element rule naming an event: a dict
          ``{'i1': {'id', 'start', 'end'}}`` for that event.
        * Otherwise the list returned by ``validate_syntax`` for the first
          complete (first operand, second operand, operator) combination.
        * ``None`` when an event has no timestamp, an interval has no
          instances, validation fails, or the rule never becomes complete.
    """
    i1, i2, i2_id, syntax = None, None, None, None

    if len(final_rule) == 1:
        # Case 1: Single event (e.g., ['e:USREVENT_EVENT-11 map { pid -> e.pid }'])
        is_event_result, event_id = is_event(final_rule[0])
        if is_event_result:
            event_timestamp = get_event_timestamp(event_id)
            if event_timestamp is None:
                return None  # Invalid event
            # Events have the same start and end
            return {'i1': {'id': event_id, 'start': event_timestamp, 'end': event_timestamp}}

    for element in final_rule:
        matched, event_id = is_event(element)
        if matched:
            # Look the event up by the id extracted from the element, not by
            # the whole element text, which may carry a label or a map clause.
            event_timestamp = get_event_timestamp(event_id)
            if event_timestamp is None:
                return None  # Invalid event
            if i1 is None:
                i1 = (event_id, event_timestamp, event_timestamp)
            else:
                # validate_syntax takes an event as a (start, end) tuple; the
                # id travels separately.
                i2 = (event_timestamp, event_timestamp)
                i2_id = event_id
        elif is_interval(element):
            i2 = get_interval_instances(element)
            if i2.empty:
                return None  # No valid interval instances found
            i2_id = element  # The interval is identified by its name
        else:
            # Anything that is neither an event nor an interval is taken to
            # be the operator.
            syntax = element

        # Case 2: two events, or an event and an interval, joined by an operator.
        # The first complete combination decides the result.
        if i1 is not None and i2 is not None and syntax:
            i1_id, i1_start, i1_end = i1
            valid_instances = validate_syntax(
                i1_id, i1_start, i1_end, i2, i2_id,
                interval_begin, interval_end, syntax,
            )
            return valid_instances or None

    return None

# Summarise the selected interval and its rule, then resolve the rule
# against the loaded events and intervals.
header_template = "INTERVAL NAME:{},INTERVAL BEGIN:{},INTERVAL END:{}"
rule_template = "ASSOCIATED RULE:{}"

print(header_template.format(interval_name, interval_begin, interval_end))
print(rule_template.format(split_rules))
print("EVENT AND INTERVAL RESPONSIBLE:")
valid_event_data = process_general_rule(
    split_rules,
    interval_begin,
    interval_end,
)