### Overview  

This notebook demonstrates the process of extracting event count vectors from the log file.  
These vectors serve as the input data for the **count vector clustering model**.  


In [1]:
import pandas as pd
import re
from collections import defaultdict

## Structure of the parsed data from the raw log file.

In [6]:
df  = pd.read_csv('C:/Users/naren/Industry Project/Demo/HDFS_100k.log_structured.csv')
df.head()

Unnamed: 0,LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate
0,1,81109,203518,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,E5,Receiving block <*> src: /<*> dest: /<*>
1,2,81109,203518,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.allocateBlock: /mnt/hadoop/m...,E22,BLOCK* NameSystem.allocateBlock:<*>
2,3,81109,203519,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,E5,Receiving block <*> src: /<*> dest: /<*>
3,4,81109,203519,145,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,E5,Receiving block <*> src: /<*> dest: /<*>
4,5,81109,203519,145,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_-1608999687919...,E11,PacketResponder <*> for block <*> terminating


## Further Parsing

We aim to extract the sequence of events occurring in each block. 

- First, we extract the block IDs from the **Content** section.
- Then, we retrieve the corresponding **Event IDs** for each block ID.
- Once we obtain the sequence of events for each block, we create **event count vectors**.  
  - Each vector index corresponds to a specific event type.  
  - The value at each index represents the number of times that event type appears in the sequence.


##  Create a new DataFrame with Block_ID and cleaned EventId

In [7]:
# Regular expression to extract block IDs
block_pattern = re.compile(r"blk_[-]?\d+")

# Extract Block_ID from Content using regex and get the first match (if any)
df['Block_ID'] = df['Content'].apply(lambda x: block_pattern.findall(x)[0] if block_pattern.findall(x) else None)

# Remove 'E' from EventId
df['EventId'] = df['EventId'].str.replace('E', '', regex=False)

# Create a new DataFrame with Block_ID and cleaned EventId
result_df = df[['Block_ID', 'EventId']]

# Display the result
result_df


Unnamed: 0,Block_ID,EventId
0,blk_-1608999687919862906,5
1,blk_-1608999687919862906,22
2,blk_-1608999687919862906,5
3,blk_-1608999687919862906,5
4,blk_-1608999687919862906,11
...,...,...
104810,blk_-8738709778586756237,2
104811,blk_6420476111425645508,2
104812,blk_-2382389751032389929,2
104813,blk_4856031730010032819,2


## Grouping by BlockId's and getting the sequence of events happening in each block.

In [8]:
# Dictionary to store EventId sequences for each block
block_event_mapping = defaultdict(list)

# Regular expression to extract block IDs
block_pattern = re.compile(r"blk_[-]?\d+")

# Process each row in the log data
for _, row in df.iterrows():
    content = row["Content"]
    event_id = row["EventId"]
    
    # Extract block IDs from content
    block_ids = block_pattern.findall(str(content))
    
    # Assign EventId to corresponding block(s)
    for block_id in block_ids:
        block_event_mapping[block_id].append(str(event_id).replace("E", ""))  # Remove "E" prefix for numerical format

# Format the output to match count vector clustering requirements
formatted_output = "\n".join(f"{block},{' '.join(events)}" for block, events in block_event_mapping.items())

# Split the string by newline to separate rows
rows = formatted_output.strip().split('\n')

# Split each row by the comma to create two columns
data = [row.split(',') for row in rows]

# Create DataFrame
df = pd.DataFrame(data, columns=["Block_ID", "Event_ID"])

df.head(10)


Unnamed: 0,Block_ID,Event_ID
0,blk_-1608999687919862906,5 22 5 5 11 11 9 9 11 9 26 26 26 6 5 16 6 5 18...
1,blk_7503483334202473044,5 5 22 5 11 9 11 9 11 9 26 26 26 3 2 2
2,blk_-3544583377289625738,5 22 5 5 11 9 11 9 11 9 3 26 26 26 3 3 3 3 3 3...
3,blk_-9073992586687739851,5 22 5 5 11 9 11 9 11 9 26 26 26 2 2 2
4,blk_7854771516489510256,5 5 22 5 11 9 11 9 11 9 26 26 26 2 2 2
5,blk_1717858812220360316,5 5 22 5 11 9 11 9 11 9 26 26 26 2 2 2
6,blk_-2519617320378473615,5 22 5 5 11 11 9 9 11 9 26 26 26 2 2
7,blk_7063315473424667801,5 5 5 22 11 9 11 9 26 26 11 9 26 2
8,blk_8586544123689943463,5 5 5 22 11 9 11 11 9 9 26 26 26 2 2 2
9,blk_2765344736980045501,5 5 22 5 11 9 11 9 26 11 9 26 26 2 2 2


In [9]:
df.shape

(7940, 2)

In [10]:
# Split the event sequences into individual event values
df['EventSequence'] = df['Event_ID'].apply(lambda x: tuple(x.split()))

# Count the occurrences of each event sequence
event_counts = df['EventSequence'].value_counts()

# Convert to a DataFrame for better readability
result_df = pd.DataFrame(event_counts).reset_index()
result_df.columns = ['EventSequence', 'Count']

# Display the result
result_df

Unnamed: 0,EventSequence,Count
0,"(22, 5, 5, 5, 26, 26, 26, 11, 9, 11, 9, 11, 9)",2242
1,"(22, 5, 5, 5, 26, 26, 11, 9, 11, 9, 11, 9, 26)",617
2,"(22, 5, 5, 5, 26, 26, 11, 9, 11, 9, 26, 11, 9)",426
3,"(5, 22, 5, 5, 11, 9, 11, 9, 11, 9, 26, 26, 26)",295
4,"(5, 5, 22, 5, 11, 9, 11, 9, 11, 9, 26, 26, 26)",233
...,...,...
504,"(5, 22, 5, 5, 9, 11, 9, 26, 11, 9, 26, 11, 26)",1
505,"(5, 5, 22, 5, 11, 11, 9, 9, 26, 11, 9, 26, 26)",1
506,"(5, 5, 22, 5, 11, 9, 11, 9, 11, 9, 26, 26, 26,...",1
507,"(5, 22, 5, 5, 26, 26, 11, 11, 9, 9, 11, 9, 26, 2)",1


In [11]:
from collections import Counter

# Create a new column to store event counts as tuples
def count_event_occurrences(event_sequence):
    # Count the occurrences of each event in the sequence
    event_count = Counter(event_sequence)
    # Return a sorted tuple list (event, count)
    return tuple(sorted(event_count.items(), key=lambda x: x[0]))

# Apply the function to the 'EventSequence' column to get the counts
result_df['EventCount'] = result_df['EventSequence'].apply(count_event_occurrences)

# Display the updated DataFrame with event counts
result_df[['EventCount']]

Unnamed: 0,EventCount
0,"((11, 3), (22, 1), (26, 3), (5, 3), (9, 3))"
1,"((11, 3), (22, 1), (26, 3), (5, 3), (9, 3))"
2,"((11, 3), (22, 1), (26, 3), (5, 3), (9, 3))"
3,"((11, 3), (22, 1), (26, 3), (5, 3), (9, 3))"
4,"((11, 3), (22, 1), (26, 3), (5, 3), (9, 3))"
...,...
504,"((11, 3), (22, 1), (26, 3), (5, 3), (9, 3))"
505,"((11, 3), (22, 1), (26, 3), (5, 3), (9, 3))"
506,"((11, 3), (2, 2), (22, 1), (26, 3), (3, 1), (5..."
507,"((11, 3), (2, 1), (22, 1), (26, 3), (5, 3), (9..."


## Creating Event Count Vectors

In [None]:
# Group by EventCount and count occurrences
event_count_df = result_df.groupby('EventCount').size().reset_index(name='Count')

# Rename columns for clarity
event_count_df.columns = ['UniqueEventCount', 'Count']

# Display the result
event_count_df

# Assuming your DataFrame is named `df`
df_sorted = event_count_df.sort_values(by='Count', ascending=False)

Unnamed: 0,UniqueEventCount,Count
13,"((11, 3), (22, 1), (26, 3), (5, 3), (9, 3))",279
7,"((11, 3), (2, 1), (22, 1), (26, 3), (5, 3), (9, 3))",118
9,"((11, 3), (2, 2), (22, 1), (26, 3), (5, 3), (9, 3))",42
10,"((11, 3), (2, 3), (22, 1), (26, 3), (5, 3), (9, 3))",33
4,"((11, 3), (16, 1), (18, 1), (21, 2), (22, 1), (25, 1), (26, 4), (5, 4), (6, 1), (9, 3))",18
3,"((11, 3), (13, 3), (22, 1), (26, 3), (5, 3), (9, 3))",6
15,"((22, 1), (5, 2), (7, 1))",2
14,"((22, 1), (5, 1))",2
0,"((10, 1), (11, 1), (14, 1), (22, 1), (5, 2), (7, 2))",1
11,"((11, 3), (22, 1), (26, 3), (27, 1), (5, 3), (9, 3))",1
