In [None]:
import pandas as pd
import re
import  numpy as np
from concurrent.futures import ThreadPoolExecutor
from sklearn.feature_extraction import FeatureHasher

In [None]:
def split_attributes(attr_str):
    """Split a raw attribute string on its top-level commas.

    A comma acts as a separator unless the text following it reaches a
    closing ``]`` before meeting any other bracket, i.e. unless the comma
    sits inside a ``[...]`` group. Leading and trailing whitespace is
    stripped from every resulting piece.
    """
    top_level_comma = r',(?![^\[\]]*\])'
    return [piece.strip() for piece in re.split(top_level_comma, attr_str)]

def process_attributes_parallel(attr_series):
    """Run ``split_attributes`` over every entry of ``attr_series`` via a thread pool.

    The work is handed to a ``ThreadPoolExecutor`` because doing it
    sequentially was found to be slow. Returns a list holding one result
    per input entry.
    """
    with ThreadPoolExecutor() as pool:
        split_rows = [parts for parts in pool.map(split_attributes, attr_series)]
    return split_rows


def split_name(input_str):
    """Return the text before the first '=' in ``input_str``.

    Returns None when the string contains no '=' at all.
    """
    name, separator, _ = input_str.partition('=')
    # An empty separator means '=' was never found
    return name if separator else None

def split_value(input_str):
    """Return the text after the first '=' in ``input_str``.

    Only the first '=' separates name from value, so any later '=' stays
    inside the returned value. Returns None when the string contains no '='.
    """
    _, separator, value = input_str.partition('=')
    # An empty separator means '=' was never found
    return value if separator else None

In [None]:
# Load the Flowmon event dataset; the file uses ';' as its field separator
df = pd.read_csv('FL-Event-Filtered.csv', sep=';')

In [None]:
# Replace every raw 'Attributes' string with the list of its split elements.
# This step can take quite a long time on the full dataset.
attribute_lists = process_attributes_parallel(df['Attributes'])
df['Attributes'] = attribute_lists

In [None]:
# Collect the name of every attribute element that carries a value, i.e. every
# element of the form 'name=value' (name left of the first '=', value right of it).
union_of_element_name_in_attribute = {
    split_name(element)
    for current_attributes in df['Attributes']
    for element in current_attributes
}

# Elements without '=' yield None as their name. Drop that placeholder if it is
# present; discard() is a no-op when every element had a name, whereas remove()
# would raise KeyError in that case.
union_of_element_name_in_attribute.discard(None)
print(union_of_element_name_in_attribute)

In [None]:
# Generate one new column per attribute name found in the union.

# Parse every row exactly once into a {name: value} mapping. Names are compared
# exactly (via split_name) rather than by substring, so an element such as
# 'TimeStart=...' can never be picked up for a shorter name like 'Time', and a
# name that merely appears inside another element's value is not matched either.
parsed_rows = []
for current_attribute in df['Attributes']:
    row_values = {}
    for current_element in current_attribute:
        current_name = split_name(current_element)
        if current_name is not None:
            # setdefault keeps the first occurrence when a name repeats within a row
            row_values.setdefault(current_name, split_value(current_element))
    parsed_rows.append(row_values)

# Build all columns in a single constructor call; rows lacking an attribute get 0.
# Sharing df's index keeps the later column-wise concat aligned row for row.
newDf = pd.DataFrame(
    {
        current_element_name: [row_values.get(current_element_name, 0) for row_values in parsed_rows]
        for current_element_name in union_of_element_name_in_attribute
    },
    index=df.index,
)

newDf.head(10)




In [None]:
# Place the expanded attribute columns beside the original columns, then
# discard the raw 'Attributes' column, which is no longer needed
merged_df = pd.concat([df, newDf], axis=1).drop(columns=['Attributes'])
merged_df.head()


In [None]:
# Work around duplicated column names by writing the frame to a temporary CSV
# file and reading it straight back in.
temp_csv_path = 'temp.csv'
merged_df.to_csv(temp_csv_path, index=False)
merged_df = pd.read_csv(temp_csv_path)


In [None]:
# Label BLACKLIST events as 1 and every other event as 0
merged_df['label'] = (merged_df['Type'] == 'BLACKLIST').astype('int64')
merged_df.head()

In [None]:
# Report the size of the frame after the attribute split
num_rows, num_cols = merged_df.shape

print(f"num of rows : {num_rows}", f"num of cols : {num_cols}", sep="\n")



In [None]:
# Delete the events whose source IP is on the blacklist but whose own type is
# not BLACKLIST.

# Collect the source IPs of all BLACKLIST events with a boolean mask instead of
# a row-by-row iterrows() loop, which is slow on large frames.
is_blacklist_event = merged_df['Type'] == 'BLACKLIST'
IPs_in_blacklist = set(merged_df.loc[is_blacklist_event, 'Event source'])

# condition1: the event comes from a blacklisted source IP
condition1 = merged_df['Event source'].isin(IPs_in_blacklist)

# condition2: the event itself is not a BLACKLIST event
condition2 = merged_df['Type'] != 'BLACKLIST'

combine_condition = condition1 & condition2

# Keep only the rows that do not satisfy both conditions
merged_df = merged_df[~combine_condition]

In [None]:
# Remove the features that are useless or must not be used for training
columns_to_drop = [
    'Event source',
    'Type',
    'Subtype',
    'Detail',
    'Detection time',
    'Targets',
    'Blacklists',
    'BlacklistNames',
    'TimeStart',
    'TimeEnd',
]
merged_df = merged_df.drop(columns=columns_to_drop)



In [96]:
# Before training, the non-numerical columns have to be turned into numbers.
# Find all non-numerical columns
non_numerical_columns = merged_df.select_dtypes(exclude=[np.number]).columns

# Initialize FeatureHasher: every value is hashed into 10 numeric features
hasher = FeatureHasher(input_type='string', n_features=10)

# Reset the index once, up front, so that the freshly built hashed frames
# (which carry a default 0..n-1 index) line up row for row in the concat below.
merged_df = merged_df.reset_index(drop=True)

# Hash each non-numerical column into its own block of '<column>_hashed_<i>' columns
hashed_frames = []
for column in non_numerical_columns:
    # Each value is cast to str and wrapped in a one-element list, so every row
    # is a single string token for the hasher.
    hashed_features = hasher.transform(merged_df[column].astype(str).apply(lambda x: [x])).toarray()
    hashed_frames.append(
        pd.DataFrame(hashed_features, columns=[f'{column}_hashed_{i}' for i in range(hashed_features.shape[1])])
    )

# Swap the original non-numerical columns for their hashed replacements with a
# single concat instead of re-copying the whole frame on every loop iteration.
merged_df = pd.concat([merged_df.drop(columns=non_numerical_columns)] + hashed_frames, axis=1)


In [98]:
# Write the preprocessed frame to disk, leaving out the index column
merged_df.to_csv('preprocessed_data.csv', index=False)

