<center><b><font size=6>Data Pre-Processing</font></b></center>

<center><font size=5>Preprocessing and Normalization</font></center>


The analysis was performed for different repetition IDs, each corresponding to a separate dataset. 
These datasets were acquired locally on the PC, with one dataset for each repetition ID.

The preprocessing steps involve:
1. Cleaning the data
2. Filtering the data
3. Identifying the relevant TCP ports
4. Grouping the data
5. Normalization

Finally, all preprocessed datasets corresponding to different repetition IDs are concatenated into a single DataFrame, 
which will be used for further analysis.


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
def process_tcp_stream_analysis(file_names):
    """
    Load, clean and aggregate the per-repetition TCP stream CSV files.

    Each file is processed independently with the following steps:
        Step 1: Merge tcp_srcport_mode and tcp_dstport_mode into a single
                'tcp_port' feature holding the port that is not 443/80.
                Rows where both ports are equal, or where both ports are
                443/80, are dropped.
        Step 2: Remove the source and destination ip address columns.
        Step 3: Add 'stream_count' (number of streams per <browser> <website>
                <query>) and remove the unused 'tcp.stream' and
                'unique_tcp_flags' columns.
        Step 4: Drop the samples with NaN or infinite values.
        Step 5: Reduce the size of the dataset by taking the mean value of the
                numeric features in each group
                <browser> <website> <query> <repetition_id>.

    Parameters:
    - file_names: A list of the csv file names in this format
      ["Merged_TCP_Stream_Analysis_<i>.csv"], for i being the repetition id
      of the collected dataset.

    Returns:
    - final_df: pandas.DataFrame, the per-group feature means of all files
      concatenated into one frame. The values are NOT standardized here.

    Raises:
    - ValueError: if file_names contains no files.
    """
    group_keys = ['browser', 'website', 'query']
    well_known_ports = [443, 80]

    def identify_tcp_port(frame):
        # Vectorised port selection: prefer the source port, fall back to the
        # destination port, and yield NaN when both are 443/80.
        src = frame['tcp_srcport_mode']
        dst = frame['tcp_dstport_mode']
        src_is_well_known = src.astype(int).isin(well_known_ports)
        dst_is_well_known = dst.astype(int).isin(well_known_ports)
        fallback = dst.where(~dst_is_well_known)
        return src.where(~src_is_well_known, fallback)

    # Aggregated result of every input file
    dfs = []

    for file_name in file_names:
        # Load the dataset and discard incomplete rows up front
        df = pd.read_csv(file_name).dropna().reset_index(drop=True)

        ## (Step 1) ##
        # Drop samples where src-port equals dst-port
        filtered_df = df[df['tcp_srcport_mode'] != df['tcp_dstport_mode']].copy()

        # Create the tcp_port column; NaN marks rows where both ports are 443 or 80
        filtered_df['tcp_port'] = identify_tcp_port(filtered_df)
        filtered_df = filtered_df.dropna(subset=['tcp_port'])

        # The two original port columns are now redundant
        filtered_df = filtered_df.drop(columns=['tcp_srcport_mode', 'tcp_dstport_mode'])

        ## (Step 2) ##
        # Remove the ip address columns when present
        filtered_df = filtered_df.drop(columns=['ip_src_mode', 'ip_dst_mode'], errors='ignore')

        ## (Step 3) ##
        # Count the number of streams for each combination of <browser>, <website>, and <query>
        stream_counts = filtered_df.groupby(group_keys).size().reset_index(name='stream_count')

        # Merge the counts back into the filtered DataFrame
        filtered_df = pd.merge(filtered_df, stream_counts, on=group_keys, how='left')

        # Remove the unused columns when present
        filtered_df = filtered_df.drop(columns=['tcp.stream', 'unique_tcp_flags'], errors='ignore')

        ## (Step 4) ##
        # Handle NaN and infinity values
        filtered_df = filtered_df.replace([np.inf, -np.inf], np.nan).dropna().reset_index(drop=True)

        ## (Step 5) ##
        # Group by 'browser', 'website', 'query', 'repetition_id' and average the features.
        # numeric_only=True keeps leftover text columns from raising a TypeError
        # on recent pandas versions; such columns are left out of the result.
        grouped_means = (
            filtered_df.groupby(group_keys + ['repetition_id'])
            .mean(numeric_only=True)
            .reset_index()
        )

        dfs.append(grouped_means)

    if not dfs:
        raise ValueError("file_names must contain at least one csv file name")

    # Concatenate all DataFrames in the list into a single DataFrame
    final_df = pd.concat(dfs, ignore_index=True)

    return final_df

#-------------------------------------------------------------------------------------------------------------------------------------------------

def standardize_dataframe(df, numerical_cols):
    """
    Standardizes the specified numerical columns in the given DataFrame after dropping rows with NaN or infinite values.

    Parameters:
    - df: pandas.DataFrame, the input DataFrame. It is not modified.
    - numerical_cols: list of str, column names to be standardized.

    Returns:
    - final_df: pandas.DataFrame, the remaining (non-numerical) columns followed by the
      standardized numerical columns. Rows keep the index labels they had in `df`;
      rows containing NaN or infinite values are absent.
    """
    # Drop rows with NaN or infinite values
    df_cleaned = df.replace([np.inf, -np.inf], np.nan).dropna()

    # Every column that is not standardized is carried over unchanged
    non_numerical_cols = df_cleaned.columns.difference(numerical_cols)

    # Standardize the numerical columns
    scaler = StandardScaler()
    standardized_data = scaler.fit_transform(df_cleaned[numerical_cols])

    # Reuse the cleaned frame's index: the scaler returns a bare array, and a
    # fresh 0..n-1 index would misalign rows in the concat below whenever rows
    # were dropped or `df` was a filtered subset.
    standardized_df = pd.DataFrame(
        standardized_data,
        columns=numerical_cols,
        index=df_cleaned.index,
    )

    # Add the non-numerical columns back to the DataFrame
    final_df = pd.concat([df_cleaned[non_numerical_cols], standardized_df], axis=1)

    return final_df


In [3]:
# Build the list of input files, one per repetition id, and aggregate them
file_names = [
    f"Merged_TCP_Stream_Analysis_{repetition_id}.csv"
    for repetition_id in (50, 60, 70, 80, 90, 100)
]
df = process_tcp_stream_analysis(file_names)

# Persist the aggregated (grouped means) dataset
df.to_csv('df.csv', index=False)


In [4]:
# Numerical columns to standardize: a mean/std pair for each per-packet
# feature, followed by the three stand-alone features
numerical_cols = [
    f'{feature}_{stat}'
    for feature in (
        'frame_len',
        'tcp_len',
        'tcp_window_size_value',
        'tcp_seq',
        'ip_ttl',
        'tcp_ack',
    )
    for stat in ('mean', 'std')
] + ['packet_count', 'tcp_port', 'stream_count']

# Non-numerical columns to keep
non_numerical_cols = [
    'browser',
    'website',
    'query',
    'repetition_id',
]


In [5]:
# Standardize the numerical features of the aggregated dataset.
# To standardize only part of the dataset, pass a subset instead, e.g.:
#   df_standard = standardize_dataframe(df.loc[df['repetition_id'] == 60], numerical_cols)
df_standard = standardize_dataframe(df, numerical_cols)

# Put the identifying columns first, followed by every remaining column
ordered_columns = ['browser', 'website', 'query', 'repetition_id']
ordered_columns += [col for col in df_standard.columns if col not in ordered_columns]
df_standard = df_standard.loc[:, ordered_columns]

# Persist the standardized dataset
df_standard.to_csv('df_standard.csv', index=False)