In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
#pip install git+https://github.com/KindXiaoming/pykan.git
import sys

In [38]:
# Disabled alternative loader, kept as a bare triple-quoted string instead of
# live code: nothing inside it executes. The cell merely evaluates to this
# string, which the notebook echoes back as the cell output.
# The disabled code resolves the CSV locations from the TEST_DATA_PATH /
# TRAIN_DATA_PATH environment variables, falling back to files under "data/".
'''import os
import pandas as pd

test_data_path = os.getenv("TEST_DATA_PATH", "data/UNSW_NB15_testing-set.csv")
train_data_path = os.getenv("TRAIN_DATA_PATH", "data/UNSW_NB15_training-set.csv")

test_data = pd.read_csv(test_data_path)
train_data = pd.read_csv(train_data_path)'''

'import os\nimport pandas as pd\n\ntest_data_path = os.getenv("TEST_DATA_PATH", "data/UNSW_NB15_testing-set.csv")\ntrain_data_path = os.getenv("TRAIN_DATA_PATH", "data/UNSW_NB15_training-set.csv")\n\ntest_data = pd.read_csv(test_data_path)\ntrain_data = pd.read_csv(train_data_path)'

In [39]:
# Read the UNSW-NB15 testing and training partitions from the current
# working directory (relative paths, so the notebook must be started there).
test_data = pd.read_csv(filepath_or_buffer="UNSW_NB15_testing-set.csv")
train_data = pd.read_csv(filepath_or_buffer="UNSW_NB15_training-set.csv")

In [40]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierTreatmentTransformer(BaseEstimator, TransformerMixin):
    """Class-conditional IQR outlier treatment for a fixed list of columns.

    For every column in ``columns_numeric`` and every value of the
    ``"attack_cat"`` column, ``fit`` learns the Tukey fences
    ``(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)`` and the class mean from the data it
    is given. ``transform`` then replaces every value lying outside the
    fences of its class with the mean learned during ``fit``.

    NOTE(review): ``transform`` needs the ``"attack_cat"`` column, so treating
    unseen data requires its class labels - confirm this is intended before
    applying it to a held-out set.
    """

    def __init__(self, columns_numeric):
        """
        Parameters:
            columns_numeric (list): List of numeric columns to treat for outliers.
        """
        self.columns_numeric = columns_numeric
        self.iqr_bounds = {}   # {column: {attack_cat: (lower_bound, upper_bound)}}
        self.class_means = {}  # {column: {attack_cat: mean seen during fit}}

    def _check_columns(self, X):
        """Raise ValueError if any configured column is absent from X."""
        for col in self.columns_numeric:
            if col not in X.columns:
                raise ValueError(f"Column '{col}' not found in the input data.")

    def fit(self, X, y=None):
        """
        Compute IQR bounds and means for each numeric column grouped by "attack_cat".

        The input frame is not modified.

        Parameters:
            X (pd.DataFrame): The training data; must contain "attack_cat"
                and every column listed in ``columns_numeric``.
            y (ignored): Not used, present for compatibility.

        Returns:
            self (OutlierTreatmentTransformer): The fitted transformer.

        Raises:
            ValueError: If a configured column is missing from X.
        """
        print("Fitting outlier treatment...")
        self._check_columns(X)

        # Start from a clean slate so refitting never keeps stale statistics.
        self.iqr_bounds = {col: {} for col in self.columns_numeric}
        self.class_means = {col: {} for col in self.columns_numeric}

        # Group once and reuse each group for every column.
        for attack_cat, group in X.groupby("attack_cat"):
            for col in self.columns_numeric:
                values = group[col].astype(float)  # casts a copy, not X itself
                q1 = values.quantile(0.25)
                q3 = values.quantile(0.75)
                iqr = q3 - q1
                self.iqr_bounds[col][attack_cat] = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)
                # Stored here so transform() imputes with *training* statistics
                # instead of recomputing a mean from whatever frame it receives.
                self.class_means[col][attack_cat] = values.mean()

        print("Fitting completed.")
        return self

    def transform(self, X):
        """
        Treat outliers in the dataset using the bounds and means learned in fit.

        Parameters:
            X (pd.DataFrame): The data to transform; must contain "attack_cat"
                and every column listed in ``columns_numeric``.

        Returns:
            pd.DataFrame: A copy of X in which the treated columns are float
            and out-of-bounds values are replaced by the training class mean.
            The caller's frame is left untouched.

        Raises:
            ValueError: If a configured column is missing from X.
        """
        print("Transforming data...")
        self._check_columns(X)

        X = X.copy()  # never write into the caller's frame
        for col in self.columns_numeric:
            X[col] = X[col].astype(float)

        labels = X["attack_cat"]
        class_masks = {}  # attack_cat -> boolean row mask, built once per class

        for col in self.columns_numeric:
            means = self.class_means[col]
            for attack_cat, (lower_bound, upper_bound) in self.iqr_bounds[col].items():
                if attack_cat not in class_masks:
                    class_masks[attack_cat] = labels == attack_cat
                values = X[col]
                out_of_bounds = (values < lower_bound) | (values > upper_bound)
                # Boolean-mask assignment stays correct even if the index has
                # duplicate labels, unlike label-based `.loc[index]` selection.
                X.loc[class_masks[attack_cat] & out_of_bounds, col] = means[attack_cat]

        print("Transformation completed.")
        return X

In [45]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
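# Alternative approach under test: infer the numeric columns inside fit()
# instead of passing them to __init__. This redefinition replaces (shadows)
# the OutlierTreatmentTransformer class defined in the previous cell.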
class OutlierTreatmentTransformer(BaseEstimator, TransformerMixin):
    """Class-conditional IQR outlier treatment with automatic column selection.

    ``fit`` selects every numeric column of the training frame except ``"id"``
    that has at least three distinct values, then learns, per column and per
    value of ``"attack_cat"``, the Tukey fences
    ``(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)`` and the class mean. ``transform``
    replaces every value lying outside the fences of its class with the mean
    learned during ``fit``.

    NOTE(review): ``transform`` needs the ``"attack_cat"`` column, so treating
    unseen data requires its class labels - confirm this is intended before
    applying it to a held-out set.
    """

    # Columns with fewer distinct values than this (constants, binary flags)
    # are skipped: quartile-based fences are not meaningful for them.
    _MIN_DISTINCT_VALUES = 3

    def __init__(self):
        """Create an unfitted transformer; it takes no parameters."""
        self.iqr_bounds = {}   # {column: {attack_cat: (lower_bound, upper_bound)}}
        self.class_means = {}  # {column: {attack_cat: mean seen during fit}}
        # Original attribute name, kept for backward compatibility; after
        # fit() it mirrors ``columns_numeric``.
        self.numeric_columns = []

    def fit(self, X, y=None):
        """
        Select the numeric columns and compute their IQR bounds and means
        grouped by "attack_cat".

        The input frame is not modified.

        Parameters:
            X (pd.DataFrame): The training data; must contain "attack_cat".
            y (ignored): Not used, present for compatibility.

        Returns:
            self (OutlierTreatmentTransformer): The fitted transformer.
        """
        print("Fitting outlier treatment...")

        # Derive the column list from X itself, not from notebook globals.
        numeric_frame = X.select_dtypes(include="number").drop(columns="id", errors="ignore")
        columns_numeric = [
            col for col in numeric_frame.columns
            if numeric_frame[col].nunique() >= self._MIN_DISTINCT_VALUES
        ]
        # Set unconditionally so the attribute exists even when no column qualifies.
        self.columns_numeric = columns_numeric
        self.numeric_columns = columns_numeric

        # Start from a clean slate so refitting never keeps stale statistics.
        self.iqr_bounds = {col: {} for col in columns_numeric}
        self.class_means = {col: {} for col in columns_numeric}

        # Group once and reuse each group for every column.
        for attack_cat, group in X.groupby("attack_cat"):
            for col in columns_numeric:
                values = group[col].astype(float)  # casts a copy, not X itself
                q1 = values.quantile(0.25)
                q3 = values.quantile(0.75)
                iqr = q3 - q1
                self.iqr_bounds[col][attack_cat] = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)
                # Stored here so transform() imputes with *training* statistics
                # instead of recomputing a mean from whatever frame it receives.
                self.class_means[col][attack_cat] = values.mean()

        print("Fitting completed.")
        return self

    def transform(self, X):
        """
        Treat outliers in the dataset using the bounds and means learned in fit.

        Parameters:
            X (pd.DataFrame): The data to transform; must contain "attack_cat"
                and every column selected during fit.

        Returns:
            pd.DataFrame: A copy of X in which the treated columns are float
            and out-of-bounds values are replaced by the training class mean.
            The caller's frame is left untouched.

        Raises:
            ValueError: If a column selected during fit is missing from X.
        """
        print("Transforming data...")

        for col in self.columns_numeric:
            if col not in X.columns:
                raise ValueError(f"Column '{col}' not found in the input data.")

        X = X.copy()  # never write into the caller's frame
        for col in self.columns_numeric:
            X[col] = X[col].astype(float)

        labels = X["attack_cat"]
        class_masks = {}  # attack_cat -> boolean row mask, built once per class

        for col in self.columns_numeric:
            means = self.class_means[col]
            for attack_cat, (lower_bound, upper_bound) in self.iqr_bounds[col].items():
                if attack_cat not in class_masks:
                    class_masks[attack_cat] = labels == attack_cat
                values = X[col]
                out_of_bounds = (values < lower_bound) | (values > upper_bound)
                # Boolean-mask assignment stays correct even if the index has
                # duplicate labels, unlike label-based `.loc[index]` selection.
                X.loc[class_masks[attack_cat] & out_of_bounds, col] = means[attack_cat]

        print("Transformation completed.")
        return X

In [46]:
# Preview the training frame; the rendering elides the middle rows and columns.
train_data

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6.0,4.0,258.0,172.0,74.087490,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0,Normal,0
1,2,0.649902,tcp,-,FIN,14.0,38.0,734.0,42014.0,78.473372,...,1.0,2.0,0.0,0.0,0.0,1.0,6.0,0,Normal,0
2,3,1.623129,tcp,-,FIN,8.0,16.0,364.0,13186.0,14.170161,...,1.0,3.0,0.0,0.0,0.0,2.0,6.0,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12.0,12.0,628.0,770.0,13.677108,...,1.0,3.0,1.0,1.0,0.0,2.0,1.0,0,Normal,0
4,5,0.449454,tcp,-,FIN,10.0,6.0,534.0,268.0,33.373826,...,1.0,40.0,0.0,0.0,0.0,2.0,39.0,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,175337,0.000009,udp,dns,INT,2.0,0.0,114.0,0.0,111111.107200,...,13.0,24.0,0.0,0.0,0.0,24.0,24.0,0,Generic,1
175337,175338,0.505762,tcp,-,FIN,10.0,8.0,620.0,354.0,33.612649,...,1.0,2.0,0.0,0.0,0.0,1.0,1.0,0,Shellcode,1
175338,175339,0.000009,udp,dns,INT,2.0,0.0,114.0,0.0,111111.107200,...,3.0,13.0,0.0,0.0,0.0,3.0,12.0,0,Generic,1
175339,175340,0.000009,udp,dns,INT,2.0,0.0,114.0,0.0,111111.107200,...,14.0,30.0,0.0,0.0,0.0,30.0,30.0,0,Generic,1


In [47]:
# Learn the per-class outlier bounds from the training split only, then apply
# the fitted transformer to the training and the testing split in turn.
OutlierTreater = OutlierTreatmentTransformer()
OutlierTreater.fit(train_data)

train_data = OutlierTreater.transform(train_data)
test_data = OutlierTreater.transform(test_data)

Fitting outlier treatment...
Fitting completed.
Transforming data...
Transformation completed.
Transforming data...
Transformation completed.


In [11]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class CategoryPruner(BaseEstimator, TransformerMixin):
    """Collapse rare categories of high-cardinality categorical features.

    ``fit`` records, for every non-numeric column (except the target) with
    more than ``threshold`` distinct values, its ``threshold`` most frequent
    categories. ``transform`` keeps those categories and replaces every other
    value (including missing values) with ``other_label``.

    Note that a pruned column can end up with ``threshold + 1`` distinct
    values when ``other_label`` is not itself one of the retained categories.
    """

    def __init__(self, target_column, threshold=6, debug=False, other_label="-"):
        """
        Parameters:
            target_column (str): The name of the target column to preserve.
            threshold (int): The maximum number of categories to keep for each feature.
            debug (bool): Whether to print debug information.
            other_label (str): Replacement value for categories that are not kept.
        """
        self.threshold = threshold
        self.debug = debug
        self.target_column = target_column
        self.other_label = other_label
        self.top_categories = {}  # Store top categories for each feature

    def _categorical_columns(self, X):
        """Return the non-numeric columns of X, excluding the target column."""
        non_numeric = X.select_dtypes(exclude=[np.number])
        return non_numeric.drop(columns=[self.target_column], errors='ignore').columns

    def fit(self, X, y=None):
        """
        Identify the top categories for each categorical feature in the training data.

        Parameters:
            X (pd.DataFrame): The training data.
            y (ignored): Not used, present for compatibility.

        Returns:
            self (CategoryPruner): The fitted transformer.
        """
        # Reset so that refitting on different data never keeps stale features.
        self.top_categories = {}

        for feature in self._categorical_columns(X):
            n_categories = X[feature].nunique()

            if self.debug:
                print(f"Processing feature: {feature}")
                print(f"Number of unique categories before reduction: {n_categories}")
                print('----------------------------------------------------')

            # Only features above the threshold are pruned; the rest are left as-is.
            if n_categories > self.threshold:
                self.top_categories[feature] = X[feature].value_counts().head(self.threshold).index

        return self

    def transform(self, X):
        """
        Reduce categories in the dataset to the top categories identified during fitting.

        Parameters:
            X (pd.DataFrame): The data to transform.

        Returns:
            pd.DataFrame: A copy of X with reduced categories; the caller's
            frame is left untouched.
        """
        X = X.copy()  # never write into the caller's frame

        for feature in self._categorical_columns(X):
            if feature in self.top_categories:
                keep = X[feature].isin(self.top_categories[feature])
                # Values outside the retained categories collapse into other_label.
                X[feature] = np.where(keep, X[feature], self.other_label)

        return X

In [25]:
# Learn the most frequent categories of each non-numeric feature from the
# training data only; "attack_cat" is the target column and is excluded.
CategoryPruner_object = CategoryPruner(target_column = "attack_cat").fit(train_data)

In [26]:
# Apply the fitted pruner to the training data, rebinding train_data to the result.
train_data = CategoryPruner_object.transform(train_data)

In [27]:
# Apply the same training-derived category lists to the test data.
test_data = CategoryPruner_object.transform(test_data)